From 7cc3d1b7e46778df43ca7193cd2b348286b281e0 Mon Sep 17 00:00:00 2001 From: lukaszett Date: Wed, 1 Nov 2023 19:42:22 +0100 Subject: [PATCH] fix preprocessing of tags --- main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index ffd454c..850ba3c 100755 --- a/main.py +++ b/main.py @@ -74,12 +74,15 @@ def download(id: int): author = None try: - category = pC.find("span", {"class": "categoryInfo"}).find().text + category = pC.find("span", {"class": "categoryInfo"}).find_all() + category = [c.text for c in category] + category = ";".join(category) except AttributeError: category = None try: tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")] + tags = ";".join(tags) except AttributeError: tags = None @@ -127,7 +130,6 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8): # sqlite can't handle lists so let's convert them to a single row csv # TODO: make sure our database is properly normalized df = pd.DataFrame(res) - df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None) return df