fix preprocessing of tags

This commit is contained in:
Lukas Z 2023-11-01 19:42:22 +01:00
parent 116cce79a4
commit 7cc3d1b7e4

View file

@@ -74,12 +74,15 @@ def download(id: int):
author = None author = None
try: try:
category = pC.find("span", {"class": "categoryInfo"}).find().text category = pC.find("span", {"class": "categoryInfo"}).find_all()
category = [c.text for c in category]
category = ";".join(category)
except AttributeError: except AttributeError:
category = None category = None
try: try:
tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")] tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
tags = ";".join(tags)
except AttributeError: except AttributeError:
tags = None tags = None
@@ -127,7 +130,6 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
# sqlite can't handle lists so let's convert them to a single row csv # sqlite can't handle lists so let's convert them to a single row csv
# TODO: make sure our database is properly normalized # TODO: make sure our database is properly normalized
df = pd.DataFrame(res) df = pd.DataFrame(res)
df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)
return df return df