fix preprocessing of tags
This commit is contained in:
parent
116cce79a4
commit
7cc3d1b7e4
1 changed file with 4 additions and 2 deletions
6
main.py
6
main.py
|
@@ -74,12 +74,15 @@ def download(id: int):
|
|||
author = None
|
||||
|
||||
try:
|
||||
category = pC.find("span", {"class": "categoryInfo"}).find().text
|
||||
category = pC.find("span", {"class": "categoryInfo"}).find_all()
|
||||
category = [c.text for c in category]
|
||||
category = ";".join(category)
|
||||
except AttributeError:
|
||||
category = None
|
||||
|
||||
try:
|
||||
tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
|
||||
tags = ";".join(tags)
|
||||
except AttributeError:
|
||||
tags = None
|
||||
|
||||
|
@@ -127,7 +130,6 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
|
|||
# sqlite can't handle lists so let's convert them to a single row csv
|
||||
# TODO: make sure our database is properly normalized
|
||||
df = pd.DataFrame(res)
|
||||
df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)
|
||||
|
||||
return df
|
||||
|
||||
|
|
Loading…
Reference in a new issue