Knack-Scraper/main.py

#!/usr/bin/env python3
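"""Scraper for knack.news: enumerate post ids, parse posts with BeautifulSoup,
and append the results to a local SQLite database."""
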
import locale
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
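    """Return True if a table named ``tablename`` exists in the connected database."""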
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return con.execute(query, [tablename]).fetchone() is not None


def download(post_id: int):
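    """Fetch the post with id ``post_id`` from knack.news and parse its fields.

    Returns a dict of post fields, or None if the id yields no regular post.
    """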
    if post_id == 0:
        return None
    base_url = "https://knack.news/"
    url = f"{base_url}{post_id}"
    res = requests.get(url)
    # be polite: throttle requests so we don't DoS knack.news
    time.sleep(2)
    if not (200 <= res.status_code < 300):
        return None
    logger.info("Found promising page with id %d!", post_id)
    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"
    # TODO FIXME: this fails inside the docker container
    # note: setlocale is process-wide and not thread-safe
    locale.setlocale(locale.LC_TIME, "de_DE")
    post_content = soup.find("div", {"class": "postContent"})
    if post_content is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .postContent div. Skipping for now.",
            post_id,
        )
        return None
    # every post has these fields
    title = post_content.find("h3", {"class": "postTitle"}).text
    post_text = post_content.find("div", {"class": "postText"})
    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = post_content.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None
    try:
        author = post_content.find("span", {"class": "author"}).text
    except AttributeError:
        author = None
    try:
        category = post_content.find("span", {"class": "categoryInfo"}).find_all()
        category = ";".join(c.text for c in category)
    except AttributeError:
        category = None
    try:
        tags = [
            a.text
            for a in post_content.find("div", {"class": "tagsInfo"}).find_all("a")
        ]
        tags = ";".join(tags)
    except AttributeError:
        tags = None
    img = post_content.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]
    return {
        "id": post_id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": post_text.text,
        "html": str(post_text),
        "scraped_at": datetime.now(),
    }


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
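    """Scrape ids ``min_id`` (inclusive) to ``max_id`` (exclusive) in parallel.

    Returns the successfully scraped posts as a pandas DataFrame.
    """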
    res = []
    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # submit one download per id and collect the futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
        # iterate over the futures with tqdm to track progress
        for future in tqdm.tqdm(futures, total=max_id - min_id):
            post = future.result()
            if post is not None:
                res.append(post)
    # sqlite can't store lists, so category and tags were already joined
    # into single ";"-separated strings in download()
    # TODO: make sure our database is properly normalized
    return pd.DataFrame(res)


def main():
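    """Read settings from the environment, scrape new posts, and append them to SQLite."""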
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1
    df = run_downloads(
        min_id=max_id_in_db + 1,
        # range() excludes the upper bound, so add 1 to scrape n_scrapes ids
        max_id=max_id_in_db + n_scrapes + 1,
        num_threads=num_threads,
    )
    # index=False keeps the DataFrame's index out of the posts table
    df.to_sql("posts", con, if_exists="append", index=False)
    con.close()


if __name__ == "__main__":
    main()