Knack-Scraper/main.py

#!/usr/bin/env python3
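"""Scraper for knack.news: enumerate post ids, parse posts with BeautifulSoup,
and append the results to a local SQLite database."""
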
import locale
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
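    """Return True if a table named ``tablename`` exists in the connected database."""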
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return con.execute(query, [tablename]).fetchone() is not None


def download(post_id: int):
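    """Fetch the post with id ``post_id`` from knack.news and parse its fields.

    Returns a dict of post fields, or None if the id yields no regular post.
    """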
    if post_id == 0:
        return None
    base_url = "https://knack.news/"
    url = f"{base_url}{post_id}"
    res = requests.get(url)
    # be polite: throttle requests so we don't DoS knack.news
    time.sleep(2)
    if not (200 <= res.status_code < 300):
        return None
    logger.info("Found promising page with id %d!", post_id)
    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"
    # TODO FIXME: this fails inside the docker container
    # note: setlocale is process-wide and not thread-safe
    locale.setlocale(locale.LC_TIME, "de_DE")
    post_content = soup.find("div", {"class": "postContent"})
    if post_content is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .postContent div. Skipping for now.",
            post_id,
        )
        return None
    # every post has these fields
    title = post_content.find("h3", {"class": "postTitle"}).text
    post_text = post_content.find("div", {"class": "postText"})
    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = post_content.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None
    try:
        author = post_content.find("span", {"class": "author"}).text
    except AttributeError:
        author = None
    try:
        category = post_content.find("span", {"class": "categoryInfo"}).find_all()
        category = ";".join(c.text for c in category)
    except AttributeError:
        category = None
    try:
        tags = [
            a.text
            for a in post_content.find("div", {"class": "tagsInfo"}).find_all("a")
        ]
        tags = ";".join(tags)
    except AttributeError:
        tags = None
    img = post_content.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]
    return {
        "id": post_id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": post_text.text,
        "html": str(post_text),
        "scraped_at": datetime.now(),
    }


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
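    """Scrape ids ``min_id`` (inclusive) to ``max_id`` (exclusive) in parallel.

    Returns the successfully scraped posts as a pandas DataFrame.
    """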
    res = []
    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # submit one download per id and collect the futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
        # iterate over the futures with tqdm to track progress
        for future in tqdm.tqdm(futures, total=max_id - min_id):
            post = future.result()
            if post is not None:
                res.append(post)
    # sqlite can't store lists, so category and tags were already joined
    # into single ";"-separated strings in download()
    # TODO: make sure our database is properly normalized
    return pd.DataFrame(res)


def main():
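    """Read settings from the environment, scrape new posts, and append them to SQLite."""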
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1
    df = run_downloads(
        min_id=max_id_in_db + 1,
        # range() excludes the upper bound, so add 1 to scrape n_scrapes ids
        max_id=max_id_in_db + n_scrapes + 1,
        num_threads=num_threads,
    )
    # index=False keeps the DataFrame's index out of the posts table
    df.to_sql("posts", con, if_exists="append", index=False)
    con.close()


if __name__ == "__main__":
    main()