initial commit

lukaszett 2023-10-19 22:53:36 +02:00
commit 7edf451e2e
6 changed files with 200 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,3 @@
data/
venv/
.DS_Store

Dockerfile Normal file

@@ -0,0 +1,15 @@
FROM python:slim
RUN mkdir -p /app /data
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN apt-get update && apt-get install -y --no-install-recommends cron
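# one way to address the setlocale FIXME in main.py: the slim base image ships
# with no generated locales, so German month names cannot be parsed; generate
# de_DE.UTF-8 here (assumes main.py requests the UTF-8 variant)
RUN apt-get install -y --no-install-recommends locales \
    && echo "de_DE.UTF-8 UTF-8" >> /etc/locale.gen \
    && locale-gen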
COPY crontab .
RUN crontab crontab
COPY main.py .
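# cron must be the container's foreground process, otherwise the container
# exits immediately and the scheduled scrape never runs
CMD ["cron", "-f"]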

Makefile Normal file

@@ -0,0 +1,2 @@
build:
docker build -t knack-scraper .
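# hypothetical convenience target: run the image with a host ./data directory
# mounted at /data so the SQLite database survives container restarts
run:
	docker run -d --rm --name knack-scraper -v $(CURDIR)/data:/data knack-scraper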

crontab Normal file

@@ -0,0 +1 @@
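# run the scraper daily at 04:05; cron's default PATH (/usr/bin:/bin) does not
# include /usr/local/bin, so the interpreter is referenced by its full path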
5 4 * * * /usr/local/bin/python /app/main.py

main.py Executable file

@@ -0,0 +1,165 @@
#!/usr/bin/env python3
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)
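# assuming console output is wanted: without any handler configuration the
# logger.info(...) calls below are effectively silent
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)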
def table_exists(tablename: str, con: sqlite3.Connection):
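    """Return True if a table named `tablename` exists in the SQLite database."""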
query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
return len(con.execute(query, [tablename]).fetchall()) > 0
def download(id: int):
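    """Fetch post `id` from knack.news; return its fields as a dict, or None if unusable."""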
if id == 0:
return
base_url = "https://knack.news/"
url = f"{base_url}{id}"
res = requests.get(url)
    # make sure we don't DoS knack
time.sleep(2)
if not (200 <= res.status_code <= 300):
return
logger.info("Found promising page with id %d!", id)
content = res.content
soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"
    # German month names need a German locale; de_DE.UTF-8 is generated in the Dockerfile
    try:
        locale.setlocale(locale.LC_TIME, "de_DE.UTF-8")
    except locale.Error:
        logger.warning("German locale unavailable, date parsing may fail")
pC = soup.find("div", {"class": "postContent"})
if pC is None:
# not a normal post
logger.info(
"Page with id %d does not have a .pageContent-div. Skipping for now.", id
)
return
# every post has these fields
title = pC.find("h3", {"class": "postTitle"}).text
postText = pC.find("div", {"class": "postText"})
# these fields are possible but not required
# TODO: cleanup
try:
date_string = pC.find("span", {"class": "singledate"}).text
parsed_date = datetime.strptime(date_string, date_format)
except AttributeError:
parsed_date = None
try:
author = pC.find("span", {"class": "author"}).text
except AttributeError:
author = None
try:
category = pC.find("span", {"class": "categoryInfo"}).find().text
except AttributeError:
category = None
try:
tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
except AttributeError:
tags = None
img = pC.find("img", {"class": "postImage"})
if img is not None:
img = img["src"]
res_dict = {
"id": id,
"title": title,
"author": author,
"date": parsed_date,
"category": category,
"url": url,
"img_link": img,
"tags": tags,
"text": postText.text,
"html": str(postText),
"scraped_at": datetime.now(),
}
return res_dict
def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
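    """Scrape post ids in [min_id, max_id) concurrently and collect the results in a DataFrame."""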
res = []
logger.info(
"Started parallel scrape of posts from id %d to id %d using %d threads.",
min_id,
max_id - 1,
num_threads,
)
with ThreadPoolExecutor(max_workers=num_threads) as executor:
# Use a list comprehension to create a list of futures
futures = [executor.submit(download, i) for i in range(min_id, max_id)]
for future in tqdm.tqdm(
futures, total=max_id - min_id
): # tqdm to track progress
post = future.result()
if post is not None:
res.append(post)
    # sqlite can't store lists, so join the tags into one semicolon-separated string
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)
    if df.empty:
        # nothing was scraped in this range, so there is no tags column to convert
        return df
    df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)
    return df
def main():
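    """Read settings from the environment, scrape new posts and append them to SQLite."""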
num_threads = int(os.environ.get("NUM_THREADS", 8))
n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
con = sqlite3.connect(database_location)
with con:
post_table_exists = table_exists("posts", con)
if post_table_exists:
logger.info("found posts retrieved earlier")
# retrieve max post id from db so
# we can skip retrieving known posts
max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
logger.info("Got max id %d!", max_id_in_db)
else:
logger.info("no posts scraped so far - starting from 0")
# retrieve from 0 onwards
max_id_in_db = -1
    df = run_downloads(
        min_id=max_id_in_db + 1,
        # attempt NUM_SCRAPES ids beyond the newest post already stored
        max_id=max_id_in_db + 1 + n_scrapes,
        num_threads=num_threads,
    )
    if not df.empty:
        df.to_sql("posts", con, if_exists="append", index=False)
if __name__ == "__main__":
main()

requirements.txt Normal file

@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7