initial commit
commit 7edf451e2e
6 changed files with 200 additions and 0 deletions

.gitignore (vendored), new file, 3 additions
@@ -0,0 +1,3 @@
data/
venv/
.DS_STORE

Dockerfile, new file, 15 additions
@@ -0,0 +1,15 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN apt update -y
RUN apt install -y cron
COPY crontab .
RUN crontab crontab

COPY main.py .

Makefile, new file, 2 additions
@@ -0,0 +1,2 @@
build:
	docker build -t knack-scraper .

crontab, new file, 1 addition
@@ -0,0 +1 @@
5 4 * * * python /app/main.py

main.py, new executable file, 165 additions
@@ -0,0 +1,165 @@
#! python
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.info("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"

    # TODO FIXME: this fails inside the docker container
    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = pC.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find().text
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in tqdm.tqdm(
            futures, total=max_id - min_id
        ):  # tqdm to track progress
            post = future.result()
            if post is not None:
                res.append(post)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)
    df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)

    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        post_table_exists = table_exists("posts", con)

        if post_table_exists:
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
    df.to_sql("posts", con, if_exists="append")


if __name__ == "__main__":
    main()
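
Note on the "# TODO FIXME" in download(): locale.setlocale(locale.LC_TIME, "de_DE") tends to fail inside the container because slim Debian-based Python images typically ship without the de_DE locale generated, and setlocale also mutates process-global state from worker threads. A minimal locale-independent sketch, assuming the site's dates follow the "%d. %B %Y" pattern with German month names; the GERMAN_MONTHS table and parse_german_date helper are illustrative and not part of this commit:

from datetime import datetime

# Assumed helper, not in the commit: map German month names to numbers
# so no locale has to be installed or switched at runtime.
GERMAN_MONTHS = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4,
    "Mai": 5, "Juni": 6, "Juli": 7, "August": 8,
    "September": 9, "Oktober": 10, "November": 11, "Dezember": 12,
}

def parse_german_date(date_string: str) -> datetime:
    # e.g. "5. Oktober 2023" -> datetime(2023, 10, 5)
    day, month_name, year = date_string.split()
    return datetime(int(year), GERMAN_MONTHS[month_name], int(day.rstrip(".")))

Something like this could stand in for the setlocale/strptime pair in download(), but that is a suggestion, not what the commit does.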

requirements.txt, new file, 14 additions
@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7
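
Note on the normalization TODO in run_downloads(): the code joins each post's tags into a single "; "-separated string before calling to_sql, and itself flags "TODO: make sure our database is properly normalized". A rough sketch of one normalized alternative using the standard-library sqlite3 module; the posts/post_tags schema and store_tags helper are assumptions for illustration, not part of the commit:

import sqlite3

# Assumed schema, not in the commit: one row per (post, tag) pair
# instead of a "; "-joined string column.
SCHEMA = """
CREATE TABLE IF NOT EXISTS posts (id INTEGER PRIMARY KEY, title TEXT);
CREATE TABLE IF NOT EXISTS post_tags (
    post_id INTEGER REFERENCES posts(id),
    tag     TEXT,
    PRIMARY KEY (post_id, tag)
);
"""

def store_tags(con: sqlite3.Connection, post_id: int, tags):
    # Insert each tag as its own row; duplicates are skipped via the primary key.
    con.executemany(
        "INSERT OR IGNORE INTO post_tags (post_id, tag) VALUES (?, ?)",
        [(post_id, tag) for tag in (tags or [])],
    )

con = sqlite3.connect(":memory:")
con.executescript(SCHEMA)
with con:
    store_tags(con, 1, ["tag-a", "tag-b"])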