Implements feature to clean up the author free-text field
This commit is contained in:
parent
bcd210ce01
commit
64df8fb328
14 changed files with 804 additions and 310 deletions
29
scrape/Dockerfile
Normal file
@@ -0,0 +1,29 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

#COPY /data/knack.sqlite /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

COPY main.py .

ENV PYTHONUNBUFFERED=1
ENV LANG=de_DE.UTF-8
ENV LC_ALL=de_DE.UTF-8

# Create cron job that runs the scraper daily at 04:05 and logs to the container's stdout
RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
RUN chmod 0644 /etc/cron.d/knack-scraper
RUN crontab /etc/cron.d/knack-scraper

# Start cron in foreground
CMD ["cron", "-f"]
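Note: the crontab entry above runs main.py once a day at 04:05 and redirects its output to the container's stdout via /proc/1/fd/1; runtime configuration comes from the .env file baked into the image. As a quick, purely illustrative smoke test (not part of this commit; the file name smoke_check.py is hypothetical), one could exec into the running container and check that the target site is reachable and that the SQLite database referenced by DATABASE_LOCATION exists:

# smoke_check.py -- hypothetical helper, not included in this commit
import os
import sqlite3

import requests

db_path = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

# The scraper fetches https://knack.news/<id>; a plain GET on the front page
# is enough to confirm outbound connectivity from the container.
resp = requests.get("https://knack.news/")
print("knack.news reachable:", resp.status_code)

# sqlite3.connect() would silently create an empty file, so check existence first.
if os.path.exists(db_path):
    con = sqlite3.connect(db_path)
    tables = [row[0] for row in con.execute(
        "SELECT name FROM sqlite_master WHERE type='table'")]
    print("tables:", tables)
    con.close()
else:
    print("database not found at", db_path)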
260
scrape/main.py
Executable file
@@ -0,0 +1,260 @@
#!/usr/bin/env python3
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import sys

from dotenv import load_dotenv
import pandas as pd
import requests
from bs4 import BeautifulSoup

load_dotenv()

if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-scraper")


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def split_semicolon_list(value: str):
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
    """Extract unique dimension values and post-to-dimension mappings from a column."""
    if postdf.empty or field_name not in postdf.columns:
        return None, None

    values = set()
    mapping_rows = []

    for post_id, raw in zip(postdf['id'], postdf[field_name]):
        items = split_semicolon_list(raw)
        for item in items:
            values.add(item)
            mapping_rows.append({'post_id': post_id, dim_col: item})

    if not values:
        return None, None

    dim_df = pd.DataFrame({
        'id': range(len(values)),
        dim_col: sorted(values),
    })
    map_df = pd.DataFrame(mapping_rows)
    return dim_df, map_df


def store_dimension_and_mapping(
    con: sqlite3.Connection,
    dim_df: pd.DataFrame | None,
    map_df: pd.DataFrame | None,
    table_name: str,
    dim_col: str,
    mapping_table: str,
    mapping_id_col: str,
):
    """Persist a dimension table and its mapping table, merging with existing values."""
    if dim_df is None or dim_df.empty:
        return

    if table_exists(table_name, con):
        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
        merged = pd.concat([existing, dim_df], ignore_index=True)
        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
        merged['id'] = range(len(merged))
    else:
        merged = dim_df.copy()

    # Replace table with merged content
    merged.to_sql(table_name, con, if_exists="replace", index=False)

    if map_df is None or map_df.empty:
        return

    value_to_id = dict(zip(merged[dim_col], merged['id']))
    map_df = map_df.copy()
    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
    map_df = map_df[['post_id', mapping_id_col]].dropna()
    map_df.to_sql(mapping_table, con, if_exists="append", index=False)


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't DoS knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")

    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.debug(
            "Page with id %d does not have a .postContent div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
        day = int(date_parts[0][:-1])
        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
        month = months[date_parts[1]]
        year = int(date_parts[2])
        parsed_date = datetime(year, month, day)
    except Exception:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
        "is_cleaned": False
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in futures:
            post = future.result()
            if post is not None:
                res.append(post)

    postdf = pd.DataFrame(res)
    return postdf


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1

        postdf = run_downloads(
            min_id=max_id_in_db + 1,
            max_id=max_id_in_db + n_scrapes,
            num_threads=num_threads,
        )

        postdf.to_sql("posts", con, if_exists="append")

        # Tags
        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
        store_dimension_and_mapping(
            con,
            tag_dim,
            tag_map,
            table_name="tags",
            dim_col="tag",
            mapping_table="posttags",
            mapping_id_col="tag_id",
        )

        # Categories
        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
        store_dimension_and_mapping(
            con,
            category_dim,
            category_map,
            table_name="categories",
            dim_col="category",
            mapping_table="postcategories",
            mapping_id_col="category_id",
        )

        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


if __name__ == "__main__":
    main()
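Note: main.py stores tags and categories in small dimension tables (tags, categories) and links them to posts through the mapping tables posttags and postcategories. Below is a minimal sketch of how a consumer could read that schema back, assuming the database at the default DATABASE_LOCATION has already been populated by main.py; the query is illustrative and not part of this commit.

# read_tags.py -- illustrative only, not included in this commit
import sqlite3

import pandas as pd

con = sqlite3.connect("../data/knack.sqlite")  # default DATABASE_LOCATION from main.py

# Join posts to their tags through the posttags mapping table.
query = """
SELECT p.id, p.title, p.author, t.tag
FROM posts AS p
JOIN posttags AS pt ON pt.post_id = p.id
JOIN tags AS t ON t.id = pt.tag_id
ORDER BY p.id
"""
tagged_posts = pd.read_sql(query, con)
print(tagged_posts.head())
con.close()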
4
scrape/requirements.txt
Normal file
@@ -0,0 +1,4 @@
pandas
requests
beautifulsoup4
python-dotenv