Dockerized Scraper

- Implements a Dockerized version of the scraper
- Atomizes the tags and categories columns into separate tables
2025-12-20 20:55:04 +01:00 · 2025-12-20 20:55:04 +01:00 · bcd210ce01
commit bcd210ce01
parent 7cc3d1b7e4
5 changed files with 201 additions and 42 deletions
--- a/main.py
+++ b/main.py
@ -1,25 +1,34 @@
 #! python3
-import locale
 import logging
 import os
 import sqlite3
-import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
+import sys

+from dotenv import load_dotenv
 import pandas as pd
 import requests
 import tqdm
 from bs4 import BeautifulSoup

-logger = logging.getLogger("knack-scraper")
-# ch = logging.StreamHandler()
-# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-# ch.setFormatter(formatter)
-# ch.setLevel(logging.INFO)
-# logger.addHandler(ch)
+load_dotenv()

+if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
+    logging_level = logging.INFO
+else:
+    logging_level = logging.DEBUG
+
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-scraper")

 def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
@ -39,19 +48,16 @@ def download(id: int):
    if not (200 <= res.status_code <= 300):
        return

-    logger.info("Found promising page with id %d!", id)
+    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
-    date_format = "%d. %B %Y"

-    # TODO FIXME: this fails inside the docker container
-    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
-        logger.info(
+        logger.debug(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return
@ -63,9 +69,13 @@ def download(id: int):
    # these fields are possible but not required
    # TODO: cleanup
    try:
-        date_string = pC.find("span", {"class": "singledate"}).text
-        parsed_date = datetime.strptime(date_string, date_format)
-    except AttributeError:
+        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
+        day = int(date_parts[0][:-1])
+        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
+        month = months[date_parts[1]]
+        year = int(date_parts[2])
+        parsed_date = datetime(year, month, day)
+    except Exception:
        parsed_date = None

    try:
@ -75,7 +85,7 @@ def download(id: int):

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
-        category = [c.text for c in category]
+        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None
@ -129,15 +139,79 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8):

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
-    df = pd.DataFrame(res)
+    postdf = pd.DataFrame(res)
+    tagdf = None
+    posttotagdf = None
+    categorydf = None
+    postcategorydf = None

-    return df
+    # Extract and create tags dataframe
+    if not postdf.empty and 'tags' in postdf.columns:
+        # Collect all unique tags
+        all_tags = set()
+        for tags_str in postdf['tags']:
+            if pd.notna(tags_str):
+                tags_list = [tag.strip() for tag in tags_str.split(';')]
+                all_tags.update(tags_list)
+        
+        # Create tagdf with id and text columns
+        if all_tags:
+            all_tags = sorted(list(all_tags))
+            tagdf = pd.DataFrame({
+                'id': range(len(all_tags)),
+                'tag': all_tags
+            })
+            
+            # Create posttotagdf mapping table
+            rows = []
+            for post_id, tags_str in zip(postdf['id'], postdf['tags']):
+                if pd.notna(tags_str):
+                    tags_list = [tag.strip() for tag in tags_str.split(';')]
+                    for tag_text in tags_list:
+                        tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'tag_id': tag_id})
+            
+            if rows:
+                posttotagdf = pd.DataFrame(rows)
+
+    # Extract and create categories dataframe
+    if not postdf.empty and 'category' in postdf.columns:
+        # Collect all unique categories
+        all_categories = set()
+        for category_str in postdf['category']:
+            if pd.notna(category_str):
+                category_list = [cat.strip() for cat in category_str.split(';')]
+                all_categories.update(category_list)
+        
+        # Create categorydf with id and category columns
+        if all_categories:
+            all_categories = sorted(list(all_categories))
+            categorydf = pd.DataFrame({
+                'id': range(len(all_categories)),
+                'category': all_categories
+            })
+            
+            # Create postcategorydf mapping table
+            rows = []
+            for post_id, category_str in zip(postdf['id'], postdf['category']):
+                if pd.notna(category_str):
+                    category_list = [cat.strip() for cat in category_str.split(';')]
+                    for category_text in category_list:
+                        category_id = categorydf[categorydf['category'] == category_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'category_id': category_id})
+            
+            if rows:
+                postcategorydf = pd.DataFrame(rows)
+
+    return postdf, tagdf, posttotagdf, categorydf, postcategorydf


 def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
-    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
+    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
+
+    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

    con = sqlite3.connect(database_location)
    with con:
@ -155,12 +229,77 @@ def main():
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
-    df = run_downloads(
+    postdf, tagdf, posttotagdf, categorydf, postcategorydf = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
-    df.to_sql("posts", con, if_exists="append")
+    postdf.to_sql("posts", con, if_exists="append")
+    
+    # Handle tags dataframe merging and storage
+    if tagdf is not None and not tagdf.empty:
+        # Check if tags table already exists
+        if table_exists("tags", con):
+            # Read existing tags from database
+            existing_tagdf = pd.read_sql("SELECT id, tag FROM tags", con)
+            
+            # Merge new tags with existing tags, avoiding duplicates
+            merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
+            merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first')
+            merged_tagdf = merged_tagdf.reset_index(drop=True)
+            merged_tagdf['id'] = range(len(merged_tagdf))
+            
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE tags")
+            con.commit()
+            merged_tagdf.to_sql("tags", con, if_exists="append", index=False)
+            
+            # Update tag_id references in posttotagdf
+            if posttotagdf is not None and not posttotagdf.empty:
+                #tag_mapping = dict(zip(tagdf['tag'], tagdf['id']))
+                posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
+                    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
+                )
+        else:
+            # First time creating tags table
+            tagdf.to_sql("tags", con, if_exists="append", index=False)
+    
+    # Store posttags (post to tags mapping)
+    if posttotagdf is not None and not posttotagdf.empty:
+        posttotagdf.to_sql("posttags", con, if_exists="append", index=False)
+    
+    # Handle categories dataframe merging and storage
+    if categorydf is not None and not categorydf.empty:
+        # Check if categories table already exists
+        if table_exists("categories", con):
+            # Read existing categories from database
+            existing_categorydf = pd.read_sql("SELECT id, category FROM categories", con)
+            
+            # Merge new categories with existing categories, avoiding duplicates
+            merged_categorydf = pd.concat([existing_categorydf, categorydf], ignore_index=False)
+            merged_categorydf = merged_categorydf.drop_duplicates(subset=['category'], keep='first')
+            merged_categorydf = merged_categorydf.reset_index(drop=True)
+            merged_categorydf['id'] = range(len(merged_categorydf))
+            
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE categories")
+            con.commit()
+            merged_categorydf.to_sql("categories", con, if_exists="append", index=False)
+            
+            # Update category_id references in postcategorydf
+            if postcategorydf is not None and not postcategorydf.empty:
+                postcategorydf['category_id'] = postcategorydf['category_id'].map(
+                    lambda old_id: merged_categorydf[merged_categorydf['category'] == categorydf.loc[old_id, 'category']]['id'].values[0]
+                )
+        else:
+            # First time creating categories table
+            categorydf.to_sql("categories", con, if_exists="append", index=False)
+    
+    # Store postcategories (post to categories mapping)
+    if postcategorydf is not None and not postcategorydf.empty:
+        postcategorydf.to_sql("postcategories", con, if_exists="append", index=False)
+
+    logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


 if __name__ == "__main__":