From bcd210ce0194b73a4f2db913e29082081f53d183 Mon Sep 17 00:00:00 2001
From: quorploop <>
Date: Sat, 20 Dec 2025 20:55:04 +0100
Subject: [PATCH] Dockerized Scraper

- Implements a Dockerized version of the scraper
- Atomizes the tags and categories columns into separate lookup tables
---
 Dockerfile       |  22 ++++--
 README.md        |  18 +++++
 crontab          |   1 -
 main.py          | 183 +++++++++++++++++++++++++++++++++++++++++------
 requirements.txt |  19 ++---
 5 files changed, 201 insertions(+), 42 deletions(-)
 delete mode 100644 crontab

diff --git a/Dockerfile b/Dockerfile
index 9c94fd6..d9752ef 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,9 +7,21 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-RUN apt update -y
-RUN apt install -y cron
-COPY crontab .
-RUN crontab crontab
+COPY .env .
 
-COPY main.py .
\ No newline at end of file
+RUN apt update -y
+RUN apt install -y cron locales
+
+COPY main.py .
+
+ENV PYTHONUNBUFFERED=1
+ENV LANG=de_DE.UTF-8
+ENV LC_ALL=de_DE.UTF-8
+
+# Create a cron job that runs the scraper every 10 minutes
+RUN echo "*/10 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
+RUN chmod 0644 /etc/cron.d/knack-scraper
+RUN crontab /etc/cron.d/knack-scraper
+
+# Start cron in foreground
+CMD ["cron", "-f"]
\ No newline at end of file
diff --git a/README.md b/README.md
index e69de29..ab971fc 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,18 @@
+Knack-Scraper does exactly what its name suggests:
+it scrapes knack.news and writes the posts to a
+SQLite database for later use.
+
+## Example for .env
+
+```
+NUM_THREADS=8
+NUM_SCRAPES=100
+DATABASE_LOCATION='./data/knack.sqlite'
+```
+
+## Run once
+
+```
+python main.py
+```
+
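Since the patch is about Docker but the new README only documents the bare `python main.py` path, a build-and-run example is worth having. A minimal sketch, assuming the image and container name `knack-scraper` (illustrative, not set anywhere in the patch) and the `DATABASE_LOCATION` from the example `.env`:

```
docker build -t knack-scraper .

# Bind-mount ./data so the sqlite file survives container rebuilds;
# DATABASE_LOCATION='./data/knack.sqlite' resolves against /app inside the container.
docker run -d --name knack-scraper -v "$(pwd)/data:/app/data" knack-scraper
```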
%B %Y" - # TODO FIXME: this fails inside the docker container - locale.setlocale(locale.LC_TIME, "de_DE") pC = soup.find("div", {"class": "postContent"}) if pC is None: # not a normal post - logger.info( + logger.debug( "Page with id %d does not have a .pageContent-div. Skipping for now.", id ) return @@ -63,9 +69,13 @@ def download(id: int): # these fields are possible but not required # TODO: cleanup try: - date_string = pC.find("span", {"class": "singledate"}).text - parsed_date = datetime.strptime(date_string, date_format) - except AttributeError: + date_parts = pC.find("span", {"class": "singledate"}).text.split(' ') + day = int(date_parts[0][:-1]) + months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12} + month = months[date_parts[1]] + year = int(date_parts[2]) + parsed_date = datetime(year, month, day) + except Exception: parsed_date = None try: @@ -75,7 +85,7 @@ def download(id: int): try: category = pC.find("span", {"class": "categoryInfo"}).find_all() - category = [c.text for c in category] + category = [c.text for c in category if c.text != 'Alle Artikel'] category = ";".join(category) except AttributeError: category = None @@ -129,15 +139,79 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8): # sqlite can't handle lists so let's convert them to a single row csv # TODO: make sure our database is properly normalized - df = pd.DataFrame(res) + postdf = pd.DataFrame(res) + tagdf = None + posttotagdf = None + categorydf = None + postcategorydf = None - return df + # Extract and create tags dataframe + if not postdf.empty and 'tags' in postdf.columns: + # Collect all unique tags + all_tags = set() + for tags_str in postdf['tags']: + if pd.notna(tags_str): + tags_list = [tag.strip() for tag in tags_str.split(';')] + all_tags.update(tags_list) + + # Create tagdf with id and text columns + if all_tags: + all_tags = sorted(list(all_tags)) + tagdf = pd.DataFrame({ + 'id': range(len(all_tags)), + 'tag': all_tags + }) + + # Create posttotagdf mapping table + rows = [] + for post_id, tags_str in zip(postdf['id'], postdf['tags']): + if pd.notna(tags_str): + tags_list = [tag.strip() for tag in tags_str.split(';')] + for tag_text in tags_list: + tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0] + rows.append({'post_id': post_id, 'tag_id': tag_id}) + + if rows: + posttotagdf = pd.DataFrame(rows) + + # Extract and create categories dataframe + if not postdf.empty and 'category' in postdf.columns: + # Collect all unique categories + all_categories = set() + for category_str in postdf['category']: + if pd.notna(category_str): + category_list = [cat.strip() for cat in category_str.split(';')] + all_categories.update(category_list) + + # Create categorydf with id and category columns + if all_categories: + all_categories = sorted(list(all_categories)) + categorydf = pd.DataFrame({ + 'id': range(len(all_categories)), + 'category': all_categories + }) + + # Create postcategorydf mapping table + rows = [] + for post_id, category_str in zip(postdf['id'], postdf['category']): + if pd.notna(category_str): + category_list = [cat.strip() for cat in category_str.split(';')] + for category_text in category_list: + category_id = categorydf[categorydf['category'] == category_text]['id'].values[0] + rows.append({'post_id': post_id, 'category_id': category_id}) + + if rows: + postcategorydf = pd.DataFrame(rows) + + return postdf, tagdf, posttotagdf, categorydf, postcategorydf 
@@ -155,12 +229,77 @@ def main():
         max_id_in_db = -1
 
     con = sqlite3.connect(database_location)
-    df = run_downloads(
+    postdf, tagdf, posttotagdf, categorydf, postcategorydf = run_downloads(
         min_id=max_id_in_db + 1,
         max_id=max_id_in_db + n_scrapes,
         num_threads=num_threads,
     )
-    df.to_sql("posts", con, if_exists="append")
+    postdf.to_sql("posts", con, if_exists="append")
+
+    # Handle tags dataframe merging and storage
+    if tagdf is not None and not tagdf.empty:
+        # Check if tags table already exists
+        if table_exists("tags", con):
+            # Read existing tags from database
+            existing_tagdf = pd.read_sql("SELECT id, tag FROM tags", con)
+
+            # Merge new tags with existing tags, avoiding duplicates
+            merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
+            merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first')
+            merged_tagdf = merged_tagdf.reset_index(drop=True)
+            merged_tagdf['id'] = range(len(merged_tagdf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE tags")
+            con.commit()
+            merged_tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+            # Update tag_id references in posttotagdf
+            if posttotagdf is not None and not posttotagdf.empty:
+                posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
+                    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
+                )
+        else:
+            # First time creating tags table
+            tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+        # Store posttags (post to tags mapping)
+        if posttotagdf is not None and not posttotagdf.empty:
+            posttotagdf.to_sql("posttags", con, if_exists="append", index=False)
+
+    # Handle categories dataframe merging and storage
+    if categorydf is not None and not categorydf.empty:
+        # Check if categories table already exists
+        if table_exists("categories", con):
+            # Read existing categories from database
+            existing_categorydf = pd.read_sql("SELECT id, category FROM categories", con)
+
+            # Merge new categories with existing categories, avoiding duplicates
+            merged_categorydf = pd.concat([existing_categorydf, categorydf], ignore_index=False)
+            merged_categorydf = merged_categorydf.drop_duplicates(subset=['category'], keep='first')
+            merged_categorydf = merged_categorydf.reset_index(drop=True)
+            merged_categorydf['id'] = range(len(merged_categorydf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE categories")
+            con.commit()
+            merged_categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+            # Update category_id references in postcategorydf
+            if postcategorydf is not None and not postcategorydf.empty:
+                postcategorydf['category_id'] = postcategorydf['category_id'].map(
+                    lambda old_id: merged_categorydf[merged_categorydf['category'] == categorydf.loc[old_id, 'category']]['id'].values[0]
+                )
+        else:
+            # First time creating categories table
+            categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+        # Store postcategories (post to categories mapping)
+        if postcategorydf is not None and not postcategorydf.empty:
+            postcategorydf.to_sql("postcategories", con, if_exists="append", index=False)
+
+    logger.info(f"Scraped new entries; number of new posts: {len(postdf.index)}")
 
 
 if __name__ == "__main__":
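As a quick sanity check on the normalized schema main() now writes (tables posts, tags, posttags; column names as in the dataframes above), the tags can be joined back onto their posts. A minimal sketch, assuming the database path from the example `.env`:

```
import sqlite3

con = sqlite3.connect("./data/knack.sqlite")  # path from the example .env

# Reassemble each post's semicolon-joined tag list from the lookup tables.
rows = con.execute(
    """
    SELECT p.id, GROUP_CONCAT(t.tag, ';') AS tags
    FROM posts AS p
    JOIN posttags AS pt ON pt.post_id = p.id
    JOIN tags AS t ON t.id = pt.tag_id
    GROUP BY p.id
    """
).fetchall()
```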
diff --git a/requirements.txt b/requirements.txt
index 7792d83..3c59d8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,5 @@
-beautifulsoup4==4.12.2
-certifi==2023.7.22
-charset-normalizer==3.3.0
-idna==3.4
-numpy==1.26.1
-pandas==2.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-requests==2.31.0
-six==1.16.0
-soupsieve==2.5
-tqdm==4.66.1
-tzdata==2023.3
-urllib3==2.0.7
+pandas
+requests
+tqdm
+beautifulsoup4
+python-dotenv
\ No newline at end of file
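Because the cron line in the Dockerfile redirects output to /proc/1/fd/1 (the stdout of PID 1, which is cron running with `-f`), the scraper's log lines end up in the container's log stream. One way to verify the schedule and watch output, assuming the container name from the run example above:

```
docker exec knack-scraper crontab -l   # should print the */10 schedule
docker logs -f knack-scraper           # scraper output arrives via /proc/1/fd/1
```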