Dockerized Scraper

- Implements a Dockerized version of the scraper
- Atomizes the tags and categories columns into separate lookup and mapping tables
quorploop 2025-12-20 20:55:04 +01:00
parent 7cc3d1b7e4
commit bcd210ce01
5 changed files with 201 additions and 42 deletions

Dockerfile

@@ -7,9 +7,21 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+COPY .env .
 RUN apt update -y
-RUN apt install -y cron
-COPY crontab .
-RUN crontab crontab
+RUN apt install -y cron locales
 COPY main.py .
+ENV PYTHONUNBUFFERED=1
+ENV LANG=de_DE.UTF-8
+ENV LC_ALL=de_DE.UTF-8
+# Create a cron job that runs the scraper every 10 minutes and logs to the container's stdout
+RUN echo "*/10 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
+RUN chmod 0644 /etc/cron.d/knack-scraper
+RUN crontab /etc/cron.d/knack-scraper
+# Start cron in foreground
+CMD ["cron", "-f"]

README.md

@@ -0,0 +1,18 @@
+Knack-Scraper does exactly what its name suggests:
+it scrapes knack.news and writes the results to a SQLite
+database for later use.
+
+## Example for .env
+```
+NUM_THREADS=8
+NUM_SCRAPES=100
+DATABASE_LOCATION='./data/knack.sqlite'
+```
+
+## Run once
+```
+python main.py
+```
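As a rough sketch of the "later use" mentioned above, the scraped data can be read back with pandas. This is not part of the commit; the table and column names (posts.id, tags, posttags) follow the ones main.py writes in this commit, and the database path is taken from the example .env, so it assumes the scraper has already run at least once:

```
import sqlite3

import pandas as pd

# Path from the example .env above; adjust to your own DATABASE_LOCATION.
con = sqlite3.connect("./data/knack.sqlite")

# Posts joined with their tags via the posttags mapping table created by main.py.
query = """
SELECT posts.id AS post_id, tags.tag
FROM posts
JOIN posttags ON posttags.post_id = posts.id
JOIN tags ON tags.id = posttags.tag_id
"""
print(pd.read_sql(query, con).head())
con.close()
```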

crontab

@@ -1 +0,0 @@
-5 4 * * * python /app/main.py

main.py

@@ -1,25 +1,34 @@
 #! python3
-import locale
 import logging
 import os
 import sqlite3
+import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
-import sys

+from dotenv import load_dotenv
 import pandas as pd
 import requests
 import tqdm
 from bs4 import BeautifulSoup

-logger = logging.getLogger("knack-scraper")
-# ch = logging.StreamHandler()
-# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-# ch.setFormatter(formatter)
-# ch.setLevel(logging.INFO)
-# logger.addHandler(ch)
+load_dotenv()
+
+if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
+    logging_level = logging.INFO
+else:
+    logging_level = logging.DEBUG
+
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-scraper")


 def table_exists(tablename: str, con: sqlite3.Connection):
     query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
@@ -39,19 +48,16 @@ def download(id: int):
     if not (200 <= res.status_code <= 300):
         return

-    logger.info("Found promising page with id %d!", id)
+    logger.debug("Found promising page with id %d!", id)
     content = res.content
     soup = BeautifulSoup(content, "html.parser")

-    date_format = "%d. %B %Y"
-    # TODO FIXME: this fails inside the docker container
-    locale.setlocale(locale.LC_TIME, "de_DE")
     pC = soup.find("div", {"class": "postContent"})

     if pC is None:
         # not a normal post
-        logger.info(
+        logger.debug(
             "Page with id %d does not have a .pageContent-div. Skipping for now.", id
         )
         return
@@ -63,9 +69,13 @@ def download(id: int):
     # these fields are possible but not required
     # TODO: cleanup
     try:
-        date_string = pC.find("span", {"class": "singledate"}).text
-        parsed_date = datetime.strptime(date_string, date_format)
-    except AttributeError:
+        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
+        day = int(date_parts[0][:-1])
+        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
+        month = months[date_parts[1]]
+        year = int(date_parts[2])
+        parsed_date = datetime(year, month, day)
+    except Exception:
         parsed_date = None

     try:
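The hunk above replaces locale-dependent parsing (locale.setlocale failed inside the container) with a hand-rolled month map. A standalone sketch of the same idea for strings like "20. Dezember 2025"; parse_german_date is a hypothetical helper name, not part of the commit:

```
from datetime import datetime

# German month names mapped to month numbers, mirroring the dict used in download().
MONTHS = {
    'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6,
    'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12,
}

def parse_german_date(text: str) -> datetime:
    """Parse a date like '20. Dezember 2025' without touching the process-wide locale."""
    day, month, year = text.split(' ')
    return datetime(int(year), MONTHS[month], int(day.rstrip('.')))

print(parse_german_date("20. Dezember 2025"))  # 2025-12-20 00:00:00
```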
@@ -75,7 +85,7 @@ def download(id: int):
     try:
         category = pC.find("span", {"class": "categoryInfo"}).find_all()
-        category = [c.text for c in category]
+        category = [c.text for c in category if c.text != 'Alle Artikel']
         category = ";".join(category)
     except AttributeError:
         category = None
@@ -129,15 +139,79 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
     # sqlite can't handle lists so let's convert them to a single row csv
     # TODO: make sure our database is properly normalized
-    df = pd.DataFrame(res)
+    postdf = pd.DataFrame(res)
+    tagdf = None
+    posttotagdf = None
+    categorydf = None
+    postcategorydf = None

-    return df
+    # Extract and create tags dataframe
+    if not postdf.empty and 'tags' in postdf.columns:
+        # Collect all unique tags
+        all_tags = set()
+        for tags_str in postdf['tags']:
+            if pd.notna(tags_str):
+                tags_list = [tag.strip() for tag in tags_str.split(';')]
+                all_tags.update(tags_list)
+
+        # Create tagdf with id and tag columns
+        if all_tags:
+            all_tags = sorted(list(all_tags))
+            tagdf = pd.DataFrame({
+                'id': range(len(all_tags)),
+                'tag': all_tags
+            })
+
+            # Create posttotagdf mapping table
+            rows = []
+            for post_id, tags_str in zip(postdf['id'], postdf['tags']):
+                if pd.notna(tags_str):
+                    tags_list = [tag.strip() for tag in tags_str.split(';')]
+                    for tag_text in tags_list:
+                        tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'tag_id': tag_id})
+            if rows:
+                posttotagdf = pd.DataFrame(rows)
+
+    # Extract and create categories dataframe
+    if not postdf.empty and 'category' in postdf.columns:
+        # Collect all unique categories
+        all_categories = set()
+        for category_str in postdf['category']:
+            if pd.notna(category_str):
+                category_list = [cat.strip() for cat in category_str.split(';')]
+                all_categories.update(category_list)
+
+        # Create categorydf with id and category columns
+        if all_categories:
+            all_categories = sorted(list(all_categories))
+            categorydf = pd.DataFrame({
+                'id': range(len(all_categories)),
+                'category': all_categories
+            })
+
+            # Create postcategorydf mapping table
+            rows = []
+            for post_id, category_str in zip(postdf['id'], postdf['category']):
+                if pd.notna(category_str):
+                    category_list = [cat.strip() for cat in category_str.split(';')]
+                    for category_text in category_list:
+                        category_id = categorydf[categorydf['category'] == category_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'category_id': category_id})
+            if rows:
+                postcategorydf = pd.DataFrame(rows)
+
+    return postdf, tagdf, posttotagdf, categorydf, postcategorydf


 def main():
     num_threads = int(os.environ.get("NUM_THREADS", 8))
     n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
-    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
+    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
+    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

     con = sqlite3.connect(database_location)
     with con:
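Not part of the commit, but for comparison: the same tags/categories normalization can be expressed with pandas' explode plus a merge instead of nested loops. A sketch under the assumption that postdf carries the id, tags and category columns used above; normalize_multivalue is a hypothetical helper:

```
import pandas as pd

def normalize_multivalue(postdf: pd.DataFrame, column: str, value_name: str):
    """Split a ';'-joined column into a lookup table plus a post-to-value mapping,
    mirroring what run_downloads does for tags and categories."""
    exploded = (
        postdf[['id', column]]
        .dropna(subset=[column])
        .assign(**{value_name: lambda d: d[column].str.split(';')})
        .explode(value_name)
    )
    exploded[value_name] = exploded[value_name].str.strip()

    # Lookup table: one row per distinct value, with a small integer id.
    uniques = sorted(exploded[value_name].unique())
    lookup = pd.DataFrame({'id': range(len(uniques)), value_name: uniques})

    # Mapping table: post id -> value id, resolved by merging on the value text.
    mapping = (
        exploded.rename(columns={'id': 'post_id'})
        .merge(lookup, on=value_name)[['post_id', 'id']]
        .rename(columns={'id': f'{value_name}_id'})
    )
    return lookup, mapping

# e.g. tagdf, posttotagdf = normalize_multivalue(postdf, 'tags', 'tag')
#      categorydf, postcategorydf = normalize_multivalue(postdf, 'category', 'category')
```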
@@ -155,12 +229,77 @@ def main():
             max_id_in_db = -1

     con = sqlite3.connect(database_location)
-    df = run_downloads(
+    postdf, tagdf, posttotagdf, categorydf, postcategorydf = run_downloads(
         min_id=max_id_in_db + 1,
         max_id=max_id_in_db + n_scrapes,
         num_threads=num_threads,
     )
-    df.to_sql("posts", con, if_exists="append")
+    postdf.to_sql("posts", con, if_exists="append")
+
+    # Handle tags dataframe merging and storage
+    if tagdf is not None and not tagdf.empty:
+        # Check if tags table already exists
+        if table_exists("tags", con):
+            # Read existing tags from database
+            existing_tagdf = pd.read_sql("SELECT id, tag FROM tags", con)
+
+            # Merge new tags with existing tags, avoiding duplicates
+            merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
+            merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first')
+            merged_tagdf = merged_tagdf.reset_index(drop=True)
+            merged_tagdf['id'] = range(len(merged_tagdf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE tags")
+            con.commit()
+            merged_tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+            # Update tag_id references in posttotagdf
+            if posttotagdf is not None and not posttotagdf.empty:
+                #tag_mapping = dict(zip(tagdf['tag'], tagdf['id']))
+                posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
+                    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
+                )
+        else:
+            # First time creating tags table
+            tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+    # Store posttags (post to tags mapping)
+    if posttotagdf is not None and not posttotagdf.empty:
+        posttotagdf.to_sql("posttags", con, if_exists="append", index=False)
+
+    # Handle categories dataframe merging and storage
+    if categorydf is not None and not categorydf.empty:
+        # Check if categories table already exists
+        if table_exists("categories", con):
+            # Read existing categories from database
+            existing_categorydf = pd.read_sql("SELECT id, category FROM categories", con)
+
+            # Merge new categories with existing categories, avoiding duplicates
+            merged_categorydf = pd.concat([existing_categorydf, categorydf], ignore_index=False)
+            merged_categorydf = merged_categorydf.drop_duplicates(subset=['category'], keep='first')
+            merged_categorydf = merged_categorydf.reset_index(drop=True)
+            merged_categorydf['id'] = range(len(merged_categorydf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE categories")
+            con.commit()
+            merged_categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+            # Update category_id references in postcategorydf
+            if postcategorydf is not None and not postcategorydf.empty:
+                postcategorydf['category_id'] = postcategorydf['category_id'].map(
+                    lambda old_id: merged_categorydf[merged_categorydf['category'] == categorydf.loc[old_id, 'category']]['id'].values[0]
+                )
+        else:
+            # First time creating categories table
+            categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+    # Store postcategories (post to categories mapping)
+    if postcategorydf is not None and not postcategorydf.empty:
+        postcategorydf.to_sql("postcategories", con, if_exists="append", index=False)
+
+    logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


 if __name__ == "__main__":
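A note on the design choice in the hunk above: dropping and rebuilding the tags/categories tables regenerates every id, which is why the mapping dataframes have to be remapped afterwards. An alternative sketch, not the committed code, that keeps ids stable by pushing the uniqueness check into SQLite; the table and column names follow this commit, the tag values are hypothetical:

```
import sqlite3

con = sqlite3.connect("./data/knack.sqlite")  # path from the example .env
with con:
    # UNIQUE on the tag text lets INSERT OR IGNORE skip duplicates, so existing
    # rows, and therefore existing tag ids, stay untouched between runs.
    con.execute(
        "CREATE TABLE IF NOT EXISTS tags ("
        "id INTEGER PRIMARY KEY AUTOINCREMENT, tag TEXT UNIQUE)"
    )
    new_tags = ["Kultur", "Politik"]  # hypothetical tags from the current run
    con.executemany(
        "INSERT OR IGNORE INTO tags (tag) VALUES (?)", [(t,) for t in new_tags]
    )

    # Resolve ids for the posttags mapping by looking the text back up.
    placeholders = ",".join("?" * len(new_tags))
    tag_ids = dict(
        con.execute(f"SELECT tag, id FROM tags WHERE tag IN ({placeholders})", new_tags)
    )
    print(tag_ids)
con.close()
```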

requirements.txt

@@ -1,14 +1,5 @@
-beautifulsoup4==4.12.2
-certifi==2023.7.22
-charset-normalizer==3.3.0
-idna==3.4
-numpy==1.26.1
-pandas==2.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-requests==2.31.0
-six==1.16.0
-soupsieve==2.5
-tqdm==4.66.1
-tzdata==2023.3
-urllib3==2.0.7
+pandas
+requests
+tqdm
+bs4
+dotenv