Implements feature to clean up the author free-text field
This commit is contained in:
parent
bcd210ce01
commit
64df8fb328
14 changed files with 804 additions and 310 deletions
29
scrape/Dockerfile
Normal file
@@ -0,0 +1,29 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

#COPY /data/knack.sqlite /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

COPY main.py .

ENV PYTHONUNBUFFERED=1
ENV LANG=de_DE.UTF-8
ENV LC_ALL=de_DE.UTF-8

# Create cron job that runs the scraper daily at 04:05 and logs to the container's stdout
RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
RUN chmod 0644 /etc/cron.d/knack-scraper
RUN crontab /etc/cron.d/knack-scraper

# Start cron in foreground
CMD ["cron", "-f"]
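Note: the crontab entry above runs main.py once a day at 04:05 and redirects its output to the container's stdout via /proc/1/fd/1; runtime configuration comes from the .env file baked into the image. As a quick, purely illustrative smoke test (not part of this commit; the file name smoke_check.py is hypothetical), one could exec into the running container and check that the target site is reachable and that the SQLite database referenced by DATABASE_LOCATION exists:

# smoke_check.py -- hypothetical helper, not included in this commit
import os
import sqlite3

import requests

db_path = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

# The scraper fetches https://knack.news/<id>; a plain GET on the front page
# is enough to confirm outbound connectivity from the container.
resp = requests.get("https://knack.news/")
print("knack.news reachable:", resp.status_code)

# sqlite3.connect() would silently create an empty file, so check existence first.
if os.path.exists(db_path):
    con = sqlite3.connect(db_path)
    tables = [row[0] for row in con.execute(
        "SELECT name FROM sqlite_master WHERE type='table'")]
    print("tables:", tables)
    con.close()
else:
    print("database not found at", db_path)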
260
scrape/main.py
Executable file
@@ -0,0 +1,260 @@
#!/usr/bin/env python3
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import sys

from dotenv import load_dotenv
import pandas as pd
import requests
from bs4 import BeautifulSoup

load_dotenv()

if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-scraper")


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def split_semicolon_list(value: str):
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
    """Extract unique dimension values and post-to-dimension mappings from a column."""
    if postdf.empty or field_name not in postdf.columns:
        return None, None

    values = set()
    mapping_rows = []

    for post_id, raw in zip(postdf['id'], postdf[field_name]):
        items = split_semicolon_list(raw)
        for item in items:
            values.add(item)
            mapping_rows.append({'post_id': post_id, dim_col: item})

    if not values:
        return None, None

    dim_df = pd.DataFrame({
        'id': range(len(values)),
        dim_col: sorted(values),
    })
    map_df = pd.DataFrame(mapping_rows)
    return dim_df, map_df


def store_dimension_and_mapping(
    con: sqlite3.Connection,
    dim_df: pd.DataFrame | None,
    map_df: pd.DataFrame | None,
    table_name: str,
    dim_col: str,
    mapping_table: str,
    mapping_id_col: str,
):
    """Persist a dimension table and its mapping table, merging with existing values."""
    if dim_df is None or dim_df.empty:
        return

    if table_exists(table_name, con):
        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
        merged = pd.concat([existing, dim_df], ignore_index=True)
        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
        merged['id'] = range(len(merged))
    else:
        merged = dim_df.copy()

    # Replace table with merged content
    merged.to_sql(table_name, con, if_exists="replace", index=False)

    if map_df is None or map_df.empty:
        return

    value_to_id = dict(zip(merged[dim_col], merged['id']))
    map_df = map_df.copy()
    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
    map_df = map_df[['post_id', mapping_id_col]].dropna()
    map_df.to_sql(mapping_table, con, if_exists="append", index=False)


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't DoS knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")

    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.debug(
            "Page with id %d does not have a .postContent div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
        day = int(date_parts[0][:-1])
        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
        month = months[date_parts[1]]
        year = int(date_parts[2])
        parsed_date = datetime(year, month, day)
    except Exception:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
        "is_cleaned": False
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in futures:
            post = future.result()
            if post is not None:
                res.append(post)

    postdf = pd.DataFrame(res)
    return postdf


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1

        postdf = run_downloads(
            min_id=max_id_in_db + 1,
            max_id=max_id_in_db + n_scrapes,
            num_threads=num_threads,
        )

        postdf.to_sql("posts", con, if_exists="append")

        # Tags
        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
        store_dimension_and_mapping(
            con,
            tag_dim,
            tag_map,
            table_name="tags",
            dim_col="tag",
            mapping_table="posttags",
            mapping_id_col="tag_id",
        )

        # Categories
        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
        store_dimension_and_mapping(
            con,
            category_dim,
            category_map,
            table_name="categories",
            dim_col="category",
            mapping_table="postcategories",
            mapping_id_col="category_id",
        )

        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


if __name__ == "__main__":
    main()
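Note: main.py stores tags and categories in small dimension tables (tags, categories) and links them to posts through the mapping tables posttags and postcategories. Below is a minimal sketch of how a consumer could read that schema back, assuming the database at the default DATABASE_LOCATION has already been populated by main.py; the query is illustrative and not part of this commit.

# read_tags.py -- illustrative only, not included in this commit
import sqlite3

import pandas as pd

con = sqlite3.connect("../data/knack.sqlite")  # default DATABASE_LOCATION from main.py

# Join posts to their tags through the posttags mapping table.
query = """
SELECT p.id, p.title, p.author, t.tag
FROM posts AS p
JOIN posttags AS pt ON pt.post_id = p.id
JOIN tags AS t ON t.id = pt.tag_id
ORDER BY p.id
"""
tagged_posts = pd.read_sql(query, con)
print(tagged_posts.head())
con.close()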
4
scrape/requirements.txt
Normal file
@@ -0,0 +1,4 @@
pandas
requests
beautifulsoup4
python-dotenv