#! python3
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-scraper")


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def split_semicolon_list(value: str):
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
    """Extract unique dimension values and post-to-dimension mappings from a column."""
    if postdf.empty or field_name not in postdf.columns:
        return None, None

    values = set()
    mapping_rows = []
    for post_id, raw in zip(postdf['id'], postdf[field_name]):
        items = split_semicolon_list(raw)
        for item in items:
            values.add(item)
            mapping_rows.append({'post_id': post_id, dim_col: item})

    if not values:
        return None, None

    dim_df = pd.DataFrame({
        'id': range(len(values)),
        dim_col: sorted(values),
    })
    map_df = pd.DataFrame(mapping_rows)
    return dim_df, map_df


def store_dimension_and_mapping(
    con: sqlite3.Connection,
    dim_df: pd.DataFrame | None,
    map_df: pd.DataFrame | None,
    table_name: str,
    dim_col: str,
    mapping_table: str,
    mapping_id_col: str,
):
    """Persist a dimension table and its mapping table, merging with existing values."""
    if dim_df is None or dim_df.empty:
        return

    if table_exists(table_name, con):
        # Read existing rows in id order so previously assigned ids keep their
        # position (and therefore their value) when ids are reassigned below.
        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name} ORDER BY id", con)
        merged = pd.concat([existing, dim_df], ignore_index=True)
        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
        merged['id'] = range(len(merged))
    else:
        merged = dim_df.copy()

    # Replace table with merged content
    merged.to_sql(table_name, con, if_exists="replace", index=False)

    if map_df is None or map_df.empty:
        return

    value_to_id = dict(zip(merged[dim_col], merged['id']))
    map_df = map_df.copy()
    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
    map_df = map_df[['post_id', mapping_id_col]].dropna()
    map_df.to_sql(mapping_table, con, if_exists="append", index=False)


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    # time out instead of hanging a worker thread on an unresponsive server
    try:
        res = requests.get(url, timeout=30)
    except requests.RequestException:
        logger.warning("Request for id %d failed, skipping.", id)
        return
    # make sure we don't DoS knack
    time.sleep(2)

    # only 2xx responses are treated as existing posts
    if not (200 <= res.status_code < 300):
        return

    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")

    pC = soup.find("div", {"class": "postContent"})
    if pC is None:
        # not a normal post
        logger.debug(
            "Page with id %d does not have a .postContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        # dates look like "12. Januar 2023", so map the German month names
        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
        day = int(date_parts[0][:-1])
        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6,
                  'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
        month = months[date_parts[1]]
        year = int(date_parts[2])
        parsed_date = datetime(year, month, day)
    except Exception:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
        "is_cleaned": False
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []
    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # submit one download per id; results are collected in submission order
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
        for future in futures:
            post = future.result()
            if post is not None:
                res.append(post)

    postdf = pd.DataFrame(res)
    return postdf


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

    logger.debug(
        "Started Knack Scraper:\nNUM_THREADS: %d\nN_SCRAPES: %d\nDATABASE_LOCATION: %s",
        num_threads, n_scrapes, database_location,
    )

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            if max_id_in_db is None:
                # the table exists but holds no rows yet
                max_id_in_db = -1
            else:
                logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1

        # the id range is half-open, so this scans exactly n_scrapes new ids
        postdf = run_downloads(
            min_id=max_id_in_db + 1,
            max_id=max_id_in_db + 1 + n_scrapes,
            num_threads=num_threads,
        )

        if postdf.empty:
            logger.info("no new posts found in the scanned id range")
            return

        # Build tag and category dimensions before dropping the columns,
        # otherwise there is nothing left to extract them from.
        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')

        # Drop category and tags columns as they're stored in separate tables
        postdf = postdf.drop(columns=['category', 'tags'])
        postdf.to_sql("posts", con, if_exists="append", index=False)

        # Tags
        store_dimension_and_mapping(
            con,
            tag_dim,
            tag_map,
            table_name="tags",
            dim_col="tag",
            mapping_table="posttags",
            mapping_id_col="tag_id",
        )

        # Categories
        store_dimension_and_mapping(
            con,
            category_dim,
            category_map,
            table_name="categories",
            dim_col="category",
            mapping_table="postcategories",
            mapping_id_col="category_id",
        )

        logger.info("scraped new entries. number of new posts: %d", len(postdf.index))


if __name__ == "__main__":
    main()
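
# A minimal sketch of a matching .env file, assuming python-dotenv picks it up
# from the working directory via load_dotenv() above. The variable names are
# the ones this script reads; the values shown are just the defaults it falls
# back to and can be adjusted freely:
#
#   LOGGING_LEVEL=INFO
#   NUM_THREADS=8
#   NUM_SCRAPES=100
#   DATABASE_LOCATION=../data/knack.sqlite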