#!/usr/bin/env python3
import locale
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection) -> bool:
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(post_id: int):
    if post_id == 0:
        return None

    base_url = "https://knack.news/"
    url = f"{base_url}{post_id}"
    res = requests.get(url, timeout=30)
    # make sure we don't DoS knack
    time.sleep(2)

    # only 2xx responses carry a post
    if not (200 <= res.status_code < 300):
        return None

    logger.info("Found promising page with id %d!", post_id)
    soup = BeautifulSoup(res.content, "html.parser")

    # post dates use German month names, so switch LC_TIME before parsing
    # NOTE: setlocale is process-wide and not thread-safe
    # TODO FIXME: this fails inside the docker container
    date_format = "%d. %B %Y"
    locale.setlocale(locale.LC_TIME, "de_DE")

    post_content = soup.find("div", {"class": "postContent"})
    if post_content is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .postContent div. Skipping for now.",
            post_id,
        )
        return None

    # every post has these fields
    title = post_content.find("h3", {"class": "postTitle"}).text
    post_text = post_content.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = post_content.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = post_content.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = post_content.find("span", {"class": "categoryInfo"}).find().text
    except AttributeError:
        category = None

    try:
        tags = [
            x.text
            for x in post_content.find("div", {"class": "tagsInfo"}).find_all("a")
        ]
    except AttributeError:
        tags = None

    img = post_content.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    return {
        "id": post_id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": post_text.text,
        "html": str(post_text),
        "scraped_at": datetime.now(),
    }


def run_downloads(min_id: int, max_id: int, num_threads: int = 8) -> pd.DataFrame:
    res = []
    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
        # tqdm to track progress
        for future in tqdm.tqdm(futures, total=max_id - min_id):
            post = future.result()
            if post is not None:
                res.append(post)

    if not res:
        # nothing found - return an empty frame so callers can skip the write
        return pd.DataFrame()

    # sqlite can't handle lists, so serialize tags into a single "; "-joined column
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)
    df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)
    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            if max_id_in_db is None:
                # the table exists but holds no rows yet
                max_id_in_db = -1
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + 1 + n_scrapes,
        num_threads=num_threads,
    )
    if not df.empty:
        # index=False keeps the DataFrame index from leaking into the table
        df.to_sql("posts", con, if_exists="append", index=False)


if __name__ == "__main__":
    main()