From 7edf451e2ed4a13f09b2b87a02aa6b109da4c966 Mon Sep 17 00:00:00 2001
From: lukaszett <4248383+lukaszett@users.noreply.github.com>
Date: Thu, 19 Oct 2023 22:53:36 +0200
Subject: [PATCH] initial commit

---
 .gitignore       |   3 +
 Dockerfile       |  15 ++++
 Makefile         |   2 +
 crontab          |   1 +
 main.py          | 222 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  14 +++
 6 files changed, 257 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 crontab
 create mode 100755 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2e7a5cf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+data/
+venv/
+.DS_STORE
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9c94fd6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:slim
+
+RUN mkdir /app
+RUN mkdir /data
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN apt update -y
+RUN apt install -y cron
+COPY crontab .
+RUN crontab crontab
+
+COPY main.py .
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a669090
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+build:
+	docker build -t knack-scraper .
\ No newline at end of file
diff --git a/crontab b/crontab
new file mode 100644
index 0000000..6b6ae11
--- /dev/null
+++ b/crontab
@@ -0,0 +1 @@
+5 4 * * * python /app/main.py
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..5305653
--- /dev/null
+++ b/main.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""Scrape posts from knack.news into a SQLite database.
+
+Posts live at ``https://knack.news/<id>``. Every run resumes after the highest
+id already stored and probes the next ``NUM_SCRAPES`` ids.
+"""
+import locale
+import logging
+import os
+import sqlite3
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+
+import pandas as pd
+import requests
+import tqdm
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger("knack-scraper")
+
+BASE_URL = "https://knack.news/"
+# seconds to wait for knack before giving up on a single request
+REQUEST_TIMEOUT = 30
+# pause after every request so we don't dos knack
+REQUEST_DELAY = 2
+
+# Post dates look like "19. Oktober 2023". The month names are mapped by hand
+# because locale.setlocale() is process-wide, not thread-safe and needs the
+# de_DE locale to be installed, which is not the case in the docker image.
+GERMAN_MONTHS = {
+    "Januar": 1,
+    "Februar": 2,
+    "M\u00e4rz": 3,
+    "April": 4,
+    "Mai": 5,
+    "Juni": 6,
+    "Juli": 7,
+    "August": 8,
+    "September": 9,
+    "Oktober": 10,
+    "November": 11,
+    "Dezember": 12,
+}
+
+
+def table_exists(tablename: str, con: sqlite3.Connection) -> bool:
+    """Return True if a table called `tablename` exists in the database."""
+    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
+    return con.execute(query, [tablename]).fetchone() is not None
+
+
+def parse_german_date(date_string: str):
+    """Parse a date like "19. Oktober 2023"; return None if it does not match."""
+    try:
+        day, month_name, year = date_string.split()
+        month = GERMAN_MONTHS[month_name]
+        return datetime(int(year), month, int(day.rstrip(".")))
+    except (KeyError, ValueError):
+        return None
+
+
+def text_or_none(tag):
+    """Return the text of a BeautifulSoup tag, or None if the tag is missing."""
+    return tag.text if tag is not None else None
+
+
+def download(id: int):
+    """Download and parse the post with the given id.
+
+    Returns a dict with the post's fields, or None if the id is 0, the request
+    fails, the response is not a 2xx or the page is not a regular post.
+    """
+    if id == 0:
+        return None
+    url = f"{BASE_URL}{id}"
+    try:
+        res = requests.get(url, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException as err:
+        # one unreachable page must not abort the whole scrape
+        logger.warning("Request for id %d failed: %s", id, err)
+        res = None
+
+    # make sure we don't dos knack
+    time.sleep(REQUEST_DELAY)
+
+    if res is None or not (200 <= res.status_code < 300):
+        return None
+
+    logger.info("Found promising page with id %d!", id)
+
+    soup = BeautifulSoup(res.content, "html.parser")
+    pC = soup.find("div", {"class": "postContent"})
+
+    if pC is None:
+        # not a normal post
+        logger.info(
+            "Page with id %d does not have a .postContent-div. Skipping for now.", id
+        )
+        return None
+
+    # every post is expected to have these fields
+    title = pC.find("h3", {"class": "postTitle"})
+    postText = pC.find("div", {"class": "postText"})
+    if title is None or postText is None:
+        logger.info("Page with id %d has no title or text. Skipping for now.", id)
+        return None
+
+    # these fields are possible but not required
+    date_string = text_or_none(pC.find("span", {"class": "singledate"}))
+    parsed_date = parse_german_date(date_string) if date_string else None
+
+    author = text_or_none(pC.find("span", {"class": "author"}))
+
+    category_info = pC.find("span", {"class": "categoryInfo"})
+    category = None
+    if category_info is not None:
+        category = text_or_none(category_info.find())
+
+    tags_info = pC.find("div", {"class": "tagsInfo"})
+    tags = None
+    if tags_info is not None:
+        tags = [x.text for x in tags_info.find_all("a")]
+
+    img = pC.find("img", {"class": "postImage"})
+    img_link = img.get("src") if img is not None else None
+
+    return {
+        "id": id,
+        "title": title.text,
+        "author": author,
+        "date": parsed_date,
+        "category": category,
+        "url": url,
+        "img_link": img_link,
+        "tags": tags,
+        "text": postText.text,
+        "html": str(postText),
+        "scraped_at": datetime.now(),
+    }
+
+
+def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
+    """Scrape the posts with ids in [min_id, max_id) using a thread pool.
+
+    Returns a DataFrame with one row per post found. It is empty and has no
+    columns if none of the ids led to a post.
+    """
+    res = []
+
+    logger.info(
+        "Started parallel scrape of posts from id %d to id %d using %d threads.",
+        min_id,
+        max_id - 1,
+        num_threads,
+    )
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
+
+        for future in tqdm.tqdm(futures, total=len(futures)):  # track progress
+            post = future.result()
+            if post is not None:
+                res.append(post)
+
+    df = pd.DataFrame(res)
+    if df.empty:
+        # without any posts there is no tags column to convert
+        return df
+
+    # sqlite can't handle lists so let's convert them to a single row csv
+    # TODO: make sure our database is properly normalized
+    df["tags"] = df["tags"].apply(lambda x: "; ".join(x) if x is not None else None)
+
+    return df
+
+
+def main():
+    """Scrape the next NUM_SCRAPES post ids and append the posts to the database."""
+    # without a handler the info messages below would never be shown
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+
+    num_threads = int(os.environ.get("NUM_THREADS", 8))
+    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
+    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
+
+    con = sqlite3.connect(database_location)
+    try:
+        max_id_in_db = None
+        if table_exists("posts", con):
+            # retrieve max post id from db so
+            # we can skip retrieving known posts
+            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
+
+        # MAX(id) is NULL if the table exists but is empty
+        if max_id_in_db is None:
+            logger.info("no posts scraped so far - starting from 0")
+            max_id_in_db = -1
+        else:
+            logger.info("found posts retrieved earlier")
+            logger.info("Got max id %d!", max_id_in_db)
+
+        min_id = max_id_in_db + 1
+        df = run_downloads(
+            min_id=min_id,
+            # max_id is exclusive, so this probes exactly n_scrapes ids
+            max_id=min_id + n_scrapes,
+            num_threads=num_threads,
+        )
+        if df.empty:
+            logger.info("no new posts found")
+        else:
+            df.to_sql("posts", con, if_exists="append")
+    finally:
+        con.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7792d83
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+beautifulsoup4==4.12.2
+certifi==2023.7.22
+charset-normalizer==3.3.0
+idna==3.4
+numpy==1.26.1
+pandas==2.1.1
+python-dateutil==2.8.2
+pytz==2023.3.post1
+requests==2.31.0
+six==1.16.0
+soupsieve==2.5
+tqdm==4.66.1
+tzdata==2023.3
+urllib3==2.0.7