initial commit

This commit is contained in:
lukaszett 2023-10-19 22:53:36 +02:00
commit 7edf451e2e
6 changed files with 200 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
data/
venv/
.DS_Store

15
Dockerfile Normal file
View file

@ -0,0 +1,15 @@
# Image that runs the knack scraper on a cron schedule.
FROM python:slim

# /data is the directory of main.py's default DATABASE_LOCATION.
RUN mkdir /data

# WORKDIR creates /app if it does not exist yet.
WORKDIR /app

# Update and install in ONE layer so a cached, stale package index can never be
# combined with a fresh install step; drop the index afterwards to keep the
# image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends cron \
    && rm -rf /var/lib/apt/lists/*

# Dependencies first so code changes do not invalidate the pip layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY crontab .
RUN crontab crontab
COPY main.py .

# Without an explicit command the container would start the base image's
# default (an interactive Python) and cron would never run. Keep cron in the
# foreground so it is the container's main process.
# NOTE(review): cron jobs do not inherit the container's environment, so
# NUM_THREADS / NUM_SCRAPES / DATABASE_LOCATION passed via `docker run -e`
# are presumably not visible to main.py - confirm and export them if needed.
CMD ["cron", "-f"]

2
Makefile Normal file
View file

@ -0,0 +1,2 @@
# `build` names an action, not a file; declaring it phony keeps the target
# working even if a file or directory called `build` ever appears.
.PHONY: build

# Build the scraper image from the Dockerfile in this directory.
build:
	docker build -t knack-scraper .

1
crontab Normal file
View file

@ -0,0 +1 @@
# Run the scraper every day at 04:05.
# cron starts jobs with a minimal PATH that does not contain /usr/local/bin,
# which is where the python:slim base image installs the interpreter, so the
# interpreter is referenced by its absolute path.
5 4 * * * /usr/local/bin/python /app/main.py

165
main.py Executable file
View file

@ -0,0 +1,165 @@
#!/usr/bin/env python3
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
# Module-wide logger used by every function in this script.
logger = logging.getLogger("knack-scraper")
# NOTE(review): the handler setup below is commented out and nothing else in
# this file attaches a handler or sets a level, so the logger.info(...) calls
# are presumably not emitted anywhere - confirm whether that is intended.
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)
def table_exists(tablename: str, con: sqlite3.Connection):
    """Report whether the database contains a table called ``tablename``.

    The lookup queries ``sqlite_master`` with a bound parameter, so the name
    is never interpolated into the SQL text.

    Args:
        tablename: Exact name of the table to look for.
        con: Open SQLite connection to inspect.

    Returns:
        ``True`` if ``sqlite_master`` lists a table with that name,
        ``False`` otherwise.
    """
    matching_rows = con.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1",
        [tablename],
    ).fetchall()
    return bool(matching_rows)
# German month names (what ``%B`` would match under a de_DE locale) mapped to
# month numbers. Parsing them by hand avoids ``locale.setlocale``, which
# changes process-global state from worker threads and raises ``locale.Error``
# when the ``de_DE`` locale is not installed (e.g. inside the container).
_GERMAN_MONTHS = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}


def _parse_german_date(date_string: str):
    """Parse a date like ``"19. Oktober 2023"``; return ``None`` if it does not fit."""
    try:
        day_part, month_name, year_part = date_string.split()
        return datetime(
            int(year_part),
            _GERMAN_MONTHS[month_name.lower()],
            int(day_part.rstrip(".")),
        )
    except (KeyError, ValueError):
        # Wrong number of parts, unknown month name, or non-numeric/invalid day or year.
        return None


def _text_or_none(tag):
    """Return ``tag.text`` or ``None`` when the element was not found."""
    return tag.text if tag is not None else None


def download(id: int):
    """Fetch the page ``https://knack.news/<id>`` and extract the post on it.

    Args:
        id: Numeric id appended to the base URL. ``0`` is skipped without a
            request.

    Returns:
        A dict with the keys ``id``, ``title``, ``author``, ``date``,
        ``category``, ``url``, ``img_link``, ``tags``, ``text``, ``html`` and
        ``scraped_at``. Optional fields (``author``, ``date``, ``category``,
        ``img_link``, ``tags``) are ``None`` when the page lacks them.
        Returns ``None`` when ``id`` is 0, the response status is not 2xx,
        or the page has no post content, title or text.

    Raises:
        requests.RequestException: If the request fails or times out; it is
            deliberately not swallowed so a network problem is not mistaken
            for a missing post.
    """
    if id == 0:
        return None

    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    # A timeout keeps one stalled connection from blocking a worker forever.
    res = requests.get(url, timeout=30)
    # make sure we don't dos knack
    time.sleep(2)

    # Only 2xx counts as success (300 is already a redirection status).
    if not 200 <= res.status_code < 300:
        return None

    logger.info("Found promising page with id %d!", id)

    soup = BeautifulSoup(res.content, "html.parser")
    post_content = soup.find("div", {"class": "postContent"})
    if post_content is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .postContent-div. Skipping for now.", id
        )
        return None

    # Every regular post is expected to have these; skip the page instead of
    # crashing the whole batch when one of them is missing.
    title_tag = post_content.find("h3", {"class": "postTitle"})
    post_text = post_content.find("div", {"class": "postText"})
    if title_tag is None or post_text is None:
        logger.warning("Page with id %d has no title or text. Skipping.", id)
        return None

    # These fields are possible but not required.
    parsed_date = None
    date_tag = post_content.find("span", {"class": "singledate"})
    if date_tag is not None:
        parsed_date = _parse_german_date(date_tag.text)
        if parsed_date is None:
            logger.warning(
                "Could not parse date %r on page with id %d.", date_tag.text, id
            )

    author = _text_or_none(post_content.find("span", {"class": "author"}))

    category = None
    category_info = post_content.find("span", {"class": "categoryInfo"})
    if category_info is not None:
        # The category name is the text of the first element inside the span.
        category = _text_or_none(category_info.find())

    tags = None
    tags_info = post_content.find("div", {"class": "tagsInfo"})
    if tags_info is not None:
        tags = [link.text for link in tags_info.find_all("a")]

    img_link = None
    img = post_content.find("img", {"class": "postImage"})
    if img is not None:
        # .get avoids a KeyError for an <img> without a src attribute.
        img_link = img.get("src")

    return {
        "id": id,
        "title": title_tag.text,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img_link,
        "tags": tags,
        "text": post_text.text,
        "html": str(post_text),
        "scraped_at": datetime.now(),
    }
def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    """Scrape the posts with ids in ``range(min_id, max_id)`` in parallel.

    Args:
        min_id: First id to scrape (inclusive).
        max_id: End of the id range (exclusive).
        num_threads: Number of worker threads in the pool.

    Returns:
        A ``pandas.DataFrame`` with one row per post that ``download``
        returned. The ``tags`` column holds the tags joined with ``"; "``
        (or ``None``). When no post was found the frame is empty and has no
        columns.

    Raises:
        Exception: Whatever a ``download`` call raised is re-raised here by
            ``future.result()``.
    """
    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )

    posts = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
        # tqdm to track progress; results are consumed in submission order.
        for future in tqdm.tqdm(futures, total=len(futures)):
            post = future.result()
            if post is not None:
                posts.append(post)

    df = pd.DataFrame(posts)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    # An empty result has no "tags" column at all, so only convert when the
    # column exists instead of failing with an AttributeError.
    if "tags" in df.columns:
        df["tags"] = df["tags"].apply(
            lambda tags: "; ".join(tags) if tags is not None else None
        )
    return df
def main():
    """Scrape the next batch of posts and append them to the SQLite database.

    Configuration is read from environment variables:

    * ``NUM_THREADS`` - worker threads for the scrape (default 8).
    * ``NUM_SCRAPES`` - number of consecutive ids to try (default 100).
    * ``DATABASE_LOCATION`` - path of the SQLite file
      (default ``/data/knack.sqlite``).

    Scraping resumes after the highest id stored in the ``posts`` table, or
    starts at id 0 when the table is missing or empty.
    """
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    # One connection for the whole run, always closed in the finally below.
    con = sqlite3.connect(database_location)
    try:
        max_id_in_db = None
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]

        # MAX(id) is NULL for an empty table, so treat that like a missing one.
        if max_id_in_db is None:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1
        else:
            logger.info("Got max id %d!", max_id_in_db)

        # run_downloads treats max_id as exclusive, so this covers exactly
        # n_scrapes ids.
        min_id = max_id_in_db + 1
        df = run_downloads(
            min_id=min_id,
            max_id=min_id + n_scrapes,
            num_threads=num_threads,
        )

        if df.empty:
            # Nothing to store; writing an empty, column-less frame could
            # create a "posts" table with the wrong schema.
            # NOTE(review): the next run will retry the same id range - a gap
            # of n_scrapes or more unused ids would stall the scraper.
            logger.info("no new posts found - nothing to store")
            return

        # The connection context manager commits on success and rolls back
        # on error.
        with con:
            df.to_sql("posts", con, if_exists="append")
    finally:
        con.close()


if __name__ == "__main__":
    main()

14
requirements.txt Normal file
View file

@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7