initial commit
commit 7edf451e2e
6 changed files with 200 additions and 0 deletions

.gitignore (vendored), new file, 3 additions
@@ -0,0 +1,3 @@
data/
venv/
.DS_STORE

Dockerfile, new file, 15 additions
@@ -0,0 +1,15 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN apt update -y
RUN apt install -y cron
COPY crontab .
RUN crontab crontab

COPY main.py .

Makefile, new file, 2 additions
@@ -0,0 +1,2 @@
build:
	docker build -t knack-scraper .

crontab, new file, 1 addition
@@ -0,0 +1 @@
5 4 * * * python /app/main.py

main.py, new executable file, 165 additions
@@ -0,0 +1,165 @@
#! python
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.info("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"

    # TODO FIXME: this fails inside the docker container
    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = pC.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find().text
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in tqdm.tqdm(
            futures, total=max_id - min_id
        ):  # tqdm to track progress
            post = future.result()
            if post is not None:
                res.append(post)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)
    df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)

    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        post_table_exists = table_exists("posts", con)

        if post_table_exists:
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
    df.to_sql("posts", con, if_exists="append")


if __name__ == "__main__":
    main()
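
Note on the "# TODO FIXME" in download(): locale.setlocale(locale.LC_TIME, "de_DE") tends to fail inside the container because slim Debian-based Python images typically ship without the de_DE locale generated, and setlocale also mutates process-global state from worker threads. A minimal locale-independent sketch, assuming the site's dates follow the "%d. %B %Y" pattern with German month names; the GERMAN_MONTHS table and parse_german_date helper are illustrative and not part of this commit:

from datetime import datetime

# Assumed helper, not in the commit: map German month names to numbers
# so no locale has to be installed or switched at runtime.
GERMAN_MONTHS = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4,
    "Mai": 5, "Juni": 6, "Juli": 7, "August": 8,
    "September": 9, "Oktober": 10, "November": 11, "Dezember": 12,
}

def parse_german_date(date_string: str) -> datetime:
    # e.g. "5. Oktober 2023" -> datetime(2023, 10, 5)
    day, month_name, year = date_string.split()
    return datetime(int(year), GERMAN_MONTHS[month_name], int(day.rstrip(".")))

Something like this could stand in for the setlocale/strptime pair in download(), but that is a suggestion, not what the commit does.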

requirements.txt, new file, 14 additions
@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7
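
Note on the normalization TODO in run_downloads(): the code joins each post's tags into a single "; "-separated string before calling to_sql, and itself flags "TODO: make sure our database is properly normalized". A rough sketch of one normalized alternative using the standard-library sqlite3 module; the posts/post_tags schema and store_tags helper are assumptions for illustration, not part of the commit:

import sqlite3

# Assumed schema, not in the commit: one row per (post, tag) pair
# instead of a "; "-joined string column.
SCHEMA = """
CREATE TABLE IF NOT EXISTS posts (id INTEGER PRIMARY KEY, title TEXT);
CREATE TABLE IF NOT EXISTS post_tags (
    post_id INTEGER REFERENCES posts(id),
    tag     TEXT,
    PRIMARY KEY (post_id, tag)
);
"""

def store_tags(con: sqlite3.Connection, post_id: int, tags):
    # Insert each tag as its own row; duplicates are skipped via the primary key.
    con.executemany(
        "INSERT OR IGNORE INTO post_tags (post_id, tag) VALUES (?, ?)",
        [(post_id, tag) for tag in (tags or [])],
    )

con = sqlite3.connect(":memory:")
con.executescript(SCHEMA)
with con:
    store_tags(con, 1, ["tag-a", "tag-b"])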