Dockerized Scraper

- Implements a Dockerized version of the scraper
- Atomizes the tags and categories columns into separate tables

parent 7cc3d1b7e4
commit bcd210ce01

5 changed files with 201 additions and 42 deletions
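The "atomized" tags and categories end up as lookup tables (tags, categories) plus mapping tables (posttags, postcategories) next to posts; see main.py below. A minimal read-back sketch, not part of this commit, using the table and column names from that code and the database path from the README's .env example:

```
import sqlite3

con = sqlite3.connect('./data/knack.sqlite')  # path taken from the .env example

# Resolve each post's tags through the posttags mapping table
query = """
SELECT p.id, t.tag
FROM posts AS p
JOIN posttags AS pt ON pt.post_id = p.id
JOIN tags AS t ON t.id = pt.tag_id
"""
for post_id, tag in con.execute(query):
    print(post_id, tag)

con.close()
```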
Dockerfile (18 lines changed)

@@ -7,9 +7,21 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+COPY .env .
+
 RUN apt update -y
-RUN apt install -y cron
-COPY crontab .
-RUN crontab crontab
+RUN apt install -y cron locales
 
 COPY main.py .
+
+ENV PYTHONUNBUFFERED=1
+ENV LANG=de_DE.UTF-8
+ENV LC_ALL=de_DE.UTF-8
+
+# Create cron job that runs every 10 minutes with environment variables
+RUN echo "*/10 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
+RUN chmod 0644 /etc/cron.d/knack-scraper
+RUN crontab /etc/cron.d/knack-scraper
+
+# Start cron in foreground
+CMD ["cron", "-f"]
README.md (18 lines changed)

@@ -0,0 +1,18 @@
+Knack-Scraper does exactly what its name suggests.
+Knack-Scraper scrapes knack.news and writes to an SQLite
+database for later usage.
+
+## Example for .env
+
+```
+NUM_THREADS=8
+NUM_SCRAPES=100
+DATABASE_LOCATION='./data/knack.sqlite'
+```
+
+## Run once
+
+```
+python main.py
+```
+
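For context, the variables in this .env example map directly onto the environment lookups in main.py (loaded via load_dotenv). A minimal sketch of how they are consumed, with the defaults from this commit:

```
import os
from dotenv import load_dotenv

load_dotenv()  # picks up NUM_THREADS, NUM_SCRAPES, DATABASE_LOCATION from .env

num_threads = int(os.environ.get("NUM_THREADS", 8))
n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
print(num_threads, n_scrapes, database_location)
```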
crontab (1 line changed)

@@ -1 +0,0 @@
-5 4 * * * python /app/main.py
main.py (183 lines changed)

@@ -1,25 +1,34 @@
 #! python3
-import locale
 import logging
 import os
 import sqlite3
-import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
+import sys
+
+from dotenv import load_dotenv
 import pandas as pd
 import requests
 import tqdm
 from bs4 import BeautifulSoup
 
-logger = logging.getLogger("knack-scraper")
-# ch = logging.StreamHandler()
-# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-# ch.setFormatter(formatter)
-# ch.setLevel(logging.INFO)
-# logger.addHandler(ch)
+load_dotenv()
+
+if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
+    logging_level = logging.INFO
+else:
+    logging_level = logging.DEBUG
+
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-scraper")
 
 
 def table_exists(tablename: str, con: sqlite3.Connection):
     query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
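A side note on the LOGGING_LEVEL switch added above: it is read from the environment (and therefore from .env via load_dotenv), and any value other than the literal INFO currently enables DEBUG. A tiny sketch, not part of the commit:

```
import logging
import os

# Mirrors the switch in main.py: only 'INFO' keeps INFO,
# every other value (e.g. LOGGING_LEVEL=DEBUG in .env) falls through to DEBUG.
logging_level = logging.INFO if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO' else logging.DEBUG
print(logging.getLevelName(logging_level))
```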
@@ -39,19 +48,16 @@ def download(id: int):
     if not (200 <= res.status_code <= 300):
         return
 
-    logger.info("Found promising page with id %d!", id)
+    logger.debug("Found promising page with id %d!", id)
 
     content = res.content
     soup = BeautifulSoup(content, "html.parser")
-    date_format = "%d. %B %Y"
 
-    # TODO FIXME: this fails inside the docker container
-    locale.setlocale(locale.LC_TIME, "de_DE")
     pC = soup.find("div", {"class": "postContent"})
 
     if pC is None:
         # not a normal post
-        logger.info(
+        logger.debug(
             "Page with id %d does not have a .pageContent-div. Skipping for now.", id
         )
         return
@@ -63,9 +69,13 @@ def download(id: int):
     # these fields are possible but not required
     # TODO: cleanup
     try:
-        date_string = pC.find("span", {"class": "singledate"}).text
-        parsed_date = datetime.strptime(date_string, date_format)
-    except AttributeError:
+        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
+        day = int(date_parts[0][:-1])
+        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
+        month = months[date_parts[1]]
+        year = int(date_parts[2])
+        parsed_date = datetime(year, month, day)
+    except Exception:
         parsed_date = None
 
     try:
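This replaces the locale-dependent strptime (the part that failed inside the container) with a hand-rolled parse of German dates such as "12. März 2024". A standalone sketch of the same logic; the helper name and example string are invented:

```
from datetime import datetime

# Month table mirrors the one added to main.py
MONTHS = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6,
          'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}

def parse_german_date(text: str) -> datetime:
    day_part, month_name, year_part = text.split(' ')
    return datetime(int(year_part), MONTHS[month_name], int(day_part[:-1]))  # drop the trailing '.' from the day

print(parse_german_date("12. März 2024"))  # -> 2024-03-12 00:00:00
```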
@@ -75,7 +85,7 @@ def download(id: int):
     try:
         category = pC.find("span", {"class": "categoryInfo"}).find_all()
-        category = [c.text for c in category]
+        category = [c.text for c in category if c.text != 'Alle Artikel']
         category = ";".join(category)
     except AttributeError:
         category = None
@@ -129,15 +139,79 @@ def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
     # sqlite can't handle lists so let's convert them to a single row csv
     # TODO: make sure our database is properly normalized
-    df = pd.DataFrame(res)
+    postdf = pd.DataFrame(res)
+    tagdf = None
+    posttotagdf = None
+    categorydf = None
+    postcategorydf = None
 
-    return df
+    # Extract and create tags dataframe
+    if not postdf.empty and 'tags' in postdf.columns:
+        # Collect all unique tags
+        all_tags = set()
+        for tags_str in postdf['tags']:
+            if pd.notna(tags_str):
+                tags_list = [tag.strip() for tag in tags_str.split(';')]
+                all_tags.update(tags_list)
+
+        # Create tagdf with id and tag columns
+        if all_tags:
+            all_tags = sorted(list(all_tags))
+            tagdf = pd.DataFrame({
+                'id': range(len(all_tags)),
+                'tag': all_tags
+            })
+
+            # Create posttotagdf mapping table
+            rows = []
+            for post_id, tags_str in zip(postdf['id'], postdf['tags']):
+                if pd.notna(tags_str):
+                    tags_list = [tag.strip() for tag in tags_str.split(';')]
+                    for tag_text in tags_list:
+                        tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'tag_id': tag_id})
+
+            if rows:
+                posttotagdf = pd.DataFrame(rows)
+
+    # Extract and create categories dataframe
+    if not postdf.empty and 'category' in postdf.columns:
+        # Collect all unique categories
+        all_categories = set()
+        for category_str in postdf['category']:
+            if pd.notna(category_str):
+                category_list = [cat.strip() for cat in category_str.split(';')]
+                all_categories.update(category_list)
+
+        # Create categorydf with id and category columns
+        if all_categories:
+            all_categories = sorted(list(all_categories))
+            categorydf = pd.DataFrame({
+                'id': range(len(all_categories)),
+                'category': all_categories
+            })
+
+            # Create postcategorydf mapping table
+            rows = []
+            for post_id, category_str in zip(postdf['id'], postdf['category']):
+                if pd.notna(category_str):
+                    category_list = [cat.strip() for cat in category_str.split(';')]
+                    for category_text in category_list:
+                        category_id = categorydf[categorydf['category'] == category_text]['id'].values[0]
+                        rows.append({'post_id': post_id, 'category_id': category_id})
+
+            if rows:
+                postcategorydf = pd.DataFrame(rows)
+
+    return postdf, tagdf, posttotagdf, categorydf, postcategorydf
 
 
 def main():
     num_threads = int(os.environ.get("NUM_THREADS", 8))
     n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
-    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
+    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
+
+    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")
 
     con = sqlite3.connect(database_location)
     with con:
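To make the new return value concrete, here is a tiny worked example of the tag normalization in run_downloads, using invented sample data (the category handling is analogous):

```
import pandas as pd

# Invented sample input: two posts with a semicolon-joined tags column
postdf = pd.DataFrame({'id': [1, 2], 'tags': ['Politik;Kultur', 'Kultur']})

# Unique tags become their own table ...
all_tags = sorted({t.strip() for s in postdf['tags'] if pd.notna(s) for t in s.split(';')})
tagdf = pd.DataFrame({'id': range(len(all_tags)), 'tag': all_tags})

# ... and a mapping table links posts to tag ids
rows = []
for post_id, tags_str in zip(postdf['id'], postdf['tags']):
    for tag_text in (t.strip() for t in tags_str.split(';')):
        tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0]
        rows.append({'post_id': post_id, 'tag_id': tag_id})
posttotagdf = pd.DataFrame(rows)

print(tagdf)        # id 0 -> Kultur, id 1 -> Politik
print(posttotagdf)  # (post_id, tag_id): (1, 1), (1, 0), (2, 0)
```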
@@ -155,12 +229,77 @@ def main():
         max_id_in_db = -1
 
     con = sqlite3.connect(database_location)
-    df = run_downloads(
+    postdf, tagdf, posttotagdf, categorydf, postcategorydf = run_downloads(
         min_id=max_id_in_db + 1,
         max_id=max_id_in_db + n_scrapes,
         num_threads=num_threads,
     )
-    df.to_sql("posts", con, if_exists="append")
+    postdf.to_sql("posts", con, if_exists="append")
+
+    # Handle tags dataframe merging and storage
+    if tagdf is not None and not tagdf.empty:
+        # Check if tags table already exists
+        if table_exists("tags", con):
+            # Read existing tags from database
+            existing_tagdf = pd.read_sql("SELECT id, tag FROM tags", con)
+
+            # Merge new tags with existing tags, avoiding duplicates
+            merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
+            merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first')
+            merged_tagdf = merged_tagdf.reset_index(drop=True)
+            merged_tagdf['id'] = range(len(merged_tagdf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE tags")
+            con.commit()
+            merged_tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+            # Update tag_id references in posttotagdf
+            if posttotagdf is not None and not posttotagdf.empty:
+                #tag_mapping = dict(zip(tagdf['tag'], tagdf['id']))
+                posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
+                    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
+                )
+        else:
+            # First time creating tags table
+            tagdf.to_sql("tags", con, if_exists="append", index=False)
+
+        # Store posttags (post to tags mapping)
+        if posttotagdf is not None and not posttotagdf.empty:
+            posttotagdf.to_sql("posttags", con, if_exists="append", index=False)
+
+    # Handle categories dataframe merging and storage
+    if categorydf is not None and not categorydf.empty:
+        # Check if categories table already exists
+        if table_exists("categories", con):
+            # Read existing categories from database
+            existing_categorydf = pd.read_sql("SELECT id, category FROM categories", con)
+
+            # Merge new categories with existing categories, avoiding duplicates
+            merged_categorydf = pd.concat([existing_categorydf, categorydf], ignore_index=False)
+            merged_categorydf = merged_categorydf.drop_duplicates(subset=['category'], keep='first')
+            merged_categorydf = merged_categorydf.reset_index(drop=True)
+            merged_categorydf['id'] = range(len(merged_categorydf))
+
+            # Drop the old table and insert the merged data
+            con.execute("DROP TABLE categories")
+            con.commit()
+            merged_categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+            # Update category_id references in postcategorydf
+            if postcategorydf is not None and not postcategorydf.empty:
+                postcategorydf['category_id'] = postcategorydf['category_id'].map(
+                    lambda old_id: merged_categorydf[merged_categorydf['category'] == categorydf.loc[old_id, 'category']]['id'].values[0]
+                )
+        else:
+            # First time creating categories table
+            categorydf.to_sql("categories", con, if_exists="append", index=False)
+
+        # Store postcategories (post to categories mapping)
+        if postcategorydf is not None and not postcategorydf.empty:
+            postcategorydf.to_sql("postcategories", con, if_exists="append", index=False)
+
+    logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")
 
 
 if __name__ == "__main__":
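The trickiest part of the storage step above is re-pointing the freshly scraped tag ids at the merged tags table. An isolated sketch with invented sample frames (category handling follows the same pattern):

```
import pandas as pd

existing_tagdf = pd.DataFrame({'id': [0, 1], 'tag': ['Kultur', 'Politik']})   # already in the database
tagdf = pd.DataFrame({'id': [0, 1], 'tag': ['Politik', 'Sport']})             # freshly scraped
posttotagdf = pd.DataFrame({'post_id': [7, 7], 'tag_id': [0, 1]})             # ids refer to tagdf

# Merge, drop duplicate tags, and renumber ids
merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first').reset_index(drop=True)
merged_tagdf['id'] = range(len(merged_tagdf))
# merged_tagdf: 0 Kultur, 1 Politik, 2 Sport

# Re-point the mapping table at the merged ids, via the tag text
posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
)
# post 7 now points at Politik -> 1 and Sport -> 2 in the merged table
```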
requirements.txt

@@ -1,14 +1,5 @@
-beautifulsoup4==4.12.2
-certifi==2023.7.22
-charset-normalizer==3.3.0
-idna==3.4
-numpy==1.26.1
-pandas==2.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-requests==2.31.0
-six==1.16.0
-soupsieve==2.5
-tqdm==4.66.1
-tzdata==2023.3
-urllib3==2.0.7
+pandas
+requests
+tqdm
+bs4
+dotenv