forked from lukaszett/Knack-Scraper
Compare commits
No commits in common. "main" and "main" have entirely different histories.
30 changed files with 199 additions and 4546 deletions
3  .gitignore  vendored
@@ -1,6 +1,3 @@
data/
venv/
experiment/
__pycache__/
.DS_STORE
.env
15  Dockerfile  Normal file
@@ -0,0 +1,15 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN apt update -y
RUN apt install -y cron
COPY crontab .
RUN crontab crontab

COPY main.py .
14  Makefile
@@ -1,12 +1,2 @@
volume:
	docker volume create knack_data

stop:
	docker stop knack-scraper || true
	docker rm knack-scraper || true

up:
	docker compose up -d

down:
	docker compose down
build:
	docker build -t knack-scraper .
18  README.md
@@ -1,18 +0,0 @@
Knack-Scraper does exactly what its name suggests it does.
Knack-Scraper scrapes knack.news and writes to a SQLite
database for later usage.

## Example for .env

```
NUM_THREADS=8
NUM_SCRAPES=100
DATABASE_LOCATION='./data/knack.sqlite'
```

## Run once

```
python main.py
```

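For reference, a minimal sketch of reading the scraped posts back out of the database configured via `DATABASE_LOCATION` above. The `posts` table and the columns used here match what the new `main.py` below writes; the query itself is only illustrative:

```python
import sqlite3

import pandas as pd

# Path must match DATABASE_LOCATION from the .env example above.
con = sqlite3.connect("./data/knack.sqlite")

# "posts" is the table the scraper appends to on each run.
recent = pd.read_sql(
    "SELECT id, title, author, date, url FROM posts ORDER BY id DESC LIMIT 10",
    con,
)
print(recent)
con.close()
```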
1  crontab  Normal file
@@ -0,0 +1 @@
5 4 * * * python /app/main.py
@@ -1,60 +0,0 @@
services:
  scraper:
    build:
      context: ./scrape
      dockerfile: Dockerfile
    image: knack-scraper
    container_name: knack-scraper
    env_file:
      - scrape/.env
    volumes:
      - knack_data:/data
    restart: unless-stopped

  transform:
    build:
      context: ./transform
      dockerfile: Dockerfile
    image: knack-transform
    container_name: knack-transform
    env_file:
      - transform/.env
    volumes:
      - knack_data:/data
      - models:/models
    restart: unless-stopped

  explorer:
    build:
      context: ./explorer
      dockerfile: Dockerfile
    image: knack-explorer
    container_name: knack-explorer
    environment:
      - PORT=4173
      - SQLITE_PATH=/data/knack.sqlite
    volumes:
      - knack_data:/data:ro
    ports:
      - "4173:4173"
    depends_on:
      - transform
    restart: unless-stopped

  sqlitebrowser:
    image: lscr.io/linuxserver/sqlitebrowser:latest
    container_name: sqlitebrowser
    environment:
      - PUID=1000
      - PGID=1000
      - TZ=Etc/UTC
    volumes:
      - knack_data:/data
    ports:
      - "3000:3000" # noVNC web UI
      - "3001:3001" # VNC server
    restart: unless-stopped

volumes:
  knack_data:
  models:
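The explorer service above only needs read access to the shared `knack_data` volume; it receives the database path via `SQLITE_PATH`. As an illustration (the explorer's actual code is not part of this diff), a reader of that volume might open the database in read-only mode like this:

```python
import os
import sqlite3

# SQLITE_PATH is injected via the compose file's environment block.
db_path = os.environ.get("SQLITE_PATH", "/data/knack.sqlite")

# mode=ro mirrors the :ro volume mount, so this process can never write.
con = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
count = con.execute("SELECT COUNT(*) FROM posts").fetchone()[0]
print(f"{count} posts available")
con.close()
```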
167  main.py  Executable file
@@ -0,0 +1,167 @@
#! python3
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.info("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"

    # TODO FIXME: this fails inside the docker container
    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = pC.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category]
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in tqdm.tqdm(
            futures, total=max_id - min_id
        ):  # tqdm to track progress
            post = future.result()
            if post is not None:
                res.append(post)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)

    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        post_table_exists = table_exists("posts", con)

        if post_table_exists:
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
    df.to_sql("posts", con, if_exists="append")


if __name__ == "__main__":
    main()
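The `TODO FIXME` in `download()` refers to `locale.setlocale(locale.LC_TIME, "de_DE")`, which fails when no German locale is generated in the container. The deleted `scrape/main.py` further down in this diff avoids the locale entirely with an explicit month lookup; a locale-independent variant of the date parsing could look like this (sketch only):

```python
from datetime import datetime

# German month names mapped to month numbers, so no de_DE locale is needed.
GERMAN_MONTHS = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4, "Mai": 5, "Juni": 6,
    "Juli": 7, "August": 8, "September": 9, "Oktober": 10,
    "November": 11, "Dezember": 12,
}


def parse_german_date(date_string: str) -> datetime:
    """Parse dates like '5. April 2023' without touching the process locale."""
    day, month_name, year = date_string.split(" ")
    return datetime(int(year), GERMAN_MONTHS[month_name], int(day.rstrip(".")))
```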
14  requirements.txt  Normal file
@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7
@@ -1,29 +0,0 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

#COPY /data/knack.sqlite /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

COPY main.py .

ENV PYTHONUNBUFFERED=1
ENV LANG=de_DE.UTF-8
ENV LC_ALL=de_DE.UTF-8

# Create cron job that runs daily at 04:05
RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
RUN chmod 0644 /etc/cron.d/knack-scraper
RUN crontab /etc/cron.d/knack-scraper

# Start cron in foreground
CMD ["cron", "-f"]
262  scrape/main.py
@@ -1,262 +0,0 @@
#! python3
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import sys

from dotenv import load_dotenv
import pandas as pd
import requests
from bs4 import BeautifulSoup

load_dotenv()

if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-scraper")


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def split_semicolon_list(value: str):
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
    """Extract unique dimension values and post-to-dimension mappings from a column."""
    if postdf.empty or field_name not in postdf.columns:
        return None, None

    values = set()
    mapping_rows = []

    for post_id, raw in zip(postdf['id'], postdf[field_name]):
        items = split_semicolon_list(raw)
        for item in items:
            values.add(item)
            mapping_rows.append({'post_id': post_id, dim_col: item})

    if not values:
        return None, None

    dim_df = pd.DataFrame({
        'id': range(len(values)),
        dim_col: sorted(values),
    })
    map_df = pd.DataFrame(mapping_rows)
    return dim_df, map_df


def store_dimension_and_mapping(
    con: sqlite3.Connection,
    dim_df: pd.DataFrame | None,
    map_df: pd.DataFrame | None,
    table_name: str,
    dim_col: str,
    mapping_table: str,
    mapping_id_col: str,
):
    """Persist a dimension table and its mapping table, merging with existing values."""
    if dim_df is None or dim_df.empty:
        return

    if table_exists(table_name, con):
        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
        merged = pd.concat([existing, dim_df], ignore_index=True)
        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
        merged['id'] = range(len(merged))
    else:
        merged = dim_df.copy()

    # Replace table with merged content
    merged.to_sql(table_name, con, if_exists="replace", index=False)

    if map_df is None or map_df.empty:
        return

    value_to_id = dict(zip(merged[dim_col], merged['id']))
    map_df = map_df.copy()
    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
    map_df = map_df[['post_id', mapping_id_col]].dropna()
    map_df.to_sql(mapping_table, con, if_exists="append", index=False)


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")

    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.debug(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
        day = int(date_parts[0][:-1])
        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
        month = months[date_parts[1]]
        year = int(date_parts[2])
        parsed_date = datetime(year, month, day)
    except Exception:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
        "is_cleaned": False
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in futures:
            post = future.result()
            if post is not None:
                res.append(post)

    postdf = pd.DataFrame(res)
    return postdf


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1

        postdf = run_downloads(
            min_id=max_id_in_db + 1,
            max_id=max_id_in_db + n_scrapes,
            num_threads=num_threads,
        )

        # Drop category and tags columns as they're stored in separate tables
        postdf = postdf.drop(columns=['category', 'tags'])
        postdf.to_sql("posts", con, if_exists="append", index=False)

        # Tags
        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
        store_dimension_and_mapping(
            con,
            tag_dim,
            tag_map,
            table_name="tags",
            dim_col="tag",
            mapping_table="posttags",
            mapping_id_col="tag_id",
        )

        # Categories
        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
        store_dimension_and_mapping(
            con,
            category_dim,
            category_map,
            table_name="categories",
            dim_col="category",
            mapping_table="postcategories",
            mapping_id_col="category_id",
        )

        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


if __name__ == "__main__":
    main()
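Given the `tags`/`posttags` and `categories`/`postcategories` tables that `store_dimension_and_mapping` maintains above, posts can be joined back to their tags roughly as follows (sketch; table and column names follow the code above):

```python
import sqlite3

import pandas as pd

con = sqlite3.connect("/data/knack.sqlite")

# Resolve each post's tags through the posttags mapping table.
query = """
SELECT p.id, p.title, t.tag
FROM posts p
JOIN posttags pt ON pt.post_id = p.id
JOIN tags t ON t.id = pt.tag_id
ORDER BY p.id
"""
post_tags = pd.read_sql(query, con)
print(post_tags.head())
con.close()
```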
@@ -1,4 +0,0 @@
pandas
requests
bs4
dotenv
@@ -1,4 +0,0 @@
LOGGING_LEVEL=INFO
DB_PATH=/data/knack.sqlite
MAX_CLEANED_POSTS=1000
COMPUTE_DEVICE=mps
@@ -1,50 +0,0 @@
FROM python:3.12-slim

RUN mkdir -p /app /data /models

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    gfortran \
    libopenblas-dev \
    liblapack-dev \
    pkg-config \
    curl \
    && rm -rf /var/lib/apt/lists/*

ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1

ENV GTE_MODEL_ID=thenlper/gte-large
ENV GTE_MODEL_PATH=/models/thenlper/gte-large

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

# Ensure GLiNER helper scripts are available
COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
# Ensure GTE helper scripts are available
COPY ensure_gte_model.sh /usr/local/bin/ensure_gte_model.sh
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_gte_model.sh /usr/local/bin/entrypoint.sh

COPY *.py .

# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
# Testing every 30 Minutes */30 * * * *
RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
RUN chmod 0644 /etc/cron.d/knack-transform
RUN crontab /etc/cron.d/knack-transform

# Persist models between container runs
VOLUME /models

CMD ["/usr/local/bin/entrypoint.sh"]
#CMD ["python", "main.py"]
@@ -1,67 +0,0 @@
# Knack Transform

Data transformation pipeline for the Knack scraper project.

## Overview

This folder contains the transformation logic that processes data from the SQLite database. It runs on a scheduled basis (every weekend) via cron.

The pipeline supports **parallel execution** of independent transform nodes, allowing you to leverage multi-core processors for faster data transformation.

## Structure

- `base.py` - Abstract base class for transform nodes
- `pipeline.py` - Parallel pipeline orchestration system
- `main.py` - Main entry point and pipeline execution
- `author_node.py` - NER-based author classification node
- `example_node.py` - Template for creating new nodes
- `Dockerfile` - Docker image configuration with cron setup
- `requirements.txt` - Python dependencies

## Transform Nodes

Transform nodes inherit from `TransformNode` and implement the `run` method:

```python
from base import TransformNode, TransformContext
import sqlite3

class MyTransform(TransformNode):
    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        df = context.get_dataframe()

        # Transform logic here
        transformed_df = df.copy()
        # ... your transformations ...

        # Optionally write back to database
        transformed_df.to_sql("my_table", con, if_exists="replace", index=False)

        return TransformContext(transformed_df)
```

## Configuration

Copy `.env.example` to `.env` and configure:

- `LOGGING_LEVEL` - Log level (INFO or DEBUG)
- `DB_PATH` - Path to SQLite database

## Running

### With Docker

```bash
docker build -t knack-transform .
docker run -v $(pwd)/data:/data knack-transform
```

### Locally

```bash
python main.py
```

## Cron Schedule

The Docker container runs the transform pipeline every Sunday at 3 AM.
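For orientation, a hypothetical sketch of wiring such a node into the pipeline described above. The `Pipeline` class and its `add_node`/`run` methods are assumptions here; the real orchestration lives in `pipeline.py`, which is not shown in this diff:

```python
import sqlite3

from pipeline import Pipeline          # assumed orchestration class from pipeline.py
from my_transform import MyTransform   # the node sketched in the README above

con = sqlite3.connect("/data/knack.sqlite")

pipeline = Pipeline()             # assumed constructor
pipeline.add_node(MyTransform())  # assumed registration method
context = pipeline.run(con)       # assumed entry point returning a TransformContext
print(context.get_dataframe().head())
```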
@@ -1,303 +0,0 @@
2026-01-18 15:11:40,253 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:11:40,254 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 0 41 Über uns None ... 0 0.0 0.0 0.0
|
||||
1 1 52 Kontakt None ... 0 0.0 0.0 0.0
|
||||
2 2 99 Safety First None ... 0 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... chakalaka_161 ... 0 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... anonym ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... Soli Antifa Ost ... 0 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... anonym ... 0 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... anonym ... 0 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab kreuzer online ... 0 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... Kira Ayyadi ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:11:54,702 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:11:54,703 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 0 41 Über uns ... 0.0 0.0 0.0
|
||||
1 1 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 2 99 Safety First ... 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... ... 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... ... 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... ... 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab ... 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... LVZ ... 0 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... Michael Freitag ... 0 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht Konrad Litschko & Andreas Speit ... 0 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... LeipzigBesetzen ... 0 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... interkiezionale ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... G19 und BikeKitchen Freiburg ... 0 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... MDR ... 0 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... anonyme*r Mensch aus Leipzig ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:15:34,292 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:34,293 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... ... 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... ... 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht ... 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... ... 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... ... 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... ... 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... ... 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... ... 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:15:39,113 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:15:39,113 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:34,425 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:26:34,426 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:26:44,105 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:26:44,105 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - Stored 100 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 5.537961 3.468988 3.757369
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 4.980662 1.629360 3.269084
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 1.055900 2.460792 2.076612
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 4.128685 5.247468 4.904186
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 5.383136 2.068369 4.368077
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 5.897925 5.151130 3.241154
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 2.919075 5.341392 4.516587
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 4.852142 1.179675 4.241960
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 5.231822 4.983705 3.941314
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.999596 1.613693 2.039646
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:28:21,676 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:28:43,419 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:28:43,420 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:30:35,756 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:35,757 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:30:42,373 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:30:42,416 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:30:42,418 - knack-transform - INFO - Found 3678 valid embeddings out of 3678 rows
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Embeddings matrix shape: (3678, 192)
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:30:53,542 - knack-transform - INFO - UMAP transformation complete. Output shape: (3678, 3)
|
||||
2026-01-18 15:30:53,542 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:31:00,254 - knack-transform - INFO - Stored 3678 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:27,488 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:36:25,468 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:37:37,881 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:38:08,872 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:39:23,498 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:39:52,241 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:41:23,688 - knack-transform - INFO - 3D plot displayed
|
||||
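The repeated `Failed to save UMAP model to None` errors in the log above come from running the UMAP node with `model_path=None` and still attempting to persist the fitted reducer. A defensive guard along these lines would avoid the error (sketch only; the node implementation itself is not part of this diff):

```python
import logging
import pickle

logger = logging.getLogger("knack-transform")


def save_reducer(reducer, model_path: str | None) -> None:
    """Persist a fitted UMAP reducer only when a target path is configured."""
    if not model_path:
        logger.info("No model_path configured; skipping UMAP model persistence")
        return
    with open(model_path, "wb") as fh:
        pickle.dump(reducer, fh)
    logger.info("Saved UMAP model to %s", model_path)
```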
@@ -1,469 +0,0 @@
"""Author classification transform node using NER."""
|
||||
import os
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import fuzzysearch
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from gliner import GLiNER
|
||||
import torch
|
||||
GLINER_AVAILABLE = True
|
||||
except ImportError:
|
||||
GLINER_AVAILABLE = False
|
||||
logging.warning("GLiNER not available. Install with: pip install gliner")
|
||||
|
||||
class NerAuthorNode(TransformNode):
|
||||
"""Transform node that extracts and classifies authors using NER.
|
||||
|
||||
Creates two tables:
|
||||
- authors: stores unique authors with their type (Person, Organisation, etc.)
|
||||
- post_authors: maps posts to their authors
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str = "urchade/gliner_multi-v2.1",
|
||||
model_path: str = None,
|
||||
threshold: float = 0.5,
|
||||
max_workers: int = 64,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the AuthorNode.
|
||||
|
||||
Args:
|
||||
model_name: GLiNER model to use
|
||||
model_path: Optional local path to a downloaded GLiNER model
|
||||
threshold: Confidence threshold for entity predictions
|
||||
max_workers: Number of parallel workers for prediction
|
||||
device: Device to run model on ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.model_path = model_path or os.environ.get('GLINER_MODEL_PATH')
|
||||
self.threshold = threshold
|
||||
self.max_workers = max_workers
|
||||
self.device = device
|
||||
self.model = None
|
||||
self.labels = ["Person", "Organisation", "Email", "Newspaper", "NGO"]
|
||||
|
||||
def _setup_model(self):
|
||||
"""Initialize the NER model."""
|
||||
if not GLINER_AVAILABLE:
|
||||
raise ImportError("GLiNER is required for AuthorNode. Install with: pip install gliner")
|
||||
|
||||
model_source = None
|
||||
if self.model_path:
|
||||
if os.path.exists(self.model_path):
|
||||
model_source = self.model_path
|
||||
logger.info(f"Loading GLiNER model from local path: {self.model_path}")
|
||||
else:
|
||||
logger.warning(f"GLINER_MODEL_PATH '{self.model_path}' not found; falling back to hub model {self.model_name}")
|
||||
|
||||
if model_source is None:
|
||||
model_source = self.model_name
|
||||
logger.info(f"Loading GLiNER model from hub: {self.model_name}")
|
||||
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
).to('cuda', dtype=torch.float16)
|
||||
elif self.device == "mps" and torch.backends.mps.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
).to('mps', dtype=torch.float16)
|
||||
else:
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
)
|
||||
|
||||
logger.info("Model loaded successfully")
|
||||
|
||||
def _predict(self, text_data: dict):
|
||||
"""Predict entities for a single author text.
|
||||
|
||||
Args:
|
||||
text_data: Dict with 'author' and 'id' keys
|
||||
|
||||
Returns:
|
||||
Tuple of (predictions, post_id) or None
|
||||
"""
|
||||
if text_data is None or text_data.get('author') is None:
|
||||
return None
|
||||
|
||||
predictions = self.model.predict_entities(
|
||||
text_data['author'],
|
||||
self.labels,
|
||||
threshold=self.threshold
|
||||
)
|
||||
return predictions, text_data['id']
|
||||
|
||||
def _classify_authors(self, posts_df: pd.DataFrame):
|
||||
"""Classify all authors in the posts dataframe.
|
||||
|
||||
Args:
|
||||
posts_df: DataFrame with 'id' and 'author' columns
|
||||
|
||||
Returns:
|
||||
List of dicts with 'text', 'label', 'id' keys
|
||||
"""
|
||||
if self.model is None:
|
||||
self._setup_model()
|
||||
|
||||
# Prepare input data
|
||||
authors_data = []
|
||||
for idx, row in posts_df.iterrows():
|
||||
if pd.notna(row['author']):
|
||||
authors_data.append({
|
||||
'author': row['author'],
|
||||
'id': row['id']
|
||||
})
|
||||
|
||||
logger.info(f"Classifying {len(authors_data)} authors")
|
||||
|
||||
results = []
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
futures = [executor.submit(self._predict, data) for data in authors_data]
|
||||
|
||||
for future in futures:
|
||||
result = future.result()
|
||||
if result is not None:
|
||||
predictions, post_id = result
|
||||
for pred in predictions:
|
||||
results.append({
|
||||
'text': pred['text'],
|
||||
'label': pred['label'],
|
||||
'id': post_id
|
||||
})
|
||||
|
||||
logger.info(f"Classification complete. Found {len(results)} author entities")
|
||||
return results
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create authors and post_authors tables if they don't exist."""
|
||||
logger.info("Creating authors tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS authors (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT,
|
||||
type TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS post_authors (
|
||||
post_id INTEGER,
|
||||
author_id INTEGER,
|
||||
PRIMARY KEY (post_id, author_id),
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id),
|
||||
FOREIGN KEY (author_id) REFERENCES authors(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _store_authors(self, con: sqlite3.Connection, results: list):
|
||||
"""Store classified authors and their mappings.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
results: List of classification results
|
||||
"""
|
||||
if not results:
|
||||
logger.info("No authors to store")
|
||||
return
|
||||
|
||||
# Convert results to DataFrame
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
# Get unique authors with their types
|
||||
unique_authors = results_df[['text', 'label']].drop_duplicates()
|
||||
unique_authors.columns = ['name', 'type']
|
||||
|
||||
# Get existing authors
|
||||
existing_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
|
||||
# Find new authors to insert
|
||||
if not existing_authors.empty:
|
||||
new_authors = unique_authors[~unique_authors['name'].isin(existing_authors['name'])]
|
||||
else:
|
||||
new_authors = unique_authors
|
||||
|
||||
if not new_authors.empty:
|
||||
logger.info(f"Inserting {len(new_authors)} new authors")
|
||||
new_authors.to_sql('authors', con, if_exists='append', index=False)
|
||||
|
||||
# Get all authors with their IDs
|
||||
all_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
name_to_id = dict(zip(all_authors['name'], all_authors['id']))
|
||||
|
||||
# Create post_authors mappings
|
||||
mappings = []
|
||||
for _, row in results_df.iterrows():
|
||||
author_id = name_to_id.get(row['text'])
|
||||
if author_id:
|
||||
mappings.append({
|
||||
'post_id': row['id'],
|
||||
'author_id': author_id
|
||||
})
|
||||
|
||||
if mappings:
|
||||
mappings_df = pd.DataFrame(mappings).drop_duplicates()
|
||||
|
||||
# Clear existing mappings for these posts (optional, depends on your strategy)
|
||||
# post_ids = tuple(mappings_df['post_id'].unique())
|
||||
# con.execute(f"DELETE FROM post_authors WHERE post_id IN ({','.join('?' * len(post_ids))})", post_ids)
|
||||
|
||||
logger.info(f"Creating {len(mappings_df)} post-author mappings")
|
||||
mappings_df.to_sql('post_authors', con, if_exists='append', index=False)
|
||||
|
||||
con.commit()
|
||||
logger.info("Authors and mappings stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the author classification transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing posts dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with classified authors dataframe
|
||||
"""
|
||||
logger.info("Starting AuthorNode transformation")
|
||||
|
||||
posts_df = context.get_dataframe()
|
||||
|
||||
# Ensure required columns exist
|
||||
if 'author' not in posts_df.columns:
|
||||
logger.warning("No 'author' column in dataframe. Skipping AuthorNode.")
|
||||
return context
|
||||
|
||||
# Create tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Classify authors
|
||||
results = self._classify_authors(posts_df)
|
||||
|
||||
# Store results
|
||||
self._store_authors(con, results)
|
||||
|
||||
# Return context with results
|
||||
logger.info("AuthorNode transformation complete")
|
||||
|
||||
return TransformContext(posts_df)
|
||||
|
||||
|
||||
class FuzzyAuthorNode(TransformNode):
|
||||
"""FuzzyAuthorNode
|
||||
|
||||
This Node takes in data and rules of authornames that have been classified already
|
||||
and uses those 'rule' to find more similar fields.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_l_dist: int = 1,):
|
||||
"""Initialize FuzzyAuthorNode.
|
||||
|
||||
Args:
|
||||
max_l_dist: The number of 'errors' that are allowed by the fuzzy search algorithm
|
||||
"""
|
||||
self.max_l_dist = max_l_dist
|
||||
logger.info(f"Initialized FuzzyAuthorNode with max_l_dist={max_l_dist}")
|
||||
|
||||
def _process_data(self, con: sqlite3.Connection, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Retrieve all known authors from the authors table as 'rules'
|
||||
authors_df = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
|
||||
if authors_df.empty:
|
||||
logger.warning("No authors found in database for fuzzy matching")
|
||||
return pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
# Get existing post-author mappings to avoid duplicates
|
||||
existing_mappings = pd.read_sql(
|
||||
"SELECT post_id, author_id FROM post_authors", con
|
||||
)
|
||||
existing_post_ids = set(existing_mappings['post_id'].unique())
|
||||
|
||||
logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
|
||||
logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
|
||||
|
||||
# Filter to posts without author mappings and with non-null author field
|
||||
if 'author' not in df.columns or 'id' not in df.columns:
|
||||
logger.warning("Missing 'author' or 'id' column in input dataframe")
|
||||
return pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
posts_to_process = df[
|
||||
(df['id'].notna()) &
|
||||
(df['author'].notna()) &
|
||||
(~df['id'].isin(existing_post_ids))
|
||||
]
|
||||
|
||||
logger.info(f"Processing {len(posts_to_process)} posts for fuzzy matching")
|
||||
|
||||
# Perform fuzzy matching
|
||||
mappings = []
|
||||
for _, post_row in posts_to_process.iterrows():
|
||||
post_id = post_row['id']
|
||||
post_author = str(post_row['author'])
|
||||
|
||||
# Try to find matches against all known author names
|
||||
for _, author_row in authors_df.iterrows():
|
||||
author_id = author_row['id']
|
||||
author_name = str(author_row['name'])
|
||||
# For author names of two characters or fewer, require an exact match (no fuzzy tolerance).
l_dist = self.max_l_dist if len(author_name) > 2 else 0
|
||||
|
||||
# Use fuzzysearch to find matches with allowed errors
|
||||
matches = fuzzysearch.find_near_matches(
|
||||
author_name,
|
||||
post_author,
|
||||
max_l_dist=l_dist,
|
||||
)
|
||||
|
||||
if matches:
|
||||
logger.debug(f"Found fuzzy match: '{author_name}' in '{post_author}' for post {post_id}")
|
||||
mappings.append({
|
||||
'post_id': post_id,
|
||||
'author_id': author_id
|
||||
})
|
||||
# Only take the first match per post to avoid multiple mappings
|
||||
break
|
||||
|
||||
# Create result dataframe
|
||||
result_df = pd.DataFrame(mappings, columns=['post_id', 'author_id']) if mappings else pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
logger.info(f"Processing complete. Found {len(result_df)} fuzzy matches")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
Uses INSERT OR IGNORE to avoid inserting duplicates.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Use INSERT OR IGNORE to handle duplicates (respects PRIMARY KEY constraint)
|
||||
cursor = con.cursor()
|
||||
inserted_count = 0
|
||||
|
||||
for _, row in df.iterrows():
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO post_authors (post_id, author_id) VALUES (?, ?)",
|
||||
(int(row['post_id']), int(row['author_id']))
|
||||
)
|
||||
if cursor.rowcount > 0:
|
||||
inserted_count += 1
|
||||
|
||||
con.commit()
|
||||
logger.info(f"Results stored successfully. Inserted {inserted_count} new mappings, skipped {len(df) - inserted_count} duplicates")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting FuzzyAuthorNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to FuzzyAuthorNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(con, input_df)
|
||||
|
||||
# Store results
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("FuzzyAuthorNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(input_df)
|
||||
|
||||
|
||||
def main():
|
||||
import sys
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
# Connect to database
|
||||
db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
try:
|
||||
# Read posts from database
|
||||
df = pd.read_sql('SELECT * FROM posts;', con)
|
||||
logger.info(f"Loaded {len(df)} posts from database")
|
||||
|
||||
# Create context
|
||||
context = TransformContext(df)
|
||||
|
||||
# Run NerAuthorNode
|
||||
logger.info("Running NerAuthorNode...")
|
||||
ner_node = NerAuthorNode(device="mps")
|
||||
context = ner_node.run(con, context)
|
||||
logger.info("NerAuthorNode complete")
|
||||
|
||||
# Run FuzzyAuthorNode
|
||||
logger.info("Running FuzzyAuthorNode...")
|
||||
fuzzy_node = FuzzyAuthorNode(max_l_dist=1)
|
||||
context = fuzzy_node.run(con, context)
|
||||
logger.info("FuzzyAuthorNode complete")
|
||||
|
||||
logger.info("All author nodes completed successfully!")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during transformation: {e}", exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
con.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,545 +0,0 @@
|
|||
"""Classes of Transformernodes that have to do with
|
||||
text processing.
|
||||
|
||||
- TextEmbeddingNode calculates text embeddings
|
||||
- UmapNode calculates xy coordinates on those vector embeddings
|
||||
- SimilarityNode calculates top n similar posts based on those embeddings
|
||||
using the spectral distance.
|
||||
"""
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
import pickle
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import torch
|
||||
GTE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GTE_AVAILABLE = False
|
||||
logging.warning("GTE not available. Install with pip!")
|
||||
|
||||
try:
|
||||
import umap
|
||||
UMAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
UMAP_AVAILABLE = False
|
||||
logging.warning("UMAP not available. Install with pip install umap-learn!")
|
||||
|
||||
class TextEmbeddingNode(TransformNode):
|
||||
"""Calculates vector embeddings based on a dataframe
|
||||
of posts.
|
||||
"""
|
||||
def __init__(self,
|
||||
model_name: str = "thenlper/gte-large",
|
||||
model_path: str = None,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
model_name: Name of the ML Model to calculate text embeddings
|
||||
model_path: Optional local path to a downloaded embedding model
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
|
||||
self.device = device
|
||||
self.model = None
|
||||
logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
|
||||
|
||||
def _setup_model(self):
|
||||
"""Init the Text Embedding Model."""
|
||||
if not GTE_AVAILABLE:
|
||||
raise ImportError("GTE is required for TextEmbeddingNode. Please install.")
|
||||
|
||||
model_source = None
|
||||
if self.model_path:
|
||||
if os.path.exists(self.model_path):
|
||||
# Check if it's a valid model directory
|
||||
if os.path.exists(os.path.join(self.model_path, 'config.json')):
|
||||
model_source = self.model_path
|
||||
logger.info(f"Loading GTE model from local path: {self.model_path}")
|
||||
else:
|
||||
logger.warning(f"GTE_MODEL_PATH '{self.model_path}' found but missing config.json; Falling back to hub model {self.model_name}")
|
||||
else:
|
||||
logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
|
||||
|
||||
if model_source is None:
|
||||
model_source = self.model_name
|
||||
logger.info(f"Loading GTE model from the hub: {self.model_name}")
|
||||
|
||||
try:
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
|
||||
elif self.device == "mps" and torch.backends.mps.is_available():
|
||||
self.model = SentenceTransformer(model_source).to('mps', dtype=torch.float16)
|
||||
else:
|
||||
self.model = SentenceTransformer(model_source)
|
||||
logger.info(f"Successfully loaded GTE model from: {model_source}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load GTE model from {model_source}: {e}")
|
||||
raise
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
Calculates an embedding as a np.array.
|
||||
Also pickles that array to prepare it to
|
||||
storage in the database.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
if self.model is None:
|
||||
self._setup_model()
|
||||
|
||||
# Compute an embedding vector for each post's text
|
||||
result_df = df.copy()
|
||||
|
||||
result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database using batch updates."""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Convert numpy arrays to float32 bytes for BLOB storage
# (UmapNode reads them back with np.frombuffer(..., dtype=np.float32))
updates = [(row['embedding'].astype(np.float32).tobytes(), row['id']) for _, row in df.iterrows()]
|
||||
con.executemany(
|
||||
"UPDATE posts SET embedding = ? WHERE id = ?",
|
||||
updates
|
||||
)
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting TextEmbeddingNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to TextEmbeddingNdode")
|
||||
return context
|
||||
|
||||
if 'text' not in input_df.columns:
|
||||
logger.warning("No 'text' column in context dataframe. Skipping TextEmbeddingNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("TextEmbeddingNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
class UmapNode(TransformNode):
|
||||
"""Calculates 2D coordinates from embeddings using UMAP dimensionality reduction.
|
||||
|
||||
This node takes text embeddings and reduces them to 2D coordinates
|
||||
for visualization purposes.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_neighbors: int = 10,
|
||||
min_dist: float = 0.1,
|
||||
n_components: int = 3,
|
||||
metric: str = "cosine",
|
||||
random_state: int = 42,
|
||||
model_path: str = None):
|
||||
"""Initialize the UmapNode.
|
||||
|
||||
Args:
|
||||
n_neighbors: Number of neighbors to consider for UMAP (default: 10)
min_dist: Minimum distance between points in low-dimensional space (default: 0.1)
n_components: Number of dimensions to reduce to (default: 3)
metric: Distance metric to use (default: 'cosine')
random_state: Random seed for reproducibility (default: 42)
model_path: Path to save/load the fitted UMAP model (default: None, falls back to the UMAP_MODEL_PATH environment variable)
|
||||
"""
|
||||
self.n_neighbors = n_neighbors
|
||||
self.min_dist = min_dist
|
||||
self.n_components = n_components
|
||||
self.metric = metric
|
||||
self.random_state = random_state
|
||||
self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
|
||||
self.reducer = None
|
||||
logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
|
||||
f"n_components={n_components}, metric={metric}, random_state={random_state}, "
|
||||
f"model_path={self.model_path}")
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
Retrieves embeddings from BLOB storage, converts them back to numpy arrays,
|
||||
and applies UMAP dimensionality reduction to create 3D coordinates.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe with umap_x, umap_y and umap_z columns
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
if not UMAP_AVAILABLE:
|
||||
raise ImportError("UMAP is required for UmapNode. Install with: pip install umap-learn")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
# Convert BLOB embeddings back to numpy arrays
|
||||
if 'embedding' not in result_df.columns:
|
||||
logger.error("No 'embedding' column found in dataframe")
|
||||
raise ValueError("Input dataframe must contain 'embedding' column")
|
||||
|
||||
logger.info("Converting embeddings from BLOB to numpy arrays")
|
||||
result_df['embedding'] = result_df['embedding'].apply(
|
||||
lambda x: np.frombuffer(x, dtype=np.float32) if x is not None else None
|
||||
)
|
||||
|
||||
# Filter out rows with None embeddings
|
||||
valid_rows = result_df['embedding'].notna()
|
||||
if not valid_rows.any():
|
||||
logger.error("No valid embeddings found in dataframe")
|
||||
raise ValueError("No valid embeddings to process")
|
||||
|
||||
logger.info(f"Found {valid_rows.sum()} valid embeddings out of {len(result_df)} rows")
|
||||
|
||||
# Stack embeddings into a matrix
|
||||
embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
|
||||
logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
|
||||
|
||||
# Check if a saved UMAP model exists
|
||||
if self.model_path and os.path.exists(self.model_path):
|
||||
logger.info(f"Loading existing UMAP model from {self.model_path}")
|
||||
try:
|
||||
with open(self.model_path, 'rb') as f:
|
||||
self.reducer = pickle.load(f)
|
||||
logger.info("UMAP model loaded successfully")
|
||||
umap_coords = self.reducer.transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
|
||||
logger.info("Falling back to fitting a new model")
|
||||
self.reducer = None
|
||||
|
||||
# If no saved model or loading failed, fit a new model
|
||||
if self.reducer is None:
|
||||
logger.info("Fitting new UMAP reducer...")
|
||||
self.reducer = umap.UMAP(
|
||||
n_neighbors=self.n_neighbors,
|
||||
min_dist=self.min_dist,
|
||||
n_components=self.n_components,
|
||||
metric=self.metric,
|
||||
random_state=self.random_state
|
||||
)
|
||||
|
||||
umap_coords = self.reducer.fit_transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
|
||||
|
||||
# Save the fitted model
|
||||
try:
|
||||
umap_folder = os.path.dirname(self.model_path)
if umap_folder:
    os.makedirs(umap_folder, exist_ok=True)
|
||||
with open(self.model_path, 'wb') as f:
|
||||
pickle.dump(self.reducer, f)
|
||||
logger.info(f"UMAP model saved to {self.model_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
|
||||
|
||||
# Add UMAP coordinates to dataframe
|
||||
result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
|
||||
result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
|
||||
result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]
|
||||
|
||||
# Fill NaN for invalid rows
|
||||
result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
|
||||
result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
|
||||
result_df['umap_z'] = result_df['umap_z'].fillna(value=0)
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store UMAP coordinates back to the database.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe with umap_x, umap_y and umap_z columns
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Batch update UMAP coordinates
|
||||
updates = [
|
||||
(row['umap_x'], row['umap_y'], row['umap_z'], row['id'])
|
||||
for _, row in df.iterrows()
|
||||
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
|
||||
]
|
||||
|
||||
if updates:
|
||||
con.executemany(
|
||||
"UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
|
||||
updates
|
||||
)
|
||||
con.commit()
|
||||
logger.info(f"Stored {len(updates)} UMAP coordinate pairs successfully")
|
||||
else:
|
||||
logger.warning("No valid UMAP coordinates to store")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
class SimilarityNode(TransformNode):
|
||||
"""Example transform node template.
|
||||
|
||||
This node demonstrates the basic structure for creating
|
||||
new transformation nodes in the pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
param1: str = "default_value",
|
||||
param2: int = 42,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
param1: Example string parameter
|
||||
param2: Example integer parameter
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.param1 = param1
|
||||
self.param2 = param2
|
||||
self.device = device
|
||||
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create any necessary tables in the database.
|
||||
|
||||
This is optional - only needed if your node creates new tables.
|
||||
"""
|
||||
logger.info("Creating example tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS example_results (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
result_value TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Example: Add a new column based on existing data
|
||||
result_df = df.copy()
|
||||
result_df['processed'] = True
|
||||
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
This is optional - only needed if you want to persist results.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Example: Store to database
|
||||
# df[['post_id', 'result_value']].to_sql(
|
||||
# 'example_results',
|
||||
# con,
|
||||
# if_exists='append',
|
||||
# index=False
|
||||
# )
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Create any necessary tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
def main():
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler("app.log"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
con = sqlite3.connect("/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite")
|
||||
df = pd.read_sql('select * from posts;', con)
|
||||
#node = TextEmbeddingNode(device='mps')
|
||||
#context = TransformContext(df)
|
||||
|
||||
logger.info(df)
|
||||
#new_context = node.run(con, context)
|
||||
#logger.info(new_context.get_dataframe())
|
||||
|
||||
#umapNode = UmapNode()
|
||||
#new_context = umapNode.run(con, new_context)
|
||||
|
||||
#logger.info(new_context.get_dataframe())
|
||||
|
||||
# Create 3D scatter plot of UMAP coordinates
|
||||
result_df = df
|
||||
|
||||
fig = plt.figure(figsize=(12, 9))
|
||||
ax = fig.add_subplot(111, projection='3d')
|
||||
|
||||
scatter = ax.scatter(
|
||||
result_df['umap_x'],
|
||||
result_df['umap_y'],
|
||||
result_df['umap_z'],
|
||||
c=result_df['id'],
|
||||
cmap='viridis',
|
||||
alpha=0.6,
|
||||
s=50
|
||||
)
|
||||
|
||||
ax.set_xlabel('UMAP X')
|
||||
ax.set_ylabel('UMAP Y')
|
||||
ax.set_zlabel('UMAP Z')
|
||||
ax.set_title('3D UMAP Visualization of Post Embeddings')
|
||||
|
||||
plt.colorbar(scatter, ax=ax, label='Post Index')
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
logger.info("3D plot displayed")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [ -d "$GLINER_MODEL_PATH" ] && [ -f "$GLINER_MODEL_PATH/config.json" ]; then
|
||||
echo "GLiNER model already present at $GLINER_MODEL_PATH"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Downloading GLiNER model $GLINER_MODEL_ID to $GLINER_MODEL_PATH"
|
||||
mkdir -p "$GLINER_MODEL_PATH"
|
||||
|
||||
# Use Python with huggingface_hub for reliable model downloading
|
||||
python3 << 'EOF'
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
model_id = os.environ.get('GLINER_MODEL_ID')
|
||||
model_path = os.environ.get('GLINER_MODEL_PATH')
|
||||
|
||||
if not model_id or not model_path:
|
||||
raise ValueError(f"GLINER_MODEL_ID and GLINER_MODEL_PATH environment variables must be set")
|
||||
|
||||
try:
|
||||
print(f"Downloading model {model_id} to {model_path}")
|
||||
snapshot_download(
|
||||
repo_id=model_id,
|
||||
cache_dir=None, # Don't use cache, download directly
|
||||
local_dir=model_path,
|
||||
local_dir_use_symlinks=False # Don't use symlinks, copy files
|
||||
)
|
||||
print(f"Successfully downloaded GLiNER model to {model_path}")
|
||||
except Exception as e:
|
||||
print(f"Error downloading GLiNER model: {e}")
|
||||
exit(1)
|
||||
EOF
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [ -d "$GTE_MODEL_PATH" ] && [ -f "$GTE_MODEL_PATH/config.json" ]; then
|
||||
echo "GTE model already present at $GTE_MODEL_PATH"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Downloading GTE model $GTE_MODEL_ID to $GTE_MODEL_PATH"
|
||||
mkdir -p "$GTE_MODEL_PATH"
|
||||
|
||||
# Use Python with huggingface_hub for reliable model downloading
|
||||
python3 << 'EOF'
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
model_id = os.environ.get('GTE_MODEL_ID')
|
||||
model_path = os.environ.get('GTE_MODEL_PATH')
|
||||
|
||||
if not model_id or not model_path:
|
||||
raise ValueError(f"GTE_MODEL_ID and GTE_MODEL_PATH environment variables must be set")
|
||||
|
||||
try:
|
||||
print(f"Downloading model {model_id} to {model_path}")
|
||||
snapshot_download(
|
||||
repo_id=model_id,
|
||||
cache_dir=None, # Don't use cache, download directly
|
||||
local_dir=model_path,
|
||||
local_dir_use_symlinks=False # Don't use symlinks, copy files
|
||||
)
|
||||
print(f"Successfully downloaded GTE model to {model_path}")
|
||||
except Exception as e:
|
||||
print(f"Error downloading GTE model: {e}")
|
||||
exit(1)
|
||||
EOF
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Run model download with output to stdout/stderr
|
||||
/usr/local/bin/ensure_gte_model.sh 2>&1
|
||||
/usr/local/bin/ensure_gliner_model.sh 2>&1
|
||||
|
||||
# Start cron in foreground with logging
|
||||
exec cron -f -L 2
|
||||
# cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
"""Example template node for the transform pipeline.
|
||||
|
||||
This is a template showing how to create new transform nodes.
|
||||
Copy this file and modify it for your specific transformation needs.
|
||||
"""
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class ExampleNode(TransformNode):
|
||||
"""Example transform node template.
|
||||
|
||||
This node demonstrates the basic structure for creating
|
||||
new transformation nodes in the pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
param1: str = "default_value",
|
||||
param2: int = 42,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
param1: Example string parameter
|
||||
param2: Example integer parameter
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.param1 = param1
|
||||
self.param2 = param2
|
||||
self.device = device
|
||||
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create any necessary tables in the database.
|
||||
|
||||
This is optional - only needed if your node creates new tables.
|
||||
"""
|
||||
logger.info("Creating example tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS example_results (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
result_value TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Example: Add a new column based on existing data
|
||||
result_df = df.copy()
|
||||
result_df['processed'] = True
|
||||
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
This is optional - only needed if you want to persist results.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Example: Store to database
|
||||
# df[['post_id', 'result_value']].to_sql(
|
||||
# 'example_results',
|
||||
# con,
|
||||
# if_exists='append',
|
||||
# index=False
|
||||
# )
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Create any necessary tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
# This allows you to test your node independently
|
||||
import os
|
||||
os.chdir('/Users/linussilberstein/Documents/Knack-Scraper/transform')
|
||||
|
||||
from pipeline import TransformContext
|
||||
import sqlite3
|
||||
|
||||
# Create test data
|
||||
test_df = pd.DataFrame({
|
||||
'id': [1, 2, 3],
|
||||
'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
|
||||
})
|
||||
|
||||
# Create test database connection
|
||||
test_con = sqlite3.connect(':memory:')
|
||||
|
||||
# Create and run node
|
||||
node = ExampleNode(param1="test", param2=100)
|
||||
context = TransformContext(test_df)
|
||||
result_context = node.run(test_con, context)
|
||||
|
||||
# Check results
|
||||
result_df = result_context.get_dataframe()
|
||||
print("\nResult DataFrame:")
|
||||
print(result_df)
|
||||
|
||||
test_con.close()
|
||||
print("\n✓ ExampleNode test completed successfully!")
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
#! python3
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Default to INFO; any other LOGGING_LEVEL value (e.g. 'DEBUG') enables debug logging.
if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging_level,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler("app.log"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
def setup_database_connection(db_path=None):
|
||||
"""Create connection to the SQLite database."""
|
||||
if db_path is None:
|
||||
db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
|
||||
logger.info(f"Connecting to database: {db_path}")
|
||||
return sqlite3.connect(db_path)
|
||||
|
||||
|
||||
def table_exists(tablename: str, con: sqlite3.Connection):
|
||||
"""Check if a table exists in the database."""
|
||||
query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
|
||||
return len(con.execute(query, [tablename]).fetchall()) > 0
|
||||
|
||||
def run_from_database(db_path=None):
|
||||
"""Run the pipeline using database as input and output."""
|
||||
logger.info("Starting transform pipeline (database mode)")
|
||||
|
||||
try:
|
||||
con = setup_database_connection(db_path)
|
||||
logger.info("Database connection established")
|
||||
|
||||
# Check if posts table exists
|
||||
if not table_exists('posts', con):
|
||||
logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
|
||||
logger.info("Transform pipeline skipped - no data available")
|
||||
return
|
||||
|
||||
# Import transform components
|
||||
from pipeline import create_default_pipeline, TransformContext
|
||||
import pandas as pd
|
||||
|
||||
# Load posts data
|
||||
logger.info("Loading posts from database")
|
||||
sql = "SELECT * FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0)"
|
||||
# MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 100)
|
||||
df = pd.read_sql(sql, con)
|
||||
logger.info(f"Loaded {len(df)} uncleaned posts with authors")
|
||||
|
||||
if df.empty:
|
||||
logger.info("No uncleaned posts found. Transform pipeline skipped.")
|
||||
return
|
||||
|
||||
# Create initial context
|
||||
context = TransformContext(df)
|
||||
|
||||
# Create and run parallel pipeline
|
||||
device = os.environ.get('COMPUTE_DEVICE', 'cpu')
|
||||
max_workers = int(os.environ.get('MAX_WORKERS', 4))
|
||||
|
||||
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
|
||||
effective_db_path = db_path or os.environ.get('DB_PATH', '/data/knack.sqlite')
|
||||
results = pipeline.run(
|
||||
db_path=effective_db_path,
|
||||
initial_context=context,
|
||||
fail_fast=False # Continue even if some nodes fail
|
||||
)
|
||||
|
||||
logger.info(f"Pipeline completed. Processed {len(results)} node(s)")
|
||||
|
||||
# Mark all processed posts as cleaned
|
||||
post_ids = df['id'].tolist()
|
||||
if post_ids:
|
||||
placeholders = ','.join('?' * len(post_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
|
||||
con.commit()
|
||||
logger.info(f"Marked {len(post_ids)} posts as cleaned")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transform pipeline: {e}", exc_info=True)
|
||||
sys.exit(1)
|
||||
finally:
|
||||
if 'con' in locals():
|
||||
con.close()
|
||||
logger.info("Database connection closed")
|
||||
|
||||
def main():
|
||||
"""Main entry point with command-line argument support."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Transform pipeline for Knack scraper data',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run with database (Docker mode)
|
||||
python main.py
|
||||
|
||||
# Run with custom device and workers
|
||||
python main.py --database /path/to/knack.sqlite --device mps --workers 8
|
||||
|
||||
# Run with specific database file
|
||||
python main.py --database /path/to/knack.sqlite
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--database',
|
||||
help='Path to SQLite database (for database mode). Defaults to DB_PATH env var or /data/knack.sqlite'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
default=os.environ.get('COMPUTE_DEVICE', 'cpu'),
|
||||
choices=['cpu', 'cuda', 'mps'],
|
||||
help='Device to use for compute-intensive operations (default: cpu)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--workers',
|
||||
type=int,
|
||||
default=int(os.environ.get('MAX_WORKERS', 4)),
|
||||
help='Maximum number of parallel workers (default: 4)'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run the pipeline; --database overrides the DB_PATH env var / default path.
# NOTE: --device and --workers are currently picked up via the COMPUTE_DEVICE and
# MAX_WORKERS environment variables inside run_from_database(), not passed through.
run_from_database(db_path=args.database)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,289 +0,0 @@
|
|||
"""Parallel pipeline orchestration for transform nodes."""
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
import multiprocessing as mp
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class TransformContext:
|
||||
"""Context object containing the dataframe for transformation."""
|
||||
# Possibly add a dict to the context to carry additional information
|
||||
|
||||
def __init__(self, df: pd.DataFrame):
|
||||
self.df = df
|
||||
|
||||
def get_dataframe(self) -> pd.DataFrame:
|
||||
"""Get the pandas dataframe from the context."""
|
||||
return self.df
|
||||
|
||||
class NodeConfig:
|
||||
"""Configuration for a transform node."""
|
||||
|
||||
def __init__(self,
|
||||
node_class: type,
|
||||
node_kwargs: Dict = None,
|
||||
dependencies: List[str] = None,
|
||||
name: str = None):
|
||||
"""Initialize node configuration.
|
||||
|
||||
Args:
|
||||
node_class: The TransformNode class to instantiate
|
||||
node_kwargs: Keyword arguments to pass to node constructor
|
||||
dependencies: List of node names that must complete before this one
|
||||
name: Optional name for the node (defaults to class name)
|
||||
"""
|
||||
self.node_class = node_class
|
||||
self.node_kwargs = node_kwargs or {}
|
||||
self.dependencies = dependencies or []
|
||||
self.name = name or node_class.__name__
|
||||
|
||||
class ParallelPipeline:
|
||||
"""Pipeline for executing transform nodes in parallel where possible.
|
||||
|
||||
The pipeline analyzes dependencies between nodes and executes
|
||||
independent nodes concurrently using multiprocessing or threading.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_workers: Optional[int] = None,
|
||||
use_processes: bool = False):
|
||||
"""Initialize the parallel pipeline.
|
||||
|
||||
Args:
|
||||
max_workers: Maximum number of parallel workers (defaults to CPU count)
|
||||
use_processes: If True, use ProcessPoolExecutor; if False, use ThreadPoolExecutor
|
||||
"""
|
||||
self.max_workers = max_workers or mp.cpu_count()
|
||||
self.use_processes = use_processes
|
||||
self.nodes: Dict[str, NodeConfig] = {}
|
||||
logger.info(f"Initialized ParallelPipeline with {self.max_workers} workers "
|
||||
f"({'processes' if use_processes else 'threads'})")
|
||||
|
||||
def add_node(self, config: NodeConfig):
|
||||
"""Add a node to the pipeline.
|
||||
|
||||
Args:
|
||||
config: NodeConfig with node details and dependencies
|
||||
"""
|
||||
self.nodes[config.name] = config
|
||||
logger.info(f"Added node '{config.name}' with dependencies: {config.dependencies}")
|
||||
|
||||
def _get_execution_stages(self) -> List[List[str]]:
|
||||
"""Determine execution stages based on dependencies.
|
||||
|
||||
Returns:
|
||||
List of stages, where each stage contains node names that can run in parallel
|
||||
"""
|
||||
stages = []
|
||||
completed = set()
|
||||
remaining = set(self.nodes.keys())
|
||||
|
||||
while remaining:
|
||||
# Find nodes whose dependencies are all completed
|
||||
ready = []
|
||||
for node_name in remaining:
|
||||
config = self.nodes[node_name]
|
||||
if all(dep in completed for dep in config.dependencies):
|
||||
ready.append(node_name)
|
||||
|
||||
if not ready:
|
||||
# Circular dependency or missing dependency
|
||||
raise ValueError(f"Cannot resolve dependencies. Remaining nodes: {remaining}")
|
||||
|
||||
stages.append(ready)
|
||||
completed.update(ready)
|
||||
remaining -= set(ready)
|
||||
|
||||
return stages
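# Worked example: with the nodes registered by create_default_pipeline() below,
# the dependency analysis yields these stages (order within a stage is not fixed):
#   stage 1: URLNode, AuthorNode                (no dependencies)
#   stage 2: FuzzyAuthorNode, TextEmbeddingNode (depend on AuthorNode)
#   stage 3: UmapNode                           (depends on TextEmbeddingNode)
#   stage 4: ToD3Node                           (depends on all of the above)
# Nodes within a stage are submitted to the executor concurrently in run().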
|
||||
|
||||
def _execute_node(self,
|
||||
node_name: str,
|
||||
db_path: str,
|
||||
context: TransformContext) -> tuple:
|
||||
"""Execute a single node.
|
||||
|
||||
Args:
|
||||
node_name: Name of the node to execute
|
||||
db_path: Path to the SQLite database
|
||||
context: TransformContext for the node
|
||||
|
||||
Returns:
|
||||
Tuple of (node_name, result_context, error)
|
||||
"""
|
||||
try:
|
||||
# Create fresh database connection (not shared across processes/threads)
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
config = self.nodes[node_name]
|
||||
node = config.node_class(**config.node_kwargs)
|
||||
|
||||
logger.info(f"Executing node: {node_name}")
|
||||
result_context = node.run(con, context)
|
||||
|
||||
con.close()
|
||||
logger.info(f"Node '{node_name}' completed successfully")
|
||||
|
||||
return node_name, result_context, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error executing node '{node_name}': {e}", exc_info=True)
|
||||
return node_name, None, str(e)
|
||||
|
||||
def run(self,
|
||||
db_path: str,
|
||||
initial_context: TransformContext,
|
||||
fail_fast: bool = False) -> Dict[str, TransformContext]:
|
||||
"""Execute the pipeline.
|
||||
|
||||
Args:
|
||||
db_path: Path to the SQLite database
|
||||
initial_context: Initial TransformContext for the pipeline
|
||||
fail_fast: If True, stop execution on first error
|
||||
|
||||
Returns:
|
||||
Dict mapping node names to their output TransformContext
|
||||
"""
|
||||
logger.info("Starting parallel pipeline execution")
|
||||
|
||||
stages = self._get_execution_stages()
|
||||
logger.info(f"Pipeline has {len(stages)} execution stage(s)")
|
||||
|
||||
results = {}
|
||||
errors = []
|
||||
|
||||
ExecutorClass = ProcessPoolExecutor if self.use_processes else ThreadPoolExecutor
|
||||
|
||||
for stage_num, stage_nodes in enumerate(stages, 1):
|
||||
logger.info(f"Stage {stage_num}/{len(stages)}: Executing {len(stage_nodes)} node(s) in parallel: {stage_nodes}")
|
||||
|
||||
# For nodes in this stage, use the context from their dependencies
|
||||
# If multiple dependencies, we'll use the most recent one (or could merge)
|
||||
stage_futures = {}
|
||||
|
||||
with ExecutorClass(max_workers=min(self.max_workers, len(stage_nodes))) as executor:
|
||||
for node_name in stage_nodes:
|
||||
config = self.nodes[node_name]
|
||||
|
||||
# Get context from dependencies (use the last dependency's output)
|
||||
if config.dependencies:
|
||||
context = results.get(config.dependencies[-1], initial_context)
|
||||
else:
|
||||
context = initial_context
|
||||
|
||||
future = executor.submit(self._execute_node, node_name, db_path, context)
|
||||
stage_futures[future] = node_name
|
||||
|
||||
# Wait for all nodes in this stage to complete
|
||||
for future in as_completed(stage_futures):
|
||||
node_name = stage_futures[future]
|
||||
name, result_context, error = future.result()
|
||||
|
||||
if error:
|
||||
errors.append((name, error))
|
||||
if fail_fast:
|
||||
logger.error(f"Pipeline failed at node '{name}': {error}")
|
||||
raise RuntimeError(f"Node '{name}' failed: {error}")
|
||||
else:
|
||||
results[name] = result_context
|
||||
|
||||
if errors:
|
||||
logger.warning(f"Pipeline completed with {len(errors)} error(s)")
|
||||
for name, error in errors:
|
||||
logger.error(f" - {name}: {error}")
|
||||
else:
|
||||
logger.info("Pipeline completed successfully")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def create_default_pipeline(device: str = "cpu",
|
||||
max_workers: Optional[int] = None) -> ParallelPipeline:
|
||||
"""Create a pipeline with default transform nodes.
|
||||
|
||||
Args:
|
||||
device: Device to use for compute-intensive nodes ('cpu', 'cuda', 'mps')
|
||||
max_workers: Maximum number of parallel workers
|
||||
|
||||
Returns:
|
||||
Configured ParallelPipeline
|
||||
"""
|
||||
from author_node import NerAuthorNode, FuzzyAuthorNode
|
||||
from embeddings_node import TextEmbeddingNode, UmapNode
|
||||
from url_node import URLNode
|
||||
from to_d3_node import ToD3Node
|
||||
|
||||
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=URLNode,
|
||||
dependencies=[],
|
||||
name='URLNode'
|
||||
))
|
||||
|
||||
# Add AuthorNode (no dependencies)
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=NerAuthorNode,
|
||||
node_kwargs={
|
||||
'device': device,
|
||||
'model_path': os.environ.get('GLINER_MODEL_PATH')
|
||||
},
|
||||
dependencies=[],
|
||||
name='AuthorNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=FuzzyAuthorNode,
|
||||
node_kwargs={
|
||||
'max_l_dist': 1
|
||||
},
|
||||
dependencies=['AuthorNode'],
|
||||
name='FuzzyAuthorNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=TextEmbeddingNode,
|
||||
node_kwargs={
|
||||
'device': device,
|
||||
'model_path': os.environ.get('GTE_MODEL_PATH')
|
||||
},
|
||||
dependencies=['AuthorNode'],
|
||||
name='TextEmbeddingNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=UmapNode,
|
||||
node_kwargs={},
|
||||
dependencies=['TextEmbeddingNode'],
|
||||
name='UmapNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=ToD3Node,
|
||||
dependencies=[
|
||||
'UmapNode',
|
||||
'TextEmbeddingNode',
|
||||
'FuzzyAuthorNode',
|
||||
'AuthorNode',
|
||||
'URLNode'
|
||||
],
|
||||
node_kwargs={
|
||||
'output_path': './data/json/'
|
||||
},
|
||||
name='ToD3Node'
|
||||
))
|
||||
|
||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||
|
||||
# pipeline.add_node(NodeConfig(
|
||||
# node_class=UMAPNode,
|
||||
# node_kwargs={'device': device},
|
||||
# dependencies=['EmbeddingNode'], # Runs after EmbeddingNode
|
||||
# name='UMAPNode'
|
||||
# ))
|
||||
|
||||
return pipeline
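# Minimal usage sketch (paths and settings are illustrative assumptions; main.py
# drives this in the same way via run_from_database()):
#
#   import sqlite3
#   import pandas as pd
#
#   con = sqlite3.connect("/data/knack.sqlite")
#   df = pd.read_sql("SELECT * FROM posts WHERE author IS NOT NULL", con)
#   con.close()
#
#   pipeline = create_default_pipeline(device="cpu", max_workers=4)
#   results = pipeline.run(db_path="/data/knack.sqlite",
#                          initial_context=TransformContext(df),
#                          fail_fast=False)
#   # results maps node names (e.g. 'UmapNode') to their output TransformContext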
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
pandas
|
||||
python-dotenv
|
||||
gliner
|
||||
torch
|
||||
fuzzysearch
|
||||
sentence_transformers
|
||||
umap-learn
|
||||
matplotlib
|
||||
huggingface_hub
|
||||
|
|
@ -1,102 +0,0 @@
|
|||
"""Node to query data from the database and generate individual json file
|
||||
for visualisations in the d3.js framework"""
|
||||
import sqlite3
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class ToD3Node(TransformNode):
|
||||
"""Node that takes the data in a sqlite3 database and generates visualisation data
|
||||
as json files in a specific folder.
|
||||
"""
|
||||
|
||||
def __init__(self, output_path: str):
|
||||
self.output_path = output_path
|
||||
self.queries = {
|
||||
'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
|
||||
'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
|
||||
'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
|
||||
'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
|
||||
'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
|
||||
'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
|
||||
'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
|
||||
}
|
||||
super().__init__()
|
||||
logger.info(f"Init ToD3Node, Storing files to {self.output_path}")
|
||||
|
||||
def _query_db(self, con: sqlite3.Connection, query: str):
|
||||
cursor = con.cursor()
|
||||
cursor.execute(query)
|
||||
r = [dict((cursor.description[i][0], value) \
|
||||
for i, value in enumerate(row)) for row in cursor.fetchall()]
|
||||
return r
|
||||
|
||||
def _calculate_files(self, con: sqlite3.Connection):
|
||||
for key in self.queries.keys():
|
||||
q = self._query_db(con, self.queries[key])
|
||||
with open(f'{self.output_path}{key}.json', 'w') as f:
|
||||
f.write(json.dumps(q))
|
||||
|
||||
return len(self.queries.keys())
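# Output sketch: each query's rows become a list of flat JSON objects whose keys
# are the SQL column names/aliases. A (hypothetical) data/json/authors.json:
#   [{"name": "Redaktion", "type": "person", "count": 42}, ...]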
|
||||
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext):
|
||||
"""Executes the toD3 Node
|
||||
Writes to a bunch of files, each for each query.
|
||||
|
||||
Args:
|
||||
con (sqlite3.Connection): SQLite database connection
|
||||
context (TransformContext): TransformContext, containing the input
|
||||
dataframe of all post.
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe.
|
||||
"""
|
||||
logger.info("Starting ToD3Node transformation")
|
||||
|
||||
if not os.path.isdir(self.output_path):
|
||||
logger.warning(f"output_dir does not exist, creating dir...")
|
||||
os.mkdir(self.output_path)
|
||||
|
||||
count = self._calculate_files(con)
|
||||
|
||||
logger.info(f"Successfully generated {count} json files.")
|
||||
|
||||
return context
|
||||
|
||||
def main():
|
||||
import sys
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
# Connect to database
|
||||
db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
try:
|
||||
context = TransformContext(None)
|
||||
|
||||
node = ToD3Node('/Users/linussilberstein/Documents/Knack-Scraper/data/json/')
|
||||
|
||||
context = node.run(con, context)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during transformation: {e}", exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
con.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
"""Base transform node for data pipeline."""
|
||||
from abc import ABC, abstractmethod
|
||||
import sqlite3
|
||||
|
||||
from pipeline import TransformContext
|
||||
|
||||
class TransformNode(ABC):
|
||||
"""Abstract base class for transformation nodes.
|
||||
|
||||
Each transform node implements a single transformation step
|
||||
that takes data from the database, transforms it, and
|
||||
potentially writes results back.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing the input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with the transformed dataframe
|
||||
"""
|
||||
pass
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
"""Nodes to extract URL in text using regex patterns."""
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class URLNode(TransformNode):
|
||||
"""Node that looks for URLs in the text-column in posts.
|
||||
Stores data in a new table urls:
|
||||
- id, post_id, url_raw, tld, host
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
logger.info("Init URL Node")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create urls table if they don't exist."""
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS urls (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
url_raw TEXT,
|
||||
tld TEXT,
|
||||
host TEXT,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
|
||||
logger.info(f"Processing {len(input_df)} rows")
|
||||
|
||||
mappings = []
|
||||
for _, post_row in input_df.iterrows():
|
||||
post_id = post_row['id']
|
||||
post_text = post_row['text']
|
||||
|
||||
pattern = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
|
||||
|
||||
urls = re.findall(pattern, post_text)
|
||||
logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
hostname = parsed.netloc
|
||||
|
||||
# If the hostname starts with www. remove that part.
|
||||
if hostname[:4] == 'www.':
|
||||
hostname = hostname[4:]
|
||||
|
||||
# Extract TLD (last part after the last dot)
|
||||
tld = ""
|
||||
if hostname:
|
||||
parts = hostname.split('.')
|
||||
if len(parts) > 0:
|
||||
tld = parts[-1]
|
||||
|
||||
mappings.append({
|
||||
'post_id': post_id,
|
||||
'url_raw': url,
|
||||
'host': hostname,
|
||||
'tld': tld
|
||||
})
|
||||
logger.debug(f" URL: {url} -> Host: {hostname}, TLD: {tld}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse URL {url}: {e}")
|
||||
|
||||
result_df = pd.DataFrame(mappings)
|
||||
logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
|
||||
return result_df
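# Example of the extraction above (illustrative text and IDs):
#   "Siehe https://www.example.org/artikel?id=1 und http://blog.example.org"
#   -> [{'post_id': 1, 'url_raw': 'https://www.example.org/artikel?id=1',
#        'host': 'example.org', 'tld': 'org'},
#       {'post_id': 1, 'url_raw': 'http://blog.example.org',
#        'host': 'blog.example.org', 'tld': 'org'}]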
|
||||
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
|
||||
if result_df.empty:
|
||||
logger.info("No URLs to store")
|
||||
return
|
||||
|
||||
result_df.to_sql('urls', con, if_exists='append', index=False)
|
||||
logger.info(f"Stored {len(result_df)} URLs to database")
|
||||
|
||||
    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Execute the URL node.

        Writes to the urls table and creates that table if it does not
        exist yet.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): TransformContext containing the
                input dataframe of all posts

        Returns:
            TransformContext wrapping the unchanged input dataframe.
        """
        logger.info("Starting URLNode transformation")

        input_df = context.get_dataframe()

        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("URLNode transformation complete")

        return TransformContext(input_df)


def main():
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Connect to the database (hardcoded local path)
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)

    try:
        # Read posts from the database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")

        # Create context
        context = TransformContext(df)

        # Run URLNode
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")

        logger.info("All nodes completed successfully!")

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()
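
For orientation, here is a minimal standalone sketch (not part of the diff) of the normalization URLNode applies to each match: find URLs with the regex used above, pull the host out with urlparse, strip a leading www., and take the label after the last dot as the TLD. The extract_urls helper and the example strings are hypothetical, introduced only for illustration.

# Illustrative sketch only; extract_urls is a hypothetical helper, not part of the repo.
import re
from urllib.parse import urlparse

URL_PATTERN = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"

def extract_urls(text: str) -> list[dict]:
    """Return one dict per URL found in text: url_raw, host, tld."""
    rows = []
    for url in re.findall(URL_PATTERN, text):
        host = urlparse(url).netloc
        if host.startswith("www."):
            host = host[4:]
        tld = host.rsplit(".", 1)[-1] if host else ""
        rows.append({"url_raw": url, "host": host, "tld": tld})
    return rows

print(extract_urls("Siehe https://www.example.org/artikel und http://knack.news/post/1"))
# -> [{'url_raw': 'https://www.example.org/artikel', 'host': 'example.org', 'tld': 'org'},
#     {'url_raw': 'http://knack.news/post/1', 'host': 'knack.news', 'tld': 'news'}]
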
@ -1,13 +0,0 @@
name: knack-viz
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.11
  - pandas>=2.0.0
  - altair>=5.0.0
  - notebook
  - ipykernel
  - pip
  - pip:
      - vega_datasets
File diff suppressed because one or more lines are too long
@ -1,343 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "0ab5f064",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Libraries imported successfully!\n"
]
}
],
"source": [
"import sqlite3\n",
"from pathlib import Path\n",
"import json\n",
"\n",
"print(\"Libraries imported successfully!\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "94b2e3d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tables in the database:\n",
" - posttags\n",
" - postcategories\n",
" - tags\n",
" - categories\n",
" - posts\n",
" - authors\n",
" - post_authors\n",
" - sqlite_sequence\n",
" - urls\n"
]
}
],
"source": [
"# Connect to the database\n",
"db_path = Path('../data/knack.sqlite')\n",
"conn = sqlite3.connect(db_path)\n",
"cursor = conn.cursor()\n",
"\n",
"# Get all table names\n",
"cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
"tables = cursor.fetchall()\n",
"\n",
"print(\"Tables in the database:\")\n",
"for table in tables:\n",
" print(f\" - {table[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b3924728",
"metadata": {},
"outputs": [],
"source": [
"def query_db(query, args=(), one=False):\n",
" cursor.execute(query, args)\n",
" r = [dict((cursor.description[i][0], value) \\\n",
" for i, value in enumerate(row)) for row in cursor.fetchall()]\n",
" return (r[0] if r else None) if one else r"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c0fdb0ba",
"metadata": {},
"outputs": [],
"source": [
"q = query_db('select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35')\n",
"\n",
"with open('json/tags.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "df5c31b3",
"metadata": {},
"outputs": [],
"source": [
"q = query_db('select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;')\n",
"\n",
"with open('json/categories.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "101b971d",
"metadata": {},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"SELECT\n",
" strftime('%Y-%m', date) AS month,\n",
" category,\n",
" COUNT(*) AS count\n",
"FROM posts\n",
"WHERE date > '2020-01-01' AND category NOT NULL\n",
"GROUP BY strftime('%Y-%m', date), category\n",
"ORDER BY month;\n",
" \"\"\")\n",
"\n",
"with open('json/posts_per_month.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2f23046d",
"metadata": {},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"select name,\n",
" min(type) as type,\n",
" count(posts.id) as count\n",
"from authors\n",
"inner join post_authors on authors.id = author_id\n",
"inner join posts on posts.id = post_id\n",
" \n",
"where category NOT like '%Presseartikel%'\n",
" \n",
"group by name\n",
" \n",
"order by count desc\n",
"limit 25\n",
"\"\"\")\n",
"\n",
"with open('json/authors.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d4ae65f1",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"tag_pairs = query_db(\"\"\"\n",
" SELECT t1.tag AS source,\n",
" t2.tag AS target,\n",
" COUNT(*) AS weight\n",
" FROM posttags pt1\n",
" JOIN posttags pt2\n",
" ON pt1.post_id = pt2.post_id\n",
" AND pt1.tag_id < pt2.tag_id\n",
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
" GROUP BY t1.tag, t2.tag\n",
" HAVING weight > 1\n",
" ORDER BY weight DESC;\n",
"\"\"\")\n",
"\n",
"with open('json/tag_chords.json', 'w') as f:\n",
" f.write(json.dumps(tag_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "13062474",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"select\n",
"cast(umap_x*10 as int) as x,\n",
"cast(umap_y*10 as int) as y,\n",
"cast(umap_z*10 as int) as z,\n",
"posts.id as id, category_id as c,\n",
"SUBSTRING(title, 1, 12) as t\n",
"\n",
"from posts\n",
"inner join postcategories on post_id = posts.id\n",
"inner join categories on category_id = categories.id\n",
"\n",
"\"\"\")\n",
"\n",
"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
"\n",
"with open('json/umap_embeddings.json', 'w') as f:\n",
" f.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e5378b17",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"SELECT \n",
"'knack[punkt]news' AS source, \n",
"CASE \n",
" WHEN tld_count < 10 THEN 'other'\n",
" ELSE tld \n",
"END AS target, \n",
"SUM(tld_count) AS value\n",
"FROM (\n",
" SELECT tld, COUNT(*) as tld_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL \n",
" GROUP BY tld\n",
")\n",
"GROUP BY target\n",
"\"\"\")\n",
"\n",
"q2 = query_db(\"\"\"\n",
"SELECT \n",
" tld AS source, \n",
" CASE \n",
" WHEN host_count < 10 THEN 'other'\n",
" ELSE host \n",
" END AS target, \n",
" SUM(host_count) AS value\n",
"FROM (\n",
" SELECT tld, host, COUNT(*) as host_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
" GROUP BY tld, host\n",
")\n",
"WHERE source != \"\" AND target != 'other'\n",
"GROUP BY tld, target\n",
"\"\"\")\n",
"\n",
"with open('json/urls_l1.json', 'w') as f:\n",
" f.write(json.dumps(q))\n",
"\n",
"with open('json/urls_l2.json', 'w') as f:\n",
" f.write(json.dumps(q2))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "1501cb06",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = query_db(\"\"\"\n",
" SELECT \n",
" a.name AS author_name,\n",
" t.tag,\n",
" COUNT(*) AS tag_count\n",
"FROM authors a\n",
"JOIN post_authors pa ON a.id = pa.author_id\n",
"JOIN posttags pt ON pa.post_id = pt.post_id\n",
"JOIN tags t ON pt.tag_id = t.id\n",
"WHERE a.name = 'Antifa'\n",
"GROUP BY a.id, a.name, t.id, t.tag\n",
"ORDER BY tag_count DESC;\n",
"\"\"\")\n",
"\n",
"q"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knack-viz",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
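
As a hedged aside (not part of the diff): the notebook cells above export query results as JSON for visualization. One plausible way to consume an export such as json/tags.json in the knack-viz environment (pandas plus Altair, per the environment file above) is a small bar-chart script along these lines; the output filename tags.html is an assumption for illustration.

# Sketch only: renders the exported tag counts as a horizontal bar chart.
import pandas as pd
import altair as alt

df = pd.read_json("json/tags.json")  # records with columns: tag, count

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("count:Q", title="Posts"),
        y=alt.Y("tag:N", sort="-x", title="Tag"),
    )
)
chart.save("tags.html")  # writes a self-contained HTML chart
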