forked from lukaszett/Knack-Scraper
Compare commits
No commits in common. "main" and "main" have entirely different histories.
30 changed files with 199 additions and 4546 deletions
3  .gitignore  vendored
@@ -1,6 +1,3 @@
data/
venv/
experiment/
__pycache__/
.DS_STORE
.env
15  Dockerfile  Normal file
@@ -0,0 +1,15 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN apt update -y
RUN apt install -y cron
COPY crontab .
RUN crontab crontab

COPY main.py .
14  Makefile
@@ -1,12 +1,2 @@
volume:
	docker volume create knack_data

stop:
	docker stop knack-scraper || true
	docker rm knack-scraper || true

up:
	docker compose up -d

down:
	docker compose down
build:
	docker build -t knack-scraper .
18  README.md
@@ -1,18 +0,0 @@
Knack-Scraper does exactly what its name suggests it does.
Knack-Scraper scrapes knack.news and writes to a SQLite
database for later usage.

## Example for .env

```
NUM_THREADS=8
NUM_SCRAPES=100
DATABASE_LOCATION='./data/knack.sqlite'
```

## Run once

```
python main.py
```

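For reference, a minimal sketch of reading the scraped posts back out of the database configured via `DATABASE_LOCATION` above. The `posts` table and the columns used here match what the new `main.py` below writes; the query itself is only illustrative:

```python
import sqlite3

import pandas as pd

# Path must match DATABASE_LOCATION from the .env example above.
con = sqlite3.connect("./data/knack.sqlite")

# "posts" is the table the scraper appends to on each run.
recent = pd.read_sql(
    "SELECT id, title, author, date, url FROM posts ORDER BY id DESC LIMIT 10",
    con,
)
print(recent)
con.close()
```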
1  crontab  Normal file
@@ -0,0 +1 @@
5 4 * * * python /app/main.py
@@ -1,60 +0,0 @@
services:
  scraper:
    build:
      context: ./scrape
      dockerfile: Dockerfile
    image: knack-scraper
    container_name: knack-scraper
    env_file:
      - scrape/.env
    volumes:
      - knack_data:/data
    restart: unless-stopped

  transform:
    build:
      context: ./transform
      dockerfile: Dockerfile
    image: knack-transform
    container_name: knack-transform
    env_file:
      - transform/.env
    volumes:
      - knack_data:/data
      - models:/models
    restart: unless-stopped

  explorer:
    build:
      context: ./explorer
      dockerfile: Dockerfile
    image: knack-explorer
    container_name: knack-explorer
    environment:
      - PORT=4173
      - SQLITE_PATH=/data/knack.sqlite
    volumes:
      - knack_data:/data:ro
    ports:
      - "4173:4173"
    depends_on:
      - transform
    restart: unless-stopped

  sqlitebrowser:
    image: lscr.io/linuxserver/sqlitebrowser:latest
    container_name: sqlitebrowser
    environment:
      - PUID=1000
      - PGID=1000
      - TZ=Etc/UTC
    volumes:
      - knack_data:/data
    ports:
      - "3000:3000" # noVNC web UI
      - "3001:3001" # VNC server
    restart: unless-stopped

volumes:
  knack_data:
  models:
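The explorer service above only needs read access to the shared `knack_data` volume; it receives the database path via `SQLITE_PATH`. As an illustration (the explorer's actual code is not part of this diff), a reader of that volume might open the database in read-only mode like this:

```python
import os
import sqlite3

# SQLITE_PATH is injected via the compose file's environment block.
db_path = os.environ.get("SQLITE_PATH", "/data/knack.sqlite")

# mode=ro mirrors the :ro volume mount, so this process can never write.
con = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
count = con.execute("SELECT COUNT(*) FROM posts").fetchone()[0]
print(f"{count} posts available")
con.close()
```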
167  main.py  Executable file
@@ -0,0 +1,167 @@
#! python3
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.info("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"

    # TODO FIXME: this fails inside the docker container
    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = pC.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category]
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in tqdm.tqdm(
            futures, total=max_id - min_id
        ):  # tqdm to track progress
            post = future.result()
            if post is not None:
                res.append(post)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)

    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        post_table_exists = table_exists("posts", con)

        if post_table_exists:
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
    df.to_sql("posts", con, if_exists="append")


if __name__ == "__main__":
    main()
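The `TODO FIXME` in `download()` refers to `locale.setlocale(locale.LC_TIME, "de_DE")`, which fails when no German locale is generated in the container. The deleted `scrape/main.py` further down in this diff avoids the locale entirely with an explicit month lookup; a locale-independent variant of the date parsing could look like this (sketch only):

```python
from datetime import datetime

# German month names mapped to month numbers, so no de_DE locale is needed.
GERMAN_MONTHS = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4, "Mai": 5, "Juni": 6,
    "Juli": 7, "August": 8, "September": 9, "Oktober": 10,
    "November": 11, "Dezember": 12,
}


def parse_german_date(date_string: str) -> datetime:
    """Parse dates like '5. April 2023' without touching the process locale."""
    day, month_name, year = date_string.split(" ")
    return datetime(int(year), GERMAN_MONTHS[month_name], int(day.rstrip(".")))
```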
14  requirements.txt  Normal file
@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7
@@ -1,29 +0,0 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

#COPY /data/knack.sqlite /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

COPY main.py .

ENV PYTHONUNBUFFERED=1
ENV LANG=de_DE.UTF-8
ENV LC_ALL=de_DE.UTF-8

# Create cron job that runs daily at 04:05
RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
RUN chmod 0644 /etc/cron.d/knack-scraper
RUN crontab /etc/cron.d/knack-scraper

# Start cron in foreground
CMD ["cron", "-f"]
262  scrape/main.py
@@ -1,262 +0,0 @@
#! python3
import logging
import os
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import sys

from dotenv import load_dotenv
import pandas as pd
import requests
from bs4 import BeautifulSoup

load_dotenv()

if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-scraper")


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def split_semicolon_list(value: str):
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
    """Extract unique dimension values and post-to-dimension mappings from a column."""
    if postdf.empty or field_name not in postdf.columns:
        return None, None

    values = set()
    mapping_rows = []

    for post_id, raw in zip(postdf['id'], postdf[field_name]):
        items = split_semicolon_list(raw)
        for item in items:
            values.add(item)
            mapping_rows.append({'post_id': post_id, dim_col: item})

    if not values:
        return None, None

    dim_df = pd.DataFrame({
        'id': range(len(values)),
        dim_col: sorted(values),
    })
    map_df = pd.DataFrame(mapping_rows)
    return dim_df, map_df


def store_dimension_and_mapping(
    con: sqlite3.Connection,
    dim_df: pd.DataFrame | None,
    map_df: pd.DataFrame | None,
    table_name: str,
    dim_col: str,
    mapping_table: str,
    mapping_id_col: str,
):
    """Persist a dimension table and its mapping table, merging with existing values."""
    if dim_df is None or dim_df.empty:
        return

    if table_exists(table_name, con):
        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
        merged = pd.concat([existing, dim_df], ignore_index=True)
        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
        merged['id'] = range(len(merged))
    else:
        merged = dim_df.copy()

    # Replace table with merged content
    merged.to_sql(table_name, con, if_exists="replace", index=False)

    if map_df is None or map_df.empty:
        return

    value_to_id = dict(zip(merged[dim_col], merged['id']))
    map_df = map_df.copy()
    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
    map_df = map_df[['post_id', mapping_id_col]].dropna()
    map_df.to_sql(mapping_table, con, if_exists="append", index=False)


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.debug("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")

    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.debug(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
        day = int(date_parts[0][:-1])
        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
        month = months[date_parts[1]]
        year = int(date_parts[2])
        parsed_date = datetime(year, month, day)
    except Exception:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find_all()
        category = [c.text for c in category if c.text != 'Alle Artikel']
        category = ";".join(category)
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
        tags = ";".join(tags)
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
        "is_cleaned": False
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in futures:
            post = future.result()
            if post is not None:
                res.append(post)

    postdf = pd.DataFrame(res)
    return postdf


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")

    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")

    con = sqlite3.connect(database_location)
    with con:
        if table_exists("posts", con):
            logger.info("found posts retrieved earlier")
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            max_id_in_db = -1

        postdf = run_downloads(
            min_id=max_id_in_db + 1,
            max_id=max_id_in_db + n_scrapes,
            num_threads=num_threads,
        )

        # Drop category and tags columns as they're stored in separate tables
        postdf = postdf.drop(columns=['category', 'tags'])
        postdf.to_sql("posts", con, if_exists="append", index=False)

        # Tags
        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
        store_dimension_and_mapping(
            con,
            tag_dim,
            tag_map,
            table_name="tags",
            dim_col="tag",
            mapping_table="posttags",
            mapping_id_col="tag_id",
        )

        # Categories
        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
        store_dimension_and_mapping(
            con,
            category_dim,
            category_map,
            table_name="categories",
            dim_col="category",
            mapping_table="postcategories",
            mapping_id_col="category_id",
        )

        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")


if __name__ == "__main__":
    main()
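Given the `tags`/`posttags` and `categories`/`postcategories` tables that `store_dimension_and_mapping` maintains above, posts can be joined back to their tags roughly as follows (sketch; table and column names follow the code above):

```python
import sqlite3

import pandas as pd

con = sqlite3.connect("/data/knack.sqlite")

# Resolve each post's tags through the posttags mapping table.
query = """
SELECT p.id, p.title, t.tag
FROM posts p
JOIN posttags pt ON pt.post_id = p.id
JOIN tags t ON t.id = pt.tag_id
ORDER BY p.id
"""
post_tags = pd.read_sql(query, con)
print(post_tags.head())
con.close()
```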
@@ -1,4 +0,0 @@
pandas
requests
bs4
dotenv
@@ -1,4 +0,0 @@
LOGGING_LEVEL=INFO
DB_PATH=/data/knack.sqlite
MAX_CLEANED_POSTS=1000
COMPUTE_DEVICE=mps
@@ -1,50 +0,0 @@
FROM python:3.12-slim

RUN mkdir -p /app /data /models

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    gfortran \
    libopenblas-dev \
    liblapack-dev \
    pkg-config \
    curl \
    && rm -rf /var/lib/apt/lists/*

ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1

ENV GTE_MODEL_ID=thenlper/gte-large
ENV GTE_MODEL_PATH=/models/thenlper/gte-large

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

# Ensure GLiNER helper scripts are available
COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
# Ensure GTE helper scripts are available
COPY ensure_gte_model.sh /usr/local/bin/ensure_gte_model.sh
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_gte_model.sh /usr/local/bin/entrypoint.sh

COPY *.py .

# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
# Testing every 30 Minutes */30 * * * *
RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
RUN chmod 0644 /etc/cron.d/knack-transform
RUN crontab /etc/cron.d/knack-transform

# Persist models between container runs
VOLUME /models

CMD ["/usr/local/bin/entrypoint.sh"]
#CMD ["python", "main.py"]
@@ -1,67 +0,0 @@
# Knack Transform

Data transformation pipeline for the Knack scraper project.

## Overview

This folder contains the transformation logic that processes data from the SQLite database. It runs on a scheduled basis (every weekend) via cron.

The pipeline supports **parallel execution** of independent transform nodes, allowing you to leverage multi-core processors for faster data transformation.

## Structure

- `base.py` - Abstract base class for transform nodes
- `pipeline.py` - Parallel pipeline orchestration system
- `main.py` - Main entry point and pipeline execution
- `author_node.py` - NER-based author classification node
- `example_node.py` - Template for creating new nodes
- `Dockerfile` - Docker image configuration with cron setup
- `requirements.txt` - Python dependencies

## Transform Nodes

Transform nodes inherit from `TransformNode` and implement the `run` method:

```python
from base import TransformNode, TransformContext
import sqlite3

class MyTransform(TransformNode):
    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        df = context.get_dataframe()

        # Transform logic here
        transformed_df = df.copy()
        # ... your transformations ...

        # Optionally write back to database
        transformed_df.to_sql("my_table", con, if_exists="replace", index=False)

        return TransformContext(transformed_df)
```

## Configuration

Copy `.env.example` to `.env` and configure:

- `LOGGING_LEVEL` - Log level (INFO or DEBUG)
- `DB_PATH` - Path to SQLite database

## Running

### With Docker

```bash
docker build -t knack-transform .
docker run -v $(pwd)/data:/data knack-transform
```

### Locally

```bash
python main.py
```

## Cron Schedule

The Docker container runs the transform pipeline every Sunday at 3 AM.
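For orientation, a hypothetical sketch of wiring such a node into the pipeline described above. The `Pipeline` class and its `add_node`/`run` methods are assumptions here; the real orchestration lives in `pipeline.py`, which is not shown in this diff:

```python
import sqlite3

from pipeline import Pipeline          # assumed orchestration class from pipeline.py
from my_transform import MyTransform   # the node sketched in the README above

con = sqlite3.connect("/data/knack.sqlite")

pipeline = Pipeline()             # assumed constructor
pipeline.add_node(MyTransform())  # assumed registration method
context = pipeline.run(con)       # assumed entry point returning a TransformContext
print(context.get_dataframe().head())
```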
@@ -1,303 +0,0 @@
2026-01-18 15:11:40,253 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:11:40,254 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 0 41 Über uns None ... 0 0.0 0.0 0.0
|
||||
1 1 52 Kontakt None ... 0 0.0 0.0 0.0
|
||||
2 2 99 Safety First None ... 0 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... chakalaka_161 ... 0 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... anonym ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... Soli Antifa Ost ... 0 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... anonym ... 0 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... anonym ... 0 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab kreuzer online ... 0 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... Kira Ayyadi ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:11:54,702 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:11:54,703 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 0 41 Über uns ... 0.0 0.0 0.0
|
||||
1 1 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 2 99 Safety First ... 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... ... 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... ... 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... ... 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab ... 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... LVZ ... 0 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... Michael Freitag ... 0 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht Konrad Litschko & Andreas Speit ... 0 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... LeipzigBesetzen ... 0 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... interkiezionale ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... G19 und BikeKitchen Freiburg ... 0 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... MDR ... 0 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... anonyme*r Mensch aus Leipzig ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:15:34,292 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:34,293 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... ... 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... ... 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht ... 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... ... 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... ... 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... ... 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... ... 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... ... 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:15:39,113 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:15:39,113 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:34,425 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:26:34,426 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:26:44,105 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:26:44,105 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - Stored 100 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 5.537961 3.468988 3.757369
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 4.980662 1.629360 3.269084
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 1.055900 2.460792 2.076612
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 4.128685 5.247468 4.904186
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 5.383136 2.068369 4.368077
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 5.897925 5.151130 3.241154
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 2.919075 5.341392 4.516587
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 4.852142 1.179675 4.241960
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 5.231822 4.983705 3.941314
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.999596 1.613693 2.039646
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:28:21,676 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:28:43,419 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:28:43,420 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:30:35,756 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:35,757 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:30:42,373 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:30:42,416 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:30:42,418 - knack-transform - INFO - Found 3678 valid embeddings out of 3678 rows
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Embeddings matrix shape: (3678, 192)
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:30:53,542 - knack-transform - INFO - UMAP transformation complete. Output shape: (3678, 3)
|
||||
2026-01-18 15:30:53,542 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:31:00,254 - knack-transform - INFO - Stored 3678 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:27,488 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:36:25,468 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:37:37,881 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:38:08,872 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:39:23,498 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:39:52,241 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:41:23,688 - knack-transform - INFO - 3D plot displayed
|
||||
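The repeated `Failed to save UMAP model to None` errors in the log above come from running the UMAP node with `model_path=None` and still attempting to persist the fitted reducer. A defensive guard along these lines would avoid the error (sketch only; the node implementation itself is not part of this diff):

```python
import logging
import pickle

logger = logging.getLogger("knack-transform")


def save_reducer(reducer, model_path: str | None) -> None:
    """Persist a fitted UMAP reducer only when a target path is configured."""
    if not model_path:
        logger.info("No model_path configured; skipping UMAP model persistence")
        return
    with open(model_path, "wb") as fh:
        pickle.dump(reducer, fh)
    logger.info("Saved UMAP model to %s", model_path)
```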
@@ -1,469 +0,0 @@
"""Author classification transform node using NER."""
|
||||
import os
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import fuzzysearch
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from gliner import GLiNER
|
||||
import torch
|
||||
GLINER_AVAILABLE = True
|
||||
except ImportError:
|
||||
GLINER_AVAILABLE = False
|
||||
logging.warning("GLiNER not available. Install with: pip install gliner")
|
||||
|
||||
class NerAuthorNode(TransformNode):
|
||||
"""Transform node that extracts and classifies authors using NER.
|
||||
|
||||
Creates two tables:
|
||||
- authors: stores unique authors with their type (Person, Organisation, etc.)
|
||||
- post_authors: maps posts to their authors
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str = "urchade/gliner_multi-v2.1",
|
||||
model_path: str = None,
|
||||
threshold: float = 0.5,
|
||||
max_workers: int = 64,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the AuthorNode.
|
||||
|
||||
Args:
|
||||
model_name: GLiNER model to use
|
||||
model_path: Optional local path to a downloaded GLiNER model
|
||||
threshold: Confidence threshold for entity predictions
|
||||
max_workers: Number of parallel workers for prediction
|
||||
device: Device to run model on ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.model_path = model_path or os.environ.get('GLINER_MODEL_PATH')
|
||||
self.threshold = threshold
|
||||
self.max_workers = max_workers
|
||||
self.device = device
|
||||
self.model = None
|
||||
self.labels = ["Person", "Organisation", "Email", "Newspaper", "NGO"]
|
||||
|
||||
def _setup_model(self):
|
||||
"""Initialize the NER model."""
|
||||
if not GLINER_AVAILABLE:
|
||||
raise ImportError("GLiNER is required for AuthorNode. Install with: pip install gliner")
|
||||
|
||||
model_source = None
|
||||
if self.model_path:
|
||||
if os.path.exists(self.model_path):
|
||||
model_source = self.model_path
|
||||
logger.info(f"Loading GLiNER model from local path: {self.model_path}")
|
||||
else:
|
||||
logger.warning(f"GLINER_MODEL_PATH '{self.model_path}' not found; falling back to hub model {self.model_name}")
|
||||
|
||||
if model_source is None:
|
||||
model_source = self.model_name
|
||||
logger.info(f"Loading GLiNER model from hub: {self.model_name}")
|
||||
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
).to('cuda', dtype=torch.float16)
|
||||
elif self.device == "mps" and torch.backends.mps.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
).to('mps', dtype=torch.float16)
|
||||
else:
|
||||
self.model = GLiNER.from_pretrained(
|
||||
model_source,
|
||||
max_length=255
|
||||
)
|
||||
|
||||
logger.info("Model loaded successfully")
|
||||
|
||||
def _predict(self, text_data: dict):
|
||||
"""Predict entities for a single author text.
|
||||
|
||||
Args:
|
||||
text_data: Dict with 'author' and 'id' keys
|
||||
|
||||
Returns:
|
||||
Tuple of (predictions, post_id) or None
|
||||
"""
|
||||
if text_data is None or text_data.get('author') is None:
|
||||
return None
|
||||
|
||||
predictions = self.model.predict_entities(
|
||||
text_data['author'],
|
||||
self.labels,
|
||||
threshold=self.threshold
|
||||
)
|
||||
return predictions, text_data['id']
|
||||
|
||||
def _classify_authors(self, posts_df: pd.DataFrame):
|
||||
"""Classify all authors in the posts dataframe.
|
||||
|
||||
Args:
|
||||
posts_df: DataFrame with 'id' and 'author' columns
|
||||
|
||||
Returns:
|
||||
List of dicts with 'text', 'label', 'id' keys
|
||||
"""
|
||||
if self.model is None:
|
||||
self._setup_model()
|
||||
|
||||
# Prepare input data
|
||||
authors_data = []
|
||||
for idx, row in posts_df.iterrows():
|
||||
if pd.notna(row['author']):
|
||||
authors_data.append({
|
||||
'author': row['author'],
|
||||
'id': row['id']
|
||||
})
|
||||
|
||||
logger.info(f"Classifying {len(authors_data)} authors")
|
||||
|
||||
results = []
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
futures = [executor.submit(self._predict, data) for data in authors_data]
|
||||
|
||||
for future in futures:
|
||||
result = future.result()
|
||||
if result is not None:
|
||||
predictions, post_id = result
|
||||
for pred in predictions:
|
||||
results.append({
|
||||
'text': pred['text'],
|
||||
'label': pred['label'],
|
||||
'id': post_id
|
||||
})
|
||||
|
||||
logger.info(f"Classification complete. Found {len(results)} author entities")
|
||||
return results
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create authors and post_authors tables if they don't exist."""
|
||||
logger.info("Creating authors tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS authors (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT,
|
||||
type TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS post_authors (
|
||||
post_id INTEGER,
|
||||
author_id INTEGER,
|
||||
PRIMARY KEY (post_id, author_id),
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id),
|
||||
FOREIGN KEY (author_id) REFERENCES authors(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _store_authors(self, con: sqlite3.Connection, results: list):
|
||||
"""Store classified authors and their mappings.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
results: List of classification results
|
||||
"""
|
||||
if not results:
|
||||
logger.info("No authors to store")
|
||||
return
|
||||
|
||||
# Convert results to DataFrame
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
# Get unique authors with their types
|
||||
unique_authors = results_df[['text', 'label']].drop_duplicates()
|
||||
unique_authors.columns = ['name', 'type']
|
||||
|
||||
# Get existing authors
|
||||
existing_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
|
||||
# Find new authors to insert
|
||||
if not existing_authors.empty:
|
||||
new_authors = unique_authors[~unique_authors['name'].isin(existing_authors['name'])]
|
||||
else:
|
||||
new_authors = unique_authors
|
||||
|
||||
if not new_authors.empty:
|
||||
logger.info(f"Inserting {len(new_authors)} new authors")
|
||||
new_authors.to_sql('authors', con, if_exists='append', index=False)
|
||||
|
||||
# Get all authors with their IDs
|
||||
all_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
name_to_id = dict(zip(all_authors['name'], all_authors['id']))
|
||||
|
||||
# Create post_authors mappings
|
||||
mappings = []
|
||||
for _, row in results_df.iterrows():
|
||||
author_id = name_to_id.get(row['text'])
|
||||
if author_id:
|
||||
mappings.append({
|
||||
'post_id': row['id'],
|
||||
'author_id': author_id
|
||||
})
|
||||
|
||||
if mappings:
|
||||
mappings_df = pd.DataFrame(mappings).drop_duplicates()
|
||||
|
||||
# Clear existing mappings for these posts (optional, depends on your strategy)
|
||||
# post_ids = tuple(mappings_df['post_id'].unique())
|
||||
# con.execute(f"DELETE FROM post_authors WHERE post_id IN ({','.join('?' * len(post_ids))})", post_ids)
|
||||
|
||||
logger.info(f"Creating {len(mappings_df)} post-author mappings")
|
||||
mappings_df.to_sql('post_authors', con, if_exists='append', index=False)
|
||||
|
||||
con.commit()
|
||||
logger.info("Authors and mappings stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the author classification transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing posts dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with classified authors dataframe
|
||||
"""
|
||||
logger.info("Starting AuthorNode transformation")
|
||||
|
||||
posts_df = context.get_dataframe()
|
||||
|
||||
# Ensure required columns exist
|
||||
if 'author' not in posts_df.columns:
|
||||
logger.warning("No 'author' column in dataframe. Skipping AuthorNode.")
|
||||
return context
|
||||
|
||||
# Create tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Classify authors
|
||||
results = self._classify_authors(posts_df)
|
||||
|
||||
# Store results
|
||||
self._store_authors(con, results)
|
||||
|
||||
# Return context with results
|
||||
logger.info("AuthorNode transformation complete")
|
||||
|
||||
return TransformContext(posts_df)
|
||||
|
||||
|
||||
class FuzzyAuthorNode(TransformNode):
|
||||
"""FuzzyAuthorNode
|
||||
|
||||
This Node takes in data and rules of authornames that have been classified already
|
||||
and uses those 'rule' to find more similar fields.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_l_dist: int = 1,):
|
||||
"""Initialize FuzzyAuthorNode.
|
||||
|
||||
Args:
|
||||
max_l_dist: The number of 'errors' that are allowed by the fuzzy search algorithm
|
||||
"""
|
||||
self.max_l_dist = max_l_dist
|
||||
logger.info(f"Initialized FuzzyAuthorNode with max_l_dist={max_l_dist}")
|
||||
|
||||
def _process_data(self, con: sqlite3.Connection, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Retrieve all known authors from the authors table as 'rules'
|
||||
authors_df = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
|
||||
if authors_df.empty:
|
||||
logger.warning("No authors found in database for fuzzy matching")
|
||||
return pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
# Get existing post-author mappings to avoid duplicates
|
||||
existing_mappings = pd.read_sql(
|
||||
"SELECT post_id, author_id FROM post_authors", con
|
||||
)
|
||||
existing_post_ids = set(existing_mappings['post_id'].unique())
|
||||
|
||||
logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
|
||||
logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
|
||||
|
||||
# Filter to posts without author mappings and with non-null author field
|
||||
if 'author' not in df.columns or 'id' not in df.columns:
|
||||
logger.warning("Missing 'author' or 'id' column in input dataframe")
|
||||
return pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
posts_to_process = df[
|
||||
(df['id'].notna()) &
|
||||
(df['author'].notna()) &
|
||||
(~df['id'].isin(existing_post_ids))
|
||||
]
|
||||
|
||||
logger.info(f"Processing {len(posts_to_process)} posts for fuzzy matching")
|
||||
|
||||
# Perform fuzzy matching
|
||||
mappings = []
|
||||
for _, post_row in posts_to_process.iterrows():
|
||||
post_id = post_row['id']
|
||||
post_author = str(post_row['author'])
|
||||
|
||||
# Try to find matches against all known author names
|
||||
for _, author_row in authors_df.iterrows():
|
||||
author_id = author_row['id']
|
||||
author_name = str(author_row['name'])
|
||||
# For author names of two characters or fewer, require an exact match (no fuzzy tolerance).
l_dist = self.max_l_dist if len(author_name) > 2 else 0
|
||||
|
||||
# Use fuzzysearch to find matches with allowed errors
|
||||
matches = fuzzysearch.find_near_matches(
|
||||
author_name,
|
||||
post_author,
|
||||
max_l_dist=l_dist,
|
||||
)
|
||||
|
||||
if matches:
|
||||
logger.debug(f"Found fuzzy match: '{author_name}' in '{post_author}' for post {post_id}")
|
||||
mappings.append({
|
||||
'post_id': post_id,
|
||||
'author_id': author_id
|
||||
})
|
||||
# Only take the first match per post to avoid multiple mappings
|
||||
break
|
||||
|
||||
# Create result dataframe
|
||||
result_df = pd.DataFrame(mappings, columns=['post_id', 'author_id']) if mappings else pd.DataFrame(columns=['post_id', 'author_id'])
|
||||
|
||||
logger.info(f"Processing complete. Found {len(result_df)} fuzzy matches")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
Uses INSERT OR IGNORE to avoid inserting duplicates.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Use INSERT OR IGNORE to handle duplicates (respects PRIMARY KEY constraint)
|
||||
cursor = con.cursor()
|
||||
inserted_count = 0
|
||||
|
||||
for _, row in df.iterrows():
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO post_authors (post_id, author_id) VALUES (?, ?)",
|
||||
(int(row['post_id']), int(row['author_id']))
|
||||
)
|
||||
if cursor.rowcount > 0:
|
||||
inserted_count += 1
|
||||
|
||||
con.commit()
|
||||
logger.info(f"Results stored successfully. Inserted {inserted_count} new mappings, skipped {len(df) - inserted_count} duplicates")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting FuzzyAuthorNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to FuzzyAuthorNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(con, input_df)
|
||||
|
||||
# Store results
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("FuzzyAuthorNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(input_df)
|
||||
|
||||
|
||||
def main():
|
||||
import sys
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
# Connect to database
|
||||
db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
try:
|
||||
# Read posts from database
|
||||
df = pd.read_sql('SELECT * FROM posts;', con)
|
||||
logger.info(f"Loaded {len(df)} posts from database")
|
||||
|
||||
# Create context
|
||||
context = TransformContext(df)
|
||||
|
||||
# Run NerAuthorNode
|
||||
logger.info("Running NerAuthorNode...")
|
||||
ner_node = NerAuthorNode(device="mps")
|
||||
context = ner_node.run(con, context)
|
||||
logger.info("NerAuthorNode complete")
|
||||
|
||||
# Run FuzzyAuthorNode
|
||||
logger.info("Running FuzzyAuthorNode...")
|
||||
fuzzy_node = FuzzyAuthorNode(max_l_dist=1)
|
||||
context = fuzzy_node.run(con, context)
|
||||
logger.info("FuzzyAuthorNode complete")
|
||||
|
||||
logger.info("All author nodes completed successfully!")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during transformation: {e}", exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
con.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,545 +0,0 @@
|
|||
"""Classes of Transformernodes that have to do with
|
||||
text processing.
|
||||
|
||||
- TextEmbeddingNode calculates text embeddings
|
||||
- UmapNode calculates xy coordinates on those vector embeddings
|
||||
- SimilarityNode calculates top n similar posts based on those embeddings
|
||||
using the spectral distance.
|
||||
"""
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
import pickle
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import torch
|
||||
GTE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GTE_AVAILABLE = False
|
||||
logging.warning("GTE not available. Install with pip!")
|
||||
|
||||
try:
|
||||
import umap
|
||||
UMAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
UMAP_AVAILABLE = False
|
||||
logging.warning("UMAP not available. Install with pip install umap-learn!")
|
||||
|
||||
class TextEmbeddingNode(TransformNode):
|
||||
"""Calculates vector embeddings based on a dataframe
|
||||
of posts.
|
||||
"""
|
||||
def __init__(self,
|
||||
model_name: str = "thenlper/gte-large",
|
||||
model_path: str = None,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
model_name: Name of the ML Model to calculate text embeddings
|
||||
model_path: Optional local path to a downloaded embedding model
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
|
||||
self.device = device
|
||||
self.model = None
|
||||
logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
|
||||
|
||||
def _setup_model(self):
|
||||
"""Init the Text Embedding Model."""
|
||||
if not GTE_AVAILABLE:
|
||||
raise ImportError("GTE is required for TextEmbeddingNode. Please install.")
|
||||
|
||||
model_source = None
|
||||
if self.model_path:
|
||||
if os.path.exists(self.model_path):
|
||||
# Check if it's a valid model directory
|
||||
if os.path.exists(os.path.join(self.model_path, 'config.json')):
|
||||
model_source = self.model_path
|
||||
logger.info(f"Loading GTE model from local path: {self.model_path}")
|
||||
else:
|
||||
logger.warning(f"GTE_MODEL_PATH '{self.model_path}' found but missing config.json; Falling back to hub model {self.model_name}")
|
||||
else:
|
||||
logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
|
||||
|
||||
if model_source is None:
|
||||
model_source = self.model_name
|
||||
logger.info(f"Loading GTE model from the hub: {self.model_name}")
|
||||
|
||||
try:
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
|
||||
elif self.device == "mps" and torch.backends.mps.is_available():
|
||||
self.model = SentenceTransformer(model_source).to('mps', dtype=torch.float16)
|
||||
else:
|
||||
self.model = SentenceTransformer(model_source)
|
||||
logger.info(f"Successfully loaded GTE model from: {model_source}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load GTE model from {model_source}: {e}")
|
||||
raise
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
Calculates an embedding as a np.array.
|
||||
Also pickles that array to prepare it to
|
||||
storage in the database.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
if self.model is None:
|
||||
self._setup_model()
|
||||
|
||||
# Compute an embedding vector for each post's text
|
||||
result_df = df.copy()
|
||||
|
||||
result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database using batch updates."""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Convert numpy arrays to float32 bytes for BLOB storage
# (UmapNode reads them back with np.frombuffer(..., dtype=np.float32))
updates = [(row['embedding'].astype(np.float32).tobytes(), row['id']) for _, row in df.iterrows()]
|
||||
con.executemany(
|
||||
"UPDATE posts SET embedding = ? WHERE id = ?",
|
||||
updates
|
||||
)
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting TextEmbeddingNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to TextEmbeddingNdode")
|
||||
return context
|
||||
|
||||
if 'text' not in input_df.columns:
|
||||
logger.warning("No 'text' column in context dataframe. Skipping TextEmbeddingNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("TextEmbeddingNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
class UmapNode(TransformNode):
|
||||
"""Calculates 2D coordinates from embeddings using UMAP dimensionality reduction.
|
||||
|
||||
This node takes text embeddings and reduces them to 2D coordinates
|
||||
for visualization purposes.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_neighbors: int = 10,
|
||||
min_dist: float = 0.1,
|
||||
n_components: int = 3,
|
||||
metric: str = "cosine",
|
||||
random_state: int = 42,
|
||||
model_path: str = None):
|
||||
"""Initialize the UmapNode.
|
||||
|
||||
Args:
|
||||
n_neighbors: Number of neighbors to consider for UMAP (default: 10)
min_dist: Minimum distance between points in low-dimensional space (default: 0.1)
n_components: Number of dimensions to reduce to (default: 3)
metric: Distance metric to use (default: 'cosine')
random_state: Random seed for reproducibility (default: 42)
model_path: Path to save/load the fitted UMAP model (default: None, falls back to the UMAP_MODEL_PATH environment variable)
|
||||
"""
|
||||
self.n_neighbors = n_neighbors
|
||||
self.min_dist = min_dist
|
||||
self.n_components = n_components
|
||||
self.metric = metric
|
||||
self.random_state = random_state
|
||||
self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
|
||||
self.reducer = None
|
||||
logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
|
||||
f"n_components={n_components}, metric={metric}, random_state={random_state}, "
|
||||
f"model_path={self.model_path}")
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
Retrieves embeddings from BLOB storage, converts them back to numpy arrays,
|
||||
and applies UMAP dimensionality reduction to create 3D coordinates.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe with umap_x, umap_y and umap_z columns
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
if not UMAP_AVAILABLE:
|
||||
raise ImportError("UMAP is required for UmapNode. Install with: pip install umap-learn")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
# Convert BLOB embeddings back to numpy arrays
|
||||
if 'embedding' not in result_df.columns:
|
||||
logger.error("No 'embedding' column found in dataframe")
|
||||
raise ValueError("Input dataframe must contain 'embedding' column")
|
||||
|
||||
logger.info("Converting embeddings from BLOB to numpy arrays")
|
||||
result_df['embedding'] = result_df['embedding'].apply(
|
||||
lambda x: np.frombuffer(x, dtype=np.float32) if x is not None else None
|
||||
)
|
||||
|
||||
# Filter out rows with None embeddings
|
||||
valid_rows = result_df['embedding'].notna()
|
||||
if not valid_rows.any():
|
||||
logger.error("No valid embeddings found in dataframe")
|
||||
raise ValueError("No valid embeddings to process")
|
||||
|
||||
logger.info(f"Found {valid_rows.sum()} valid embeddings out of {len(result_df)} rows")
|
||||
|
||||
# Stack embeddings into a matrix
|
||||
embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
|
||||
logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
|
||||
|
||||
# Check if a saved UMAP model exists
|
||||
if self.model_path and os.path.exists(self.model_path):
|
||||
logger.info(f"Loading existing UMAP model from {self.model_path}")
|
||||
try:
|
||||
with open(self.model_path, 'rb') as f:
|
||||
self.reducer = pickle.load(f)
|
||||
logger.info("UMAP model loaded successfully")
|
||||
umap_coords = self.reducer.transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
|
||||
logger.info("Falling back to fitting a new model")
|
||||
self.reducer = None
|
||||
|
||||
# If no saved model or loading failed, fit a new model
|
||||
if self.reducer is None:
|
||||
logger.info("Fitting new UMAP reducer...")
|
||||
self.reducer = umap.UMAP(
|
||||
n_neighbors=self.n_neighbors,
|
||||
min_dist=self.min_dist,
|
||||
n_components=self.n_components,
|
||||
metric=self.metric,
|
||||
random_state=self.random_state
|
||||
)
|
||||
|
||||
umap_coords = self.reducer.fit_transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
|
||||
|
||||
# Save the fitted model
|
||||
try:
|
||||
umap_folder = os.path.dirname(self.model_path)
if umap_folder:
    os.makedirs(umap_folder, exist_ok=True)
|
||||
with open(self.model_path, 'wb') as f:
|
||||
pickle.dump(self.reducer, f)
|
||||
logger.info(f"UMAP model saved to {self.model_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
|
||||
|
||||
# Add UMAP coordinates to dataframe
|
||||
result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
|
||||
result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
|
||||
result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]
|
||||
|
||||
# Fill NaN for invalid rows
|
||||
result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
|
||||
result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
|
||||
result_df['umap_z'] = result_df['umap_z'].fillna(value=0)
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store UMAP coordinates back to the database.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe with umap_x, umap_y and umap_z columns
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Batch update UMAP coordinates
|
||||
updates = [
|
||||
(row['umap_x'], row['umap_y'], row['umap_z'], row['id'])
|
||||
for _, row in df.iterrows()
|
||||
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
|
||||
]
|
||||
|
||||
if updates:
|
||||
con.executemany(
|
||||
"UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
|
||||
updates
|
||||
)
|
||||
con.commit()
|
||||
logger.info(f"Stored {len(updates)} UMAP coordinate pairs successfully")
|
||||
else:
|
||||
logger.warning("No valid UMAP coordinates to store")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
class SimilarityNode(TransformNode):
|
||||
"""Example transform node template.
|
||||
|
||||
This node demonstrates the basic structure for creating
|
||||
new transformation nodes in the pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
param1: str = "default_value",
|
||||
param2: int = 42,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
param1: Example string parameter
|
||||
param2: Example integer parameter
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.param1 = param1
|
||||
self.param2 = param2
|
||||
self.device = device
|
||||
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create any necessary tables in the database.
|
||||
|
||||
This is optional - only needed if your node creates new tables.
|
||||
"""
|
||||
logger.info("Creating example tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS example_results (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
result_value TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Example: Add a new column based on existing data
|
||||
result_df = df.copy()
|
||||
result_df['processed'] = True
|
||||
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
This is optional - only needed if you want to persist results.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Example: Store to database
|
||||
# df[['post_id', 'result_value']].to_sql(
|
||||
# 'example_results',
|
||||
# con,
|
||||
# if_exists='append',
|
||||
# index=False
|
||||
# )
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Create any necessary tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
def main():
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler("app.log"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
con = sqlite3.connect("/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite")
|
||||
df = pd.read_sql('select * from posts;', con)
|
||||
#node = TextEmbeddingNode(device='mps')
|
||||
#context = TransformContext(df)
|
||||
|
||||
logger.info(df)
|
||||
#new_context = node.run(con, context)
|
||||
#logger.info(new_context.get_dataframe())
|
||||
|
||||
#umapNode = UmapNode()
|
||||
#new_context = umapNode.run(con, new_context)
|
||||
|
||||
#logger.info(new_context.get_dataframe())
|
||||
|
||||
# Create 3D scatter plot of UMAP coordinates
|
||||
result_df = df
|
||||
|
||||
fig = plt.figure(figsize=(12, 9))
|
||||
ax = fig.add_subplot(111, projection='3d')
|
||||
|
||||
scatter = ax.scatter(
|
||||
result_df['umap_x'],
|
||||
result_df['umap_y'],
|
||||
result_df['umap_z'],
|
||||
c=result_df['id'],
|
||||
cmap='viridis',
|
||||
alpha=0.6,
|
||||
s=50
|
||||
)
|
||||
|
||||
ax.set_xlabel('UMAP X')
|
||||
ax.set_ylabel('UMAP Y')
|
||||
ax.set_zlabel('UMAP Z')
|
||||
ax.set_title('3D UMAP Visualization of Post Embeddings')
|
||||
|
||||
plt.colorbar(scatter, ax=ax, label='Post Index')
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
logger.info("3D plot displayed")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [ -d "$GLINER_MODEL_PATH" ] && [ -f "$GLINER_MODEL_PATH/config.json" ]; then
|
||||
echo "GLiNER model already present at $GLINER_MODEL_PATH"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Downloading GLiNER model $GLINER_MODEL_ID to $GLINER_MODEL_PATH"
|
||||
mkdir -p "$GLINER_MODEL_PATH"
|
||||
|
||||
# Use Python with huggingface_hub for reliable model downloading
|
||||
python3 << 'EOF'
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
model_id = os.environ.get('GLINER_MODEL_ID')
|
||||
model_path = os.environ.get('GLINER_MODEL_PATH')
|
||||
|
||||
if not model_id or not model_path:
|
||||
raise ValueError(f"GLINER_MODEL_ID and GLINER_MODEL_PATH environment variables must be set")
|
||||
|
||||
try:
|
||||
print(f"Downloading model {model_id} to {model_path}")
|
||||
snapshot_download(
|
||||
repo_id=model_id,
|
||||
cache_dir=None, # Don't use cache, download directly
|
||||
local_dir=model_path,
|
||||
local_dir_use_symlinks=False # Don't use symlinks, copy files
|
||||
)
|
||||
print(f"Successfully downloaded GLiNER model to {model_path}")
|
||||
except Exception as e:
|
||||
print(f"Error downloading GLiNER model: {e}")
|
||||
exit(1)
|
||||
EOF
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [ -d "$GTE_MODEL_PATH" ] && [ -f "$GTE_MODEL_PATH/config.json" ]; then
|
||||
echo "GTE model already present at $GTE_MODEL_PATH"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Downloading GTE model $GTE_MODEL_ID to $GTE_MODEL_PATH"
|
||||
mkdir -p "$GTE_MODEL_PATH"
|
||||
|
||||
# Use Python with huggingface_hub for reliable model downloading
|
||||
python3 << 'EOF'
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
model_id = os.environ.get('GTE_MODEL_ID')
|
||||
model_path = os.environ.get('GTE_MODEL_PATH')
|
||||
|
||||
if not model_id or not model_path:
|
||||
raise ValueError(f"GTE_MODEL_ID and GTE_MODEL_PATH environment variables must be set")
|
||||
|
||||
try:
|
||||
print(f"Downloading model {model_id} to {model_path}")
|
||||
snapshot_download(
|
||||
repo_id=model_id,
|
||||
cache_dir=None, # Don't use cache, download directly
|
||||
local_dir=model_path,
|
||||
local_dir_use_symlinks=False # Don't use symlinks, copy files
|
||||
)
|
||||
print(f"Successfully downloaded GTE model to {model_path}")
|
||||
except Exception as e:
|
||||
print(f"Error downloading GTE model: {e}")
|
||||
exit(1)
|
||||
EOF
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Run model download with output to stdout/stderr
|
||||
/usr/local/bin/ensure_gte_model.sh 2>&1
|
||||
/usr/local/bin/ensure_gliner_model.sh 2>&1
|
||||
|
||||
# Start cron in foreground with logging
|
||||
exec cron -f -L 2
|
||||
# cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
"""Example template node for the transform pipeline.
|
||||
|
||||
This is a template showing how to create new transform nodes.
|
||||
Copy this file and modify it for your specific transformation needs.
|
||||
"""
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class ExampleNode(TransformNode):
|
||||
"""Example transform node template.
|
||||
|
||||
This node demonstrates the basic structure for creating
|
||||
new transformation nodes in the pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
param1: str = "default_value",
|
||||
param2: int = 42,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
||||
Args:
|
||||
param1: Example string parameter
|
||||
param2: Example integer parameter
|
||||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.param1 = param1
|
||||
self.param2 = param2
|
||||
self.device = device
|
||||
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create any necessary tables in the database.
|
||||
|
||||
This is optional - only needed if your node creates new tables.
|
||||
"""
|
||||
logger.info("Creating example tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS example_results (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
result_value TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
||||
This is where your main transformation logic goes.
|
||||
|
||||
Args:
|
||||
df: Input dataframe from context
|
||||
|
||||
Returns:
|
||||
Processed dataframe
|
||||
"""
|
||||
logger.info(f"Processing {len(df)} rows")
|
||||
|
||||
# Example: Add a new column based on existing data
|
||||
result_df = df.copy()
|
||||
result_df['processed'] = True
|
||||
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||
"""Store results back to the database.
|
||||
|
||||
This is optional - only needed if you want to persist results.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
df: Processed dataframe to store
|
||||
"""
|
||||
if df.empty:
|
||||
logger.info("No results to store")
|
||||
return
|
||||
|
||||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Example: Store to database
|
||||
# df[['post_id', 'result_value']].to_sql(
|
||||
# 'example_results',
|
||||
# con,
|
||||
# if_exists='append',
|
||||
# index=False
|
||||
# )
|
||||
|
||||
con.commit()
|
||||
logger.info("Results stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
This is the main entry point called by the pipeline.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe
|
||||
"""
|
||||
logger.info("Starting ExampleNode transformation")
|
||||
|
||||
# Get input dataframe from context
|
||||
input_df = context.get_dataframe()
|
||||
|
||||
# Validate input
|
||||
if input_df.empty:
|
||||
logger.warning("Empty dataframe provided to ExampleNode")
|
||||
return context
|
||||
|
||||
# Create any necessary tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Process the data
|
||||
result_df = self._process_data(input_df)
|
||||
|
||||
# Store results (optional)
|
||||
self._store_results(con, result_df)
|
||||
|
||||
logger.info("ExampleNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
# This allows you to test your node independently
|
||||
import os
|
||||
os.chdir('/Users/linussilberstein/Documents/Knack-Scraper/transform')
|
||||
|
||||
from pipeline import TransformContext
|
||||
import sqlite3
|
||||
|
||||
# Create test data
|
||||
test_df = pd.DataFrame({
|
||||
'id': [1, 2, 3],
|
||||
'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
|
||||
})
|
||||
|
||||
# Create test database connection
|
||||
test_con = sqlite3.connect(':memory:')
|
||||
|
||||
# Create and run node
|
||||
node = ExampleNode(param1="test", param2=100)
|
||||
context = TransformContext(test_df)
|
||||
result_context = node.run(test_con, context)
|
||||
|
||||
# Check results
|
||||
result_df = result_context.get_dataframe()
|
||||
print("\nResult DataFrame:")
|
||||
print(result_df)
|
||||
|
||||
test_con.close()
|
||||
print("\n✓ ExampleNode test completed successfully!")
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
#! python3
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Default to INFO; any other LOGGING_LEVEL value (e.g. 'DEBUG') enables debug logging.
if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging_level,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler("app.log"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
def setup_database_connection(db_path=None):
|
||||
"""Create connection to the SQLite database."""
|
||||
if db_path is None:
|
||||
db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
|
||||
logger.info(f"Connecting to database: {db_path}")
|
||||
return sqlite3.connect(db_path)
|
||||
|
||||
|
||||
def table_exists(tablename: str, con: sqlite3.Connection):
|
||||
"""Check if a table exists in the database."""
|
||||
query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
|
||||
return len(con.execute(query, [tablename]).fetchall()) > 0
|
||||
|
||||
def run_from_database(db_path=None):
|
||||
"""Run the pipeline using database as input and output."""
|
||||
logger.info("Starting transform pipeline (database mode)")
|
||||
|
||||
try:
|
||||
con = setup_database_connection(db_path)
|
||||
logger.info("Database connection established")
|
||||
|
||||
# Check if posts table exists
|
||||
if not table_exists('posts', con):
|
||||
logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
|
||||
logger.info("Transform pipeline skipped - no data available")
|
||||
return
|
||||
|
||||
# Import transform components
|
||||
from pipeline import create_default_pipeline, TransformContext
|
||||
import pandas as pd
|
||||
|
||||
# Load posts data
|
||||
logger.info("Loading posts from database")
|
||||
sql = "SELECT * FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0)"
|
||||
# MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 100)
|
||||
df = pd.read_sql(sql, con)
|
||||
logger.info(f"Loaded {len(df)} uncleaned posts with authors")
|
||||
|
||||
if df.empty:
|
||||
logger.info("No uncleaned posts found. Transform pipeline skipped.")
|
||||
return
|
||||
|
||||
# Create initial context
|
||||
context = TransformContext(df)
|
||||
|
||||
# Create and run parallel pipeline
|
||||
device = os.environ.get('COMPUTE_DEVICE', 'cpu')
|
||||
max_workers = int(os.environ.get('MAX_WORKERS', 4))
|
||||
|
||||
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
|
||||
effective_db_path = db_path or os.environ.get('DB_PATH', '/data/knack.sqlite')
|
||||
results = pipeline.run(
|
||||
db_path=effective_db_path,
|
||||
initial_context=context,
|
||||
fail_fast=False # Continue even if some nodes fail
|
||||
)
|
||||
|
||||
logger.info(f"Pipeline completed. Processed {len(results)} node(s)")
|
||||
|
||||
# Mark all processed posts as cleaned
|
||||
post_ids = df['id'].tolist()
|
||||
if post_ids:
|
||||
placeholders = ','.join('?' * len(post_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
|
||||
con.commit()
|
||||
logger.info(f"Marked {len(post_ids)} posts as cleaned")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transform pipeline: {e}", exc_info=True)
|
||||
sys.exit(1)
|
||||
finally:
|
||||
if 'con' in locals():
|
||||
con.close()
|
||||
logger.info("Database connection closed")
|
||||
|
||||
def main():
|
||||
"""Main entry point with command-line argument support."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Transform pipeline for Knack scraper data',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run with database (Docker mode)
|
||||
python main.py
|
||||
|
||||
# Run with custom device and workers
|
||||
python main.py --database /path/to/knack.sqlite --device mps --workers 8
|
||||
|
||||
# Run with specific database file
|
||||
python main.py --database /path/to/knack.sqlite
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--database',
|
||||
help='Path to SQLite database (for database mode). Defaults to DB_PATH env var or /data/knack.sqlite'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
default=os.environ.get('COMPUTE_DEVICE', 'cpu'),
|
||||
choices=['cpu', 'cuda', 'mps'],
|
||||
help='Device to use for compute-intensive operations (default: cpu)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--workers',
|
||||
type=int,
|
||||
default=int(os.environ.get('MAX_WORKERS', 4)),
|
||||
help='Maximum number of parallel workers (default: 4)'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run the pipeline; --database overrides the DB_PATH env var / default path.
# NOTE: --device and --workers are currently picked up via the COMPUTE_DEVICE and
# MAX_WORKERS environment variables inside run_from_database(), not passed through.
run_from_database(db_path=args.database)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,289 +0,0 @@
|
|||
"""Parallel pipeline orchestration for transform nodes."""
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
import multiprocessing as mp
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class TransformContext:
|
||||
"""Context object containing the dataframe for transformation."""
|
||||
# Possibly add a dict to the context to carry additional information
|
||||
|
||||
def __init__(self, df: pd.DataFrame):
|
||||
self.df = df
|
||||
|
||||
def get_dataframe(self) -> pd.DataFrame:
|
||||
"""Get the pandas dataframe from the context."""
|
||||
return self.df
|
||||
|
||||
class NodeConfig:
|
||||
"""Configuration for a transform node."""
|
||||
|
||||
def __init__(self,
|
||||
node_class: type,
|
||||
node_kwargs: Dict = None,
|
||||
dependencies: List[str] = None,
|
||||
name: str = None):
|
||||
"""Initialize node configuration.
|
||||
|
||||
Args:
|
||||
node_class: The TransformNode class to instantiate
|
||||
node_kwargs: Keyword arguments to pass to node constructor
|
||||
dependencies: List of node names that must complete before this one
|
||||
name: Optional name for the node (defaults to class name)
|
||||
"""
|
||||
self.node_class = node_class
|
||||
self.node_kwargs = node_kwargs or {}
|
||||
self.dependencies = dependencies or []
|
||||
self.name = name or node_class.__name__
|
||||
|
||||
class ParallelPipeline:
|
||||
"""Pipeline for executing transform nodes in parallel where possible.
|
||||
|
||||
The pipeline analyzes dependencies between nodes and executes
|
||||
independent nodes concurrently using multiprocessing or threading.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_workers: Optional[int] = None,
|
||||
use_processes: bool = False):
|
||||
"""Initialize the parallel pipeline.
|
||||
|
||||
Args:
|
||||
max_workers: Maximum number of parallel workers (defaults to CPU count)
|
||||
use_processes: If True, use ProcessPoolExecutor; if False, use ThreadPoolExecutor
|
||||
"""
|
||||
self.max_workers = max_workers or mp.cpu_count()
|
||||
self.use_processes = use_processes
|
||||
self.nodes: Dict[str, NodeConfig] = {}
|
||||
logger.info(f"Initialized ParallelPipeline with {self.max_workers} workers "
|
||||
f"({'processes' if use_processes else 'threads'})")
|
||||
|
||||
def add_node(self, config: NodeConfig):
|
||||
"""Add a node to the pipeline.
|
||||
|
||||
Args:
|
||||
config: NodeConfig with node details and dependencies
|
||||
"""
|
||||
self.nodes[config.name] = config
|
||||
logger.info(f"Added node '{config.name}' with dependencies: {config.dependencies}")
|
||||
|
||||
def _get_execution_stages(self) -> List[List[str]]:
|
||||
"""Determine execution stages based on dependencies.
|
||||
|
||||
Returns:
|
||||
List of stages, where each stage contains node names that can run in parallel
|
||||
"""
|
||||
stages = []
|
||||
completed = set()
|
||||
remaining = set(self.nodes.keys())
|
||||
|
||||
while remaining:
|
||||
# Find nodes whose dependencies are all completed
|
||||
ready = []
|
||||
for node_name in remaining:
|
||||
config = self.nodes[node_name]
|
||||
if all(dep in completed for dep in config.dependencies):
|
||||
ready.append(node_name)
|
||||
|
||||
if not ready:
|
||||
# Circular dependency or missing dependency
|
||||
raise ValueError(f"Cannot resolve dependencies. Remaining nodes: {remaining}")
|
||||
|
||||
stages.append(ready)
|
||||
completed.update(ready)
|
||||
remaining -= set(ready)
|
||||
|
||||
return stages
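# Worked example: with the nodes registered by create_default_pipeline() below,
# the dependency analysis yields these stages (order within a stage is not fixed):
#   stage 1: URLNode, AuthorNode                (no dependencies)
#   stage 2: FuzzyAuthorNode, TextEmbeddingNode (depend on AuthorNode)
#   stage 3: UmapNode                           (depends on TextEmbeddingNode)
#   stage 4: ToD3Node                           (depends on all of the above)
# Nodes within a stage are submitted to the executor concurrently in run().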
|
||||
|
||||
def _execute_node(self,
|
||||
node_name: str,
|
||||
db_path: str,
|
||||
context: TransformContext) -> tuple:
|
||||
"""Execute a single node.
|
||||
|
||||
Args:
|
||||
node_name: Name of the node to execute
|
||||
db_path: Path to the SQLite database
|
||||
context: TransformContext for the node
|
||||
|
||||
Returns:
|
||||
Tuple of (node_name, result_context, error)
|
||||
"""
|
||||
try:
|
||||
# Create fresh database connection (not shared across processes/threads)
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
config = self.nodes[node_name]
|
||||
node = config.node_class(**config.node_kwargs)
|
||||
|
||||
logger.info(f"Executing node: {node_name}")
|
||||
result_context = node.run(con, context)
|
||||
|
||||
con.close()
|
||||
logger.info(f"Node '{node_name}' completed successfully")
|
||||
|
||||
return node_name, result_context, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error executing node '{node_name}': {e}", exc_info=True)
|
||||
return node_name, None, str(e)
|
||||
|
||||
def run(self,
|
||||
db_path: str,
|
||||
initial_context: TransformContext,
|
||||
fail_fast: bool = False) -> Dict[str, TransformContext]:
|
||||
"""Execute the pipeline.
|
||||
|
||||
Args:
|
||||
db_path: Path to the SQLite database
|
||||
initial_context: Initial TransformContext for the pipeline
|
||||
fail_fast: If True, stop execution on first error
|
||||
|
||||
Returns:
|
||||
Dict mapping node names to their output TransformContext
|
||||
"""
|
||||
logger.info("Starting parallel pipeline execution")
|
||||
|
||||
stages = self._get_execution_stages()
|
||||
logger.info(f"Pipeline has {len(stages)} execution stage(s)")
|
||||
|
||||
results = {}
|
||||
errors = []
|
||||
|
||||
ExecutorClass = ProcessPoolExecutor if self.use_processes else ThreadPoolExecutor
|
||||
|
||||
for stage_num, stage_nodes in enumerate(stages, 1):
|
||||
logger.info(f"Stage {stage_num}/{len(stages)}: Executing {len(stage_nodes)} node(s) in parallel: {stage_nodes}")
|
||||
|
||||
# For nodes in this stage, use the context from their dependencies
|
||||
# If multiple dependencies, we'll use the most recent one (or could merge)
|
||||
stage_futures = {}
|
||||
|
||||
with ExecutorClass(max_workers=min(self.max_workers, len(stage_nodes))) as executor:
|
||||
for node_name in stage_nodes:
|
||||
config = self.nodes[node_name]
|
||||
|
||||
# Get context from dependencies (use the last dependency's output)
|
||||
if config.dependencies:
|
||||
context = results.get(config.dependencies[-1], initial_context)
|
||||
else:
|
||||
context = initial_context
|
||||
|
||||
future = executor.submit(self._execute_node, node_name, db_path, context)
|
||||
stage_futures[future] = node_name
|
||||
|
||||
# Wait for all nodes in this stage to complete
|
||||
for future in as_completed(stage_futures):
|
||||
node_name = stage_futures[future]
|
||||
name, result_context, error = future.result()
|
||||
|
||||
if error:
|
||||
errors.append((name, error))
|
||||
if fail_fast:
|
||||
logger.error(f"Pipeline failed at node '{name}': {error}")
|
||||
raise RuntimeError(f"Node '{name}' failed: {error}")
|
||||
else:
|
||||
results[name] = result_context
|
||||
|
||||
if errors:
|
||||
logger.warning(f"Pipeline completed with {len(errors)} error(s)")
|
||||
for name, error in errors:
|
||||
logger.error(f" - {name}: {error}")
|
||||
else:
|
||||
logger.info("Pipeline completed successfully")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def create_default_pipeline(device: str = "cpu",
|
||||
max_workers: Optional[int] = None) -> ParallelPipeline:
|
||||
"""Create a pipeline with default transform nodes.
|
||||
|
||||
Args:
|
||||
device: Device to use for compute-intensive nodes ('cpu', 'cuda', 'mps')
|
||||
max_workers: Maximum number of parallel workers
|
||||
|
||||
Returns:
|
||||
Configured ParallelPipeline
|
||||
"""
|
||||
from author_node import NerAuthorNode, FuzzyAuthorNode
|
||||
from embeddings_node import TextEmbeddingNode, UmapNode
|
||||
from url_node import URLNode
|
||||
from to_d3_node import ToD3Node
|
||||
|
||||
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=URLNode,
|
||||
dependencies=[],
|
||||
name='URLNode'
|
||||
))
|
||||
|
||||
# Add AuthorNode (no dependencies)
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=NerAuthorNode,
|
||||
node_kwargs={
|
||||
'device': device,
|
||||
'model_path': os.environ.get('GLINER_MODEL_PATH')
|
||||
},
|
||||
dependencies=[],
|
||||
name='AuthorNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=FuzzyAuthorNode,
|
||||
node_kwargs={
|
||||
'max_l_dist': 1
|
||||
},
|
||||
dependencies=['AuthorNode'],
|
||||
name='FuzzyAuthorNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=TextEmbeddingNode,
|
||||
node_kwargs={
|
||||
'device': device,
|
||||
'model_path': os.environ.get('GTE_MODEL_PATH')
|
||||
},
|
||||
dependencies=['AuthorNode'],
|
||||
name='TextEmbeddingNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=UmapNode,
|
||||
node_kwargs={},
|
||||
dependencies=['TextEmbeddingNode'],
|
||||
name='UmapNode'
|
||||
))
|
||||
|
||||
pipeline.add_node(NodeConfig(
|
||||
node_class=ToD3Node,
|
||||
dependencies=[
|
||||
'UmapNode',
|
||||
'TextEmbeddingNode',
|
||||
'FuzzyAuthorNode',
|
||||
'AuthorNode',
|
||||
'URLNode'
|
||||
],
|
||||
node_kwargs={
|
||||
'output_path': './data/json/'
|
||||
},
|
||||
name='ToD3Node'
|
||||
))
|
||||
|
||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||
|
||||
# pipeline.add_node(NodeConfig(
|
||||
# node_class=UMAPNode,
|
||||
# node_kwargs={'device': device},
|
||||
# dependencies=['EmbeddingNode'], # Runs after EmbeddingNode
|
||||
# name='UMAPNode'
|
||||
# ))
|
||||
|
||||
return pipeline
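# Minimal usage sketch (paths and settings are illustrative assumptions; main.py
# drives this in the same way via run_from_database()):
#
#   import sqlite3
#   import pandas as pd
#
#   con = sqlite3.connect("/data/knack.sqlite")
#   df = pd.read_sql("SELECT * FROM posts WHERE author IS NOT NULL", con)
#   con.close()
#
#   pipeline = create_default_pipeline(device="cpu", max_workers=4)
#   results = pipeline.run(db_path="/data/knack.sqlite",
#                          initial_context=TransformContext(df),
#                          fail_fast=False)
#   # results maps node names (e.g. 'UmapNode') to their output TransformContext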
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
pandas
|
||||
python-dotenv
|
||||
gliner
|
||||
torch
|
||||
fuzzysearch
|
||||
sentence_transformers
|
||||
umap-learn
|
||||
matplotlib
|
||||
huggingface_hub
|
||||
|
|
@ -1,102 +0,0 @@
|
|||
"""Node to query data from the database and generate individual json file
|
||||
for visualisations in the d3.js framework"""
|
||||
import sqlite3
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class ToD3Node(TransformNode):
|
||||
"""Node that takes the data in a sqlite3 database and generates visualisation data
|
||||
as json files in a specific folder.
|
||||
"""
|
||||
|
||||
def __init__(self, output_path: str):
|
||||
self.output_path = output_path
|
||||
self.queries = {
|
||||
'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
|
||||
'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
|
||||
'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
|
||||
'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
|
||||
'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
|
||||
'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
|
||||
'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
|
||||
}
|
||||
super().__init__()
|
||||
logger.info(f"Init ToD3Node, Storing files to {self.output_path}")
|
||||
|
||||
def _query_db(self, con: sqlite3.Connection, query: str):
|
||||
cursor = con.cursor()
|
||||
cursor.execute(query)
|
||||
r = [dict((cursor.description[i][0], value) \
|
||||
for i, value in enumerate(row)) for row in cursor.fetchall()]
|
||||
return r
|
||||
|
||||
def _calculate_files(self, con: sqlite3.Connection):
|
||||
for key in self.queries.keys():
|
||||
q = self._query_db(con, self.queries[key])
|
||||
with open(f'{self.output_path}{key}.json', 'w') as f:
|
||||
f.write(json.dumps(q))
|
||||
|
||||
return len(self.queries.keys())
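# Output sketch: each query's rows become a list of flat JSON objects whose keys
# are the SQL column names/aliases. A (hypothetical) data/json/authors.json:
#   [{"name": "Redaktion", "type": "person", "count": 42}, ...]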
|
||||
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext):
|
||||
"""Executes the toD3 Node
|
||||
Writes to a bunch of files, each for each query.
|
||||
|
||||
Args:
|
||||
con (sqlite3.Connection): SQLite database connection
|
||||
context (TransformContext): TransformContext, containing the input
|
||||
dataframe of all post.
|
||||
|
||||
Returns:
|
||||
TransformContext with processed dataframe.
|
||||
"""
|
||||
logger.info("Starting ToD3Node transformation")
|
||||
|
||||
if not os.path.isdir(self.output_path):
|
||||
logger.warning(f"output_dir does not exist, creating dir...")
|
||||
os.mkdir(self.output_path)
|
||||
|
||||
count = self._calculate_files(con)
|
||||
|
||||
logger.info(f"Successfully generated {count} json files.")
|
||||
|
||||
return context
|
||||
|
||||
def main():
|
||||
import sys
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
# Connect to database
|
||||
db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
|
||||
con = sqlite3.connect(db_path)
|
||||
|
||||
try:
|
||||
context = TransformContext(None)
|
||||
|
||||
node = ToD3Node('/Users/linussilberstein/Documents/Knack-Scraper/data/json/')
|
||||
|
||||
context = node.run(con, context)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during transformation: {e}", exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
con.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
"""Base transform node for data pipeline."""
|
||||
from abc import ABC, abstractmethod
|
||||
import sqlite3
|
||||
|
||||
from pipeline import TransformContext
|
||||
|
||||
class TransformNode(ABC):
|
||||
"""Abstract base class for transformation nodes.
|
||||
|
||||
Each transform node implements a single transformation step
|
||||
that takes data from the database, transforms it, and
|
||||
potentially writes results back.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing the input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with the transformed dataframe
|
||||
"""
|
||||
pass
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
"""Nodes to extract URL in text using regex patterns."""
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class URLNode(TransformNode):
|
||||
"""Node that looks for URLs in the text-column in posts.
|
||||
Stores data in a new table urls:
|
||||
- id, post_id, url_raw, tld, host
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
logger.info("Init URL Node")
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create urls table if they don't exist."""
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS urls (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
post_id INTEGER,
|
||||
url_raw TEXT,
|
||||
tld TEXT,
|
||||
host TEXT,
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
|
||||
logger.info(f"Processing {len(input_df)} rows")
|
||||
|
||||
mappings = []
|
||||
for _, post_row in input_df.iterrows():
|
||||
post_id = post_row['id']
|
||||
post_text = post_row['text']
|
||||
|
||||
pattern = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
|
||||
|
||||
urls = re.findall(pattern, post_text)
|
||||
logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
hostname = parsed.netloc
|
||||
|
||||
# If the hostname starts with www. remove that part.
|
||||
if hostname[:4] == 'www.':
|
||||
hostname = hostname[4:]
|
||||
|
||||
# Extract TLD (last part after the last dot)
|
||||
tld = ""
|
||||
if hostname:
|
||||
parts = hostname.split('.')
|
||||
if len(parts) > 0:
|
||||
tld = parts[-1]
|
||||
|
||||
mappings.append({
|
||||
'post_id': post_id,
|
||||
'url_raw': url,
|
||||
'host': hostname,
|
||||
'tld': tld
|
||||
})
|
||||
logger.debug(f" URL: {url} -> Host: {hostname}, TLD: {tld}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse URL {url}: {e}")
|
||||
|
||||
result_df = pd.DataFrame(mappings)
|
||||
logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
|
||||
return result_df
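# Example of the extraction above (illustrative text and IDs):
#   "Siehe https://www.example.org/artikel?id=1 und http://blog.example.org"
#   -> [{'post_id': 1, 'url_raw': 'https://www.example.org/artikel?id=1',
#        'host': 'example.org', 'tld': 'org'},
#       {'post_id': 1, 'url_raw': 'http://blog.example.org',
#        'host': 'blog.example.org', 'tld': 'org'}]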
|
||||
|
||||
|
||||
def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
|
||||
if result_df.empty:
|
||||
logger.info("No URLs to store")
|
||||
return
|
||||
|
||||
result_df.to_sql('urls', con, if_exists='append', index=False)
|
||||
logger.info(f"Stored {len(result_df)} URLs to database")
|
||||
|
||||
    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Execute the URL node.

        Writes to the urls table and creates that table if it does not
        exist yet.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): TransformContext containing the
                input dataframe of all posts

        Returns:
            TransformContext wrapping the unchanged input dataframe.
        """
        logger.info("Starting URLNode transformation")

        input_df = context.get_dataframe()

        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("URLNode transformation complete")

        return TransformContext(input_df)


def main():
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Connect to the database (hardcoded local path)
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)

    try:
        # Read posts from the database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")

        # Create context
        context = TransformContext(df)

        # Run URLNode
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")

        logger.info("All nodes completed successfully!")

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()
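
For orientation, here is a minimal standalone sketch (not part of the diff) of the normalization URLNode applies to each match: find URLs with the regex used above, pull the host out with urlparse, strip a leading www., and take the label after the last dot as the TLD. The extract_urls helper and the example strings are hypothetical, introduced only for illustration.

# Illustrative sketch only; extract_urls is a hypothetical helper, not part of the repo.
import re
from urllib.parse import urlparse

URL_PATTERN = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"

def extract_urls(text: str) -> list[dict]:
    """Return one dict per URL found in text: url_raw, host, tld."""
    rows = []
    for url in re.findall(URL_PATTERN, text):
        host = urlparse(url).netloc
        if host.startswith("www."):
            host = host[4:]
        tld = host.rsplit(".", 1)[-1] if host else ""
        rows.append({"url_raw": url, "host": host, "tld": tld})
    return rows

print(extract_urls("Siehe https://www.example.org/artikel und http://knack.news/post/1"))
# -> [{'url_raw': 'https://www.example.org/artikel', 'host': 'example.org', 'tld': 'org'},
#     {'url_raw': 'http://knack.news/post/1', 'host': 'knack.news', 'tld': 'news'}]
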
@ -1,13 +0,0 @@
name: knack-viz
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.11
  - pandas>=2.0.0
  - altair>=5.0.0
  - notebook
  - ipykernel
  - pip
  - pip:
      - vega_datasets
File diff suppressed because one or more lines are too long
@ -1,343 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "0ab5f064",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Libraries imported successfully!\n"
]
}
],
"source": [
"import sqlite3\n",
"from pathlib import Path\n",
"import json\n",
"\n",
"print(\"Libraries imported successfully!\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "94b2e3d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tables in the database:\n",
" - posttags\n",
" - postcategories\n",
" - tags\n",
" - categories\n",
" - posts\n",
" - authors\n",
" - post_authors\n",
" - sqlite_sequence\n",
" - urls\n"
]
}
],
"source": [
"# Connect to the database\n",
"db_path = Path('../data/knack.sqlite')\n",
"conn = sqlite3.connect(db_path)\n",
"cursor = conn.cursor()\n",
"\n",
"# Get all table names\n",
"cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
"tables = cursor.fetchall()\n",
"\n",
"print(\"Tables in the database:\")\n",
"for table in tables:\n",
" print(f\" - {table[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b3924728",
"metadata": {},
"outputs": [],
"source": [
"def query_db(query, args=(), one=False):\n",
" cursor.execute(query, args)\n",
" r = [dict((cursor.description[i][0], value) \\\n",
" for i, value in enumerate(row)) for row in cursor.fetchall()]\n",
" return (r[0] if r else None) if one else r"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c0fdb0ba",
"metadata": {},
"outputs": [],
"source": [
"q = query_db('select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35')\n",
"\n",
"with open('json/tags.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "df5c31b3",
"metadata": {},
"outputs": [],
"source": [
"q = query_db('select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;')\n",
"\n",
"with open('json/categories.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "101b971d",
"metadata": {},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"SELECT\n",
" strftime('%Y-%m', date) AS month,\n",
" category,\n",
" COUNT(*) AS count\n",
"FROM posts\n",
"WHERE date > '2020-01-01' AND category NOT NULL\n",
"GROUP BY strftime('%Y-%m', date), category\n",
"ORDER BY month;\n",
" \"\"\")\n",
"\n",
"with open('json/posts_per_month.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2f23046d",
"metadata": {},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"select name,\n",
" min(type) as type,\n",
" count(posts.id) as count\n",
"from authors\n",
"inner join post_authors on authors.id = author_id\n",
"inner join posts on posts.id = post_id\n",
" \n",
"where category NOT like '%Presseartikel%'\n",
" \n",
"group by name\n",
" \n",
"order by count desc\n",
"limit 25\n",
"\"\"\")\n",
"\n",
"with open('json/authors.json', 'w') as file:\n",
" file.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d4ae65f1",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"tag_pairs = query_db(\"\"\"\n",
" SELECT t1.tag AS source,\n",
" t2.tag AS target,\n",
" COUNT(*) AS weight\n",
" FROM posttags pt1\n",
" JOIN posttags pt2\n",
" ON pt1.post_id = pt2.post_id\n",
" AND pt1.tag_id < pt2.tag_id\n",
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
" GROUP BY t1.tag, t2.tag\n",
" HAVING weight > 1\n",
" ORDER BY weight DESC;\n",
"\"\"\")\n",
"\n",
"with open('json/tag_chords.json', 'w') as f:\n",
" f.write(json.dumps(tag_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "13062474",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"select\n",
"cast(umap_x*10 as int) as x,\n",
"cast(umap_y*10 as int) as y,\n",
"cast(umap_z*10 as int) as z,\n",
"posts.id as id, category_id as c,\n",
"SUBSTRING(title, 1, 12) as t\n",
"\n",
"from posts\n",
"inner join postcategories on post_id = posts.id\n",
"inner join categories on category_id = categories.id\n",
"\n",
"\"\"\")\n",
"\n",
"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
"\n",
"with open('json/umap_embeddings.json', 'w') as f:\n",
" f.write(json.dumps(q))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e5378b17",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [],
"source": [
"q = query_db(\"\"\"\n",
"SELECT \n",
"'knack[punkt]news' AS source, \n",
"CASE \n",
" WHEN tld_count < 10 THEN 'other'\n",
" ELSE tld \n",
"END AS target, \n",
"SUM(tld_count) AS value\n",
"FROM (\n",
" SELECT tld, COUNT(*) as tld_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL \n",
" GROUP BY tld\n",
")\n",
"GROUP BY target\n",
"\"\"\")\n",
"\n",
"q2 = query_db(\"\"\"\n",
"SELECT \n",
" tld AS source, \n",
" CASE \n",
" WHEN host_count < 10 THEN 'other'\n",
" ELSE host \n",
" END AS target, \n",
" SUM(host_count) AS value\n",
"FROM (\n",
" SELECT tld, host, COUNT(*) as host_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
" GROUP BY tld, host\n",
")\n",
"WHERE source != \"\" AND target != 'other'\n",
"GROUP BY tld, target\n",
"\"\"\")\n",
"\n",
"with open('json/urls_l1.json', 'w') as f:\n",
" f.write(json.dumps(q))\n",
"\n",
"with open('json/urls_l2.json', 'w') as f:\n",
" f.write(json.dumps(q2))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "1501cb06",
"metadata": {
"vscode": {
"languageId": "ruby"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = query_db(\"\"\"\n",
" SELECT \n",
" a.name AS author_name,\n",
" t.tag,\n",
" COUNT(*) AS tag_count\n",
"FROM authors a\n",
"JOIN post_authors pa ON a.id = pa.author_id\n",
"JOIN posttags pt ON pa.post_id = pt.post_id\n",
"JOIN tags t ON pt.tag_id = t.id\n",
"WHERE a.name = 'Antifa'\n",
"GROUP BY a.id, a.name, t.id, t.tag\n",
"ORDER BY tag_count DESC;\n",
"\"\"\")\n",
"\n",
"q"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knack-viz",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
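
As a hedged aside (not part of the diff): the notebook cells above export query results as JSON for visualization. One plausible way to consume an export such as json/tags.json in the knack-viz environment (pandas plus Altair, per the environment file above) is a small bar-chart script along these lines; the output filename tags.html is an assumption for illustration.

# Sketch only: renders the exported tag counts as a horizontal bar chart.
import pandas as pd
import altair as alt

df = pd.read_json("json/tags.json")  # records with columns: tag, count

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("count:Q", title="Posts"),
        y=alt.Y("tag:N", sort="-x", title="Tag"),
    )
)
chart.save("tags.html")  # writes a self-contained HTML chart
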