Compare commits

4 commits: 49239e7e25, 72765532d3, 64df8fb328, bcd210ce01

24 changed files with 1970 additions and 199 deletions

.gitignore (vendored, 2 changes)
@@ -1,3 +1,5 @@
 data/
 venv/
+experiment/
 .DS_STORE
+.env

Dockerfile (15 changes, file deleted)
@@ -1,15 +0,0 @@
-FROM python:slim
-
-RUN mkdir /app
-RUN mkdir /data
-
-WORKDIR /app
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN apt update -y
-RUN apt install -y cron
-COPY crontab .
-RUN crontab crontab
-
-COPY main.py .

Makefile (14 changes)
@@ -1,2 +1,12 @@
-build:
-	docker build -t knack-scraper .
+volume:
+	docker volume create knack_data
+
+stop:
+	docker stop knack-scraper || true
+	docker rm knack-scraper || true
+
+up:
+	docker compose up -d
+
+down:
+	docker compose down

README.md (18 changes)
@@ -0,0 +1,18 @@
+Knack-Scraper does exactly what its name suggests it does.
+Knack-Scraper scrapes knack.news and writes to an SQLite
+database for later usage.
+
+## Example for .env
+
+```
+NUM_THREADS=8
+NUM_SCRAPES=100
+DATABASE_LOCATION='./data/knack.sqlite'
+```
+
+## Run once
+
+```
+python main.py
+```
+

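For the "later usage" side the README mentions, a minimal sketch of reading the scraped posts back out of SQLite could look like this. The `posts` table and its columns come from `main.py` below; the path assumes the `DATABASE_LOCATION` from the .env example above.

```python
import sqlite3

import pandas as pd

# Open the database the scraper writes to (DATABASE_LOCATION in .env).
con = sqlite3.connect("./data/knack.sqlite")

# The scraper appends one row per post to the "posts" table via DataFrame.to_sql,
# so columns such as id, title, author, date and url are available to query.
df = pd.read_sql(
    "SELECT id, title, author, date, url FROM posts ORDER BY id DESC LIMIT 10",
    con,
)
print(df)

con.close()
```
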
crontab (1 change, file deleted)
@@ -1 +0,0 @@
-5 4 * * * python /app/main.py

docker-compose.yml (43 changes, new file)
@@ -0,0 +1,43 @@
+services:
+  scraper:
+    build:
+      context: ./scrape
+      dockerfile: Dockerfile
+    image: knack-scraper
+    container_name: knack-scraper
+    env_file:
+      - scrape/.env
+    volumes:
+      - knack_data:/data
+    restart: unless-stopped
+
+  transform:
+    build:
+      context: ./transform
+      dockerfile: Dockerfile
+    image: knack-transform
+    container_name: knack-transform
+    env_file:
+      - transform/.env
+    volumes:
+      - knack_data:/data
+      - models:/models
+    restart: unless-stopped
+
+  sqlitebrowser:
+    image: lscr.io/linuxserver/sqlitebrowser:latest
+    container_name: sqlitebrowser
+    environment:
+      - PUID=1000
+      - PGID=1000
+      - TZ=Etc/UTC
+    volumes:
+      - knack_data:/data
+    ports:
+      - "3000:3000" # noVNC web UI
+      - "3001:3001" # VNC server
+    restart: unless-stopped
+
+volumes:
+  knack_data:
+  models:

main.py (167 changes, file deleted)
@@ -1,167 +0,0 @@
-#! python3
-import locale
-import logging
-import os
-import sqlite3
-import sys
-import time
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
-
-import pandas as pd
-import requests
-import tqdm
-from bs4 import BeautifulSoup
-
-logger = logging.getLogger("knack-scraper")
-# ch = logging.StreamHandler()
-# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-# ch.setFormatter(formatter)
-# ch.setLevel(logging.INFO)
-# logger.addHandler(ch)
-
-
-def table_exists(tablename: str, con: sqlite3.Connection):
-    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
-    return len(con.execute(query, [tablename]).fetchall()) > 0
-
-
-def download(id: int):
-    if id == 0:
-        return
-    base_url = "https://knack.news/"
-    url = f"{base_url}{id}"
-    res = requests.get(url)
-
-    # make sure we don't dos knack
-    time.sleep(2)
-
-    if not (200 <= res.status_code <= 300):
-        return
-
-    logger.info("Found promising page with id %d!", id)
-
-    content = res.content
-    soup = BeautifulSoup(content, "html.parser")
-    date_format = "%d. %B %Y"
-
-    # TODO FIXME: this fails inside the docker container
-    locale.setlocale(locale.LC_TIME, "de_DE")
-    pC = soup.find("div", {"class": "postContent"})
-
-    if pC is None:
-        # not a normal post
-        logger.info(
-            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
-        )
-        return
-
-    # every post has these fields
-    title = pC.find("h3", {"class": "postTitle"}).text
-    postText = pC.find("div", {"class": "postText"})
-
-    # these fields are possible but not required
-    # TODO: cleanup
-    try:
-        date_string = pC.find("span", {"class": "singledate"}).text
-        parsed_date = datetime.strptime(date_string, date_format)
-    except AttributeError:
-        parsed_date = None
-
-    try:
-        author = pC.find("span", {"class": "author"}).text
-    except AttributeError:
-        author = None
-
-    try:
-        category = pC.find("span", {"class": "categoryInfo"}).find_all()
-        category = [c.text for c in category]
-        category = ";".join(category)
-    except AttributeError:
-        category = None
-
-    try:
-        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
-        tags = ";".join(tags)
-    except AttributeError:
-        tags = None
-
-    img = pC.find("img", {"class": "postImage"})
-    if img is not None:
-        img = img["src"]
-
-    res_dict = {
-        "id": id,
-        "title": title,
-        "author": author,
-        "date": parsed_date,
-        "category": category,
-        "url": url,
-        "img_link": img,
-        "tags": tags,
-        "text": postText.text,
-        "html": str(postText),
-        "scraped_at": datetime.now(),
-    }
-
-    return res_dict
-
-
-def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
-    res = []
-
-    logger.info(
-        "Started parallel scrape of posts from id %d to id %d using %d threads.",
-        min_id,
-        max_id - 1,
-        num_threads,
-    )
-    with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        # Use a list comprehension to create a list of futures
-        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
-
-        for future in tqdm.tqdm(
-            futures, total=max_id - min_id
-        ):  # tqdm to track progress
-            post = future.result()
-            if post is not None:
-                res.append(post)
-
-    # sqlite can't handle lists so let's convert them to a single row csv
-    # TODO: make sure our database is properly normalized
-    df = pd.DataFrame(res)
-
-    return df
-
-
-def main():
-    num_threads = int(os.environ.get("NUM_THREADS", 8))
-    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
-    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")
-
-    con = sqlite3.connect(database_location)
-    with con:
-        post_table_exists = table_exists("posts", con)
-
-        if post_table_exists:
-            logger.info("found posts retrieved earlier")
-            # retrieve max post id from db so
-            # we can skip retrieving known posts
-            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
-            logger.info("Got max id %d!", max_id_in_db)
-        else:
-            logger.info("no posts scraped so far - starting from 0")
-            # retrieve from 0 onwards
-            max_id_in_db = -1
-
-    con = sqlite3.connect(database_location)
-    df = run_downloads(
-        min_id=max_id_in_db + 1,
-        max_id=max_id_in_db + n_scrapes,
-        num_threads=num_threads,
-    )
-    df.to_sql("posts", con, if_exists="append")
-
-
-if __name__ == "__main__":
-    main()

@@ -1,14 +0,0 @@
-beautifulsoup4==4.12.2
-certifi==2023.7.22
-charset-normalizer==3.3.0
-idna==3.4
-numpy==1.26.1
-pandas==2.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-requests==2.31.0
-six==1.16.0
-soupsieve==2.5
-tqdm==4.66.1
-tzdata==2023.3
-urllib3==2.0.7

scrape/Dockerfile (29 changes, new file)
@@ -0,0 +1,29 @@
+FROM python:slim
+
+RUN mkdir /app
+RUN mkdir /data
+
+#COPY /data/knack.sqlite /data
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY .env .
+
+RUN apt update -y
+RUN apt install -y cron locales
+
+COPY main.py .
+
+ENV PYTHONUNBUFFERED=1
+ENV LANG=de_DE.UTF-8
+ENV LC_ALL=de_DE.UTF-8
+
+# Create cron job that runs the scraper daily at 04:05 and logs to the container's stdout
+RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
+RUN chmod 0644 /etc/cron.d/knack-scraper
+RUN crontab /etc/cron.d/knack-scraper
+
+# Start cron in foreground
+CMD ["cron", "-f"]

scrape/main.py (262 changes, new executable file)
@@ -0,0 +1,262 @@
+#! python3
+import logging
+import os
+import sqlite3
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+import sys
+
+from dotenv import load_dotenv
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+load_dotenv()
+
+if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
+    logging_level = logging.INFO
+else:
+    logging_level = logging.DEBUG
+
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-scraper")
+
+def table_exists(tablename: str, con: sqlite3.Connection):
+    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
+    return len(con.execute(query, [tablename]).fetchall()) > 0
+
+
+def split_semicolon_list(value: str):
+    if pd.isna(value):
+        return []
+    return [item.strip() for item in str(value).split(';') if item.strip()]
+
+
+def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
+    """Extract unique dimension values and post-to-dimension mappings from a column."""
+    if postdf.empty or field_name not in postdf.columns:
+        return None, None
+
+    values = set()
+    mapping_rows = []
+
+    for post_id, raw in zip(postdf['id'], postdf[field_name]):
+        items = split_semicolon_list(raw)
+        for item in items:
+            values.add(item)
+            mapping_rows.append({'post_id': post_id, dim_col: item})
+
+    if not values:
+        return None, None
+
+    dim_df = pd.DataFrame({
+        'id': range(len(values)),
+        dim_col: sorted(values),
+    })
+    map_df = pd.DataFrame(mapping_rows)
+    return dim_df, map_df
+
+
+def store_dimension_and_mapping(
+    con: sqlite3.Connection,
+    dim_df: pd.DataFrame | None,
+    map_df: pd.DataFrame | None,
+    table_name: str,
+    dim_col: str,
+    mapping_table: str,
+    mapping_id_col: str,
+):
+    """Persist a dimension table and its mapping table, merging with existing values."""
+    if dim_df is None or dim_df.empty:
+        return
+
+    if table_exists(table_name, con):
+        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
+        merged = pd.concat([existing, dim_df], ignore_index=True)
+        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
+        merged['id'] = range(len(merged))
+    else:
+        merged = dim_df.copy()
+
+    # Replace table with merged content
+    merged.to_sql(table_name, con, if_exists="replace", index=False)
+
+    if map_df is None or map_df.empty:
+        return
+
+    value_to_id = dict(zip(merged[dim_col], merged['id']))
+    map_df = map_df.copy()
+    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
+    map_df = map_df[['post_id', mapping_id_col]].dropna()
+    map_df.to_sql(mapping_table, con, if_exists="append", index=False)
+
+
+def download(id: int):
+    if id == 0:
+        return
+    base_url = "https://knack.news/"
+    url = f"{base_url}{id}"
+    res = requests.get(url)
+
+    # make sure we don't dos knack
+    time.sleep(2)
+
+    if not (200 <= res.status_code <= 300):
+        return
+
+    logger.debug("Found promising page with id %d!", id)
+
+    content = res.content
+    soup = BeautifulSoup(content, "html.parser")
+
+    pC = soup.find("div", {"class": "postContent"})
+
+    if pC is None:
+        # not a normal post
+        logger.debug(
+            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
+        )
+        return
+
+    # every post has these fields
+    title = pC.find("h3", {"class": "postTitle"}).text
+    postText = pC.find("div", {"class": "postText"})
+
+    # these fields are possible but not required
+    # TODO: cleanup
+    try:
+        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
+        day = int(date_parts[0][:-1])
+        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
+        month = months[date_parts[1]]
+        year = int(date_parts[2])
+        parsed_date = datetime(year, month, day)
+    except Exception:
+        parsed_date = None
+
+    try:
+        author = pC.find("span", {"class": "author"}).text
+    except AttributeError:
+        author = None
+
+    try:
+        category = pC.find("span", {"class": "categoryInfo"}).find_all()
+        category = [c.text for c in category if c.text != 'Alle Artikel']
+        category = ";".join(category)
+    except AttributeError:
+        category = None
+
+    try:
+        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
+        tags = ";".join(tags)
+    except AttributeError:
+        tags = None
+
+    img = pC.find("img", {"class": "postImage"})
+    if img is not None:
+        img = img["src"]
+
+    res_dict = {
+        "id": id,
+        "title": title,
+        "author": author,
+        "date": parsed_date,
+        "category": category,
+        "url": url,
+        "img_link": img,
+        "tags": tags,
+        "text": postText.text,
+        "html": str(postText),
+        "scraped_at": datetime.now(),
+        "is_cleaned": False
+    }
+
+    return res_dict
+
+
+def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
+    res = []
+
+    logger.info(
+        "Started parallel scrape of posts from id %d to id %d using %d threads.",
+        min_id,
+        max_id - 1,
+        num_threads,
+    )
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        # Use a list comprehension to create a list of futures
+        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
+
+        for future in futures:
+            post = future.result()
+            if post is not None:
+                res.append(post)
+
+    postdf = pd.DataFrame(res)
+    return postdf
+
+
+def main():
+    num_threads = int(os.environ.get("NUM_THREADS", 8))
+    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
+    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
+
+    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")
+
+    con = sqlite3.connect(database_location)
+    with con:
+        if table_exists("posts", con):
+            logger.info("found posts retrieved earlier")
+            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
+            logger.info("Got max id %d!", max_id_in_db)
+        else:
+            logger.info("no posts scraped so far - starting from 0")
+            max_id_in_db = -1
+
+        postdf = run_downloads(
+            min_id=max_id_in_db + 1,
+            max_id=max_id_in_db + n_scrapes,
+            num_threads=num_threads,
+        )
+
+        # Build tag and category mappings before dropping those columns from posts,
+        # otherwise build_dimension_and_mapping would no longer find them.
+        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
+        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
+
+        # Drop category and tags columns as they're stored in separate tables
+        postdf = postdf.drop(columns=['category', 'tags'])
+        postdf.to_sql("posts", con, if_exists="append", index=False)
+
+        # Tags
+        store_dimension_and_mapping(
+            con,
+            tag_dim,
+            tag_map,
+            table_name="tags",
+            dim_col="tag",
+            mapping_table="posttags",
+            mapping_id_col="tag_id",
+        )
+
+        # Categories
+        store_dimension_and_mapping(
+            con,
+            category_dim,
+            category_map,
+            table_name="categories",
+            dim_col="category",
+            mapping_table="postcategories",
+            mapping_id_col="category_id",
+        )
+
+        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")
+
+
+if __name__ == "__main__":
+    main()

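Since the scraper above normalizes tags and categories into their own tables (`tags`/`posttags` and `categories`/`postcategories`), getting a post together with its tags means joining through the mapping table. A read-only sketch, using only the table and column names defined above:

```python
import sqlite3

# DATABASE_LOCATION defaults to ../data/knack.sqlite in scrape/main.py; adjust as needed.
con = sqlite3.connect("../data/knack.sqlite")

# Resolve each post's tags through the posttags mapping table.
rows = con.execute("""
    SELECT p.id, p.title, t.tag
    FROM posts AS p
    JOIN posttags AS pt ON pt.post_id = p.id
    JOIN tags AS t ON t.id = pt.tag_id
    ORDER BY p.id
""").fetchall()

for post_id, title, tag in rows:
    print(post_id, title, tag)

con.close()
```
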
scrape/requirements.txt (4 changes, new file)
@@ -0,0 +1,4 @@
+pandas
+requests
+bs4
+dotenv

transform/.env.example (4 changes, new file)
@@ -0,0 +1,4 @@
+LOGGING_LEVEL=INFO
+DB_PATH=/data/knack.sqlite
+MAX_CLEANED_POSTS=1000
+COMPUTE_DEVICE=mps

transform/Dockerfile (51 changes, new file)
@@ -0,0 +1,51 @@
+FROM python:3.12-slim
+
+RUN mkdir -p /app /data /models
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    gfortran \
+    libopenblas-dev \
+    liblapack-dev \
+    pkg-config \
+    curl \
+    jq \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
+ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1
+
+ENV MINILM_MODEL_ID=sentence-transformers/all-MiniLM-L6-v2
+ENV MINILM_MODEL_PATH=/models/all-MiniLM-L6-v2
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY .env .
+
+RUN apt update -y
+RUN apt install -y cron locales
+
+# Ensure GLiNER helper scripts are available
+COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
+# Ensure MiniLM helper scripts are available
+COPY ensure_minilm_model.sh /usr/local/bin/ensure_minilm_model.sh
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_minilm_model.sh /usr/local/bin/entrypoint.sh
+
+COPY *.py .
+
+# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
+# Testing every 30 Minutes */30 * * * *
+RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
+RUN chmod 0644 /etc/cron.d/knack-transform
+RUN crontab /etc/cron.d/knack-transform
+
+# Persist models between container runs
+VOLUME /models
+
+CMD ["/usr/local/bin/entrypoint.sh"]
+#CMD ["python", "main.py"]

transform/README.md (67 changes, new file)
@@ -0,0 +1,67 @@
+# Knack Transform
+
+Data transformation pipeline for the Knack scraper project.
+
+## Overview
+
+This folder contains the transformation logic that processes data from the SQLite database. It runs on a scheduled basis (every weekend) via cron.
+
+The pipeline supports **parallel execution** of independent transform nodes, allowing you to leverage multi-core processors for faster data transformation.
+
+## Structure
+
+- `base.py` - Abstract base class for transform nodes
+- `pipeline.py` - Parallel pipeline orchestration system
+- `main.py` - Main entry point and pipeline execution
+- `author_node.py` - NER-based author classification node
+- `example_node.py` - Template for creating new nodes
+- `Dockerfile` - Docker image configuration with cron setup
+- `requirements.txt` - Python dependencies
+
+## Transform Nodes
+
+Transform nodes inherit from `TransformNode` and implement the `run` method:
+
+```python
+from base import TransformNode, TransformContext
+import sqlite3
+
+class MyTransform(TransformNode):
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        df = context.get_dataframe()
+
+        # Transform logic here
+        transformed_df = df.copy()
+        # ... your transformations ...
+
+        # Optionally write back to database
+        transformed_df.to_sql("my_table", con, if_exists="replace", index=False)
+
+        return TransformContext(transformed_df)
+```
+
+## Configuration
+
+Copy `.env.example` to `.env` and configure:
+
+- `LOGGING_LEVEL` - Log level (INFO or DEBUG)
+- `DB_PATH` - Path to SQLite database
+
+## Running
+
+### With Docker
+
+```bash
+docker build -t knack-transform .
+docker run -v $(pwd)/data:/data knack-transform
+```
+
+### Locally
+
+```bash
+python main.py
+```
+
+## Cron Schedule
+
+The Docker container runs the transform pipeline every Sunday at 3 AM.

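`pipeline.py` itself is not part of this diff, so the parallel orchestration mentioned in the overview is only described, not shown. As a rough sketch of what running independent nodes in parallel could look like, the `ParallelPipeline` class below is hypothetical; only `TransformNode` and `TransformContext` come from the files in this PR, imported here the way the node modules import them:

```python
import sqlite3
from concurrent.futures import ThreadPoolExecutor

from pipeline import TransformContext
from transform_node import TransformNode


class ParallelPipeline:
    """Hypothetical orchestrator that runs independent transform nodes side by side."""

    def __init__(self, nodes: list[TransformNode], max_workers: int = 4):
        self.nodes = nodes
        self.max_workers = max_workers

    def run(self, db_path: str, context: TransformContext) -> list[TransformContext]:
        # Each node gets its own connection: sqlite3 connections should not be
        # shared across threads by default.
        def _run_node(node: TransformNode) -> TransformContext:
            con = sqlite3.connect(db_path)
            try:
                return node.run(con, context)
            finally:
                con.close()

        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            futures = [pool.submit(_run_node, node) for node in self.nodes]
            return [future.result() for future in futures]
```

Nodes that depend on each other's output (for example, UmapNode needs the embeddings written by TextEmbeddingNode) would still have to run in sequence; only genuinely independent nodes gain anything from this.
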
transform/author_node.py (420 changes, new file)
@@ -0,0 +1,420 @@
+"""Author classification transform node using NER."""
+import os
+import sqlite3
+import pandas as pd
+import logging
+import fuzzysearch
+from concurrent.futures import ThreadPoolExecutor
+
+from pipeline import TransformContext
+from transform_node import TransformNode
+
+logger = logging.getLogger("knack-transform")
+
+try:
+    from gliner import GLiNER
+    import torch
+    GLINER_AVAILABLE = True
+except ImportError:
+    GLINER_AVAILABLE = False
+    logging.warning("GLiNER not available. Install with: pip install gliner")
+
+class NerAuthorNode(TransformNode):
+    """Transform node that extracts and classifies authors using NER.
+
+    Creates two tables:
+    - authors: stores unique authors with their type (Person, Organisation, etc.)
+    - post_authors: maps posts to their authors
+    """
+
+    def __init__(self, model_name: str = "urchade/gliner_multi-v2.1",
+                 model_path: str = None,
+                 threshold: float = 0.5,
+                 max_workers: int = 64,
+                 device: str = "cpu"):
+        """Initialize the AuthorNode.
+
+        Args:
+            model_name: GLiNER model to use
+            model_path: Optional local path to a downloaded GLiNER model
+            threshold: Confidence threshold for entity predictions
+            max_workers: Number of parallel workers for prediction
+            device: Device to run model on ('cpu', 'cuda', 'mps')
+        """
+        self.model_name = model_name
+        self.model_path = model_path or os.environ.get('GLINER_MODEL_PATH')
+        self.threshold = threshold
+        self.max_workers = max_workers
+        self.device = device
+        self.model = None
+        self.labels = ["Person", "Organisation", "Email", "Newspaper", "NGO"]
+
+    def _setup_model(self):
+        """Initialize the NER model."""
+        if not GLINER_AVAILABLE:
+            raise ImportError("GLiNER is required for AuthorNode. Install with: pip install gliner")
+
+        model_source = None
+        if self.model_path:
+            if os.path.exists(self.model_path):
+                model_source = self.model_path
+                logger.info(f"Loading GLiNER model from local path: {self.model_path}")
+            else:
+                logger.warning(f"GLINER_MODEL_PATH '{self.model_path}' not found; falling back to hub model {self.model_name}")
+
+        if model_source is None:
+            model_source = self.model_name
+            logger.info(f"Loading GLiNER model from hub: {self.model_name}")
+
+        if self.device == "cuda" and torch.cuda.is_available():
+            self.model = GLiNER.from_pretrained(
+                model_source,
+                max_length=255
+            ).to('cuda', dtype=torch.float16)
+        elif self.device == "mps" and torch.backends.mps.is_available():
+            self.model = GLiNER.from_pretrained(
+                model_source,
+                max_length=255
+            ).to('mps', dtype=torch.float16)
+        else:
+            self.model = GLiNER.from_pretrained(
+                model_source,
+                max_length=255
+            )
+
+        logger.info("Model loaded successfully")
+
+    def _predict(self, text_data: dict):
+        """Predict entities for a single author text.
+
+        Args:
+            text_data: Dict with 'author' and 'id' keys
+
+        Returns:
+            Tuple of (predictions, post_id) or None
+        """
+        if text_data is None or text_data.get('author') is None:
+            return None
+
+        predictions = self.model.predict_entities(
+            text_data['author'],
+            self.labels,
+            threshold=self.threshold
+        )
+        return predictions, text_data['id']
+
+    def _classify_authors(self, posts_df: pd.DataFrame):
+        """Classify all authors in the posts dataframe.
+
+        Args:
+            posts_df: DataFrame with 'id' and 'author' columns
+
+        Returns:
+            List of dicts with 'text', 'label', 'id' keys
+        """
+        if self.model is None:
+            self._setup_model()
+
+        # Prepare input data
+        authors_data = []
+        for idx, row in posts_df.iterrows():
+            if pd.notna(row['author']):
+                authors_data.append({
+                    'author': row['author'],
+                    'id': row['id']
+                })
+
+        logger.info(f"Classifying {len(authors_data)} authors")
+
+        results = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = [executor.submit(self._predict, data) for data in authors_data]
+
+            for future in futures:
+                result = future.result()
+                if result is not None:
+                    predictions, post_id = result
+                    for pred in predictions:
+                        results.append({
+                            'text': pred['text'],
+                            'label': pred['label'],
+                            'id': post_id
+                        })
+
+        logger.info(f"Classification complete. Found {len(results)} author entities")
+        return results
+
+    def _create_tables(self, con: sqlite3.Connection):
+        """Create authors and post_authors tables if they don't exist."""
+        logger.info("Creating authors tables")
+
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS authors (
+                id INTEGER PRIMARY KEY,
+                name TEXT,
+                type TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """)
+
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS post_authors (
+                post_id INTEGER,
+                author_id INTEGER,
+                PRIMARY KEY (post_id, author_id),
+                FOREIGN KEY (post_id) REFERENCES posts(id),
+                FOREIGN KEY (author_id) REFERENCES authors(id)
+            )
+        """)
+
+        con.commit()
+
+    def _store_authors(self, con: sqlite3.Connection, results: list):
+        """Store classified authors and their mappings.
+
+        Args:
+            con: Database connection
+            results: List of classification results
+        """
+        if not results:
+            logger.info("No authors to store")
+            return
+
+        # Convert results to DataFrame
+        results_df = pd.DataFrame(results)
+
+        # Get unique authors with their types
+        unique_authors = results_df[['text', 'label']].drop_duplicates()
+        unique_authors.columns = ['name', 'type']
+
+        # Get existing authors
+        existing_authors = pd.read_sql("SELECT id, name FROM authors", con)
+
+        # Find new authors to insert
+        if not existing_authors.empty:
+            new_authors = unique_authors[~unique_authors['name'].isin(existing_authors['name'])]
+        else:
+            new_authors = unique_authors
+
+        if not new_authors.empty:
+            logger.info(f"Inserting {len(new_authors)} new authors")
+            new_authors.to_sql('authors', con, if_exists='append', index=False)
+
+        # Get all authors with their IDs
+        all_authors = pd.read_sql("SELECT id, name FROM authors", con)
+        name_to_id = dict(zip(all_authors['name'], all_authors['id']))
+
+        # Create post_authors mappings
+        mappings = []
+        for _, row in results_df.iterrows():
+            author_id = name_to_id.get(row['text'])
+            if author_id:
+                mappings.append({
+                    'post_id': row['id'],
+                    'author_id': author_id
+                })
+
+        if mappings:
+            mappings_df = pd.DataFrame(mappings).drop_duplicates()
+
+            # Clear existing mappings for these posts (optional, depends on your strategy)
+            # post_ids = tuple(mappings_df['post_id'].unique())
+            # con.execute(f"DELETE FROM post_authors WHERE post_id IN ({','.join('?' * len(post_ids))})", post_ids)
+
+            logger.info(f"Creating {len(mappings_df)} post-author mappings")
+            mappings_df.to_sql('post_authors', con, if_exists='append', index=False)
+
+        con.commit()
+        logger.info("Authors and mappings stored successfully")
+
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the author classification transformation.
+
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing posts dataframe
+
+        Returns:
+            TransformContext with classified authors dataframe
+        """
+        logger.info("Starting AuthorNode transformation")
+
+        posts_df = context.get_dataframe()
+
+        # Ensure required columns exist
+        if 'author' not in posts_df.columns:
+            logger.warning("No 'author' column in dataframe. Skipping AuthorNode.")
+            return context
+
+        # Create tables
+        self._create_tables(con)
+
+        # Classify authors
+        results = self._classify_authors(posts_df)
+
+        # Store results
+        self._store_authors(con, results)
+
+        # Return context with results
+        logger.info("AuthorNode transformation complete")
+
+        return TransformContext(posts_df)
+
+
+class FuzzyAuthorNode(TransformNode):
+    """FuzzyAuthorNode
+
+    This node takes in the author names that have already been classified
+    and uses those as 'rules' to find more, similar fields.
+    """
+
+    def __init__(self,
+                 max_l_dist: int = 1,):
+        """Initialize FuzzyAuthorNode.
+
+        Args:
+            max_l_dist: The number of 'errors' that are allowed by the fuzzy search algorithm
+        """
+        self.max_l_dist = max_l_dist
+        logger.info(f"Initialized FuzzyAuthorNode with max_l_dist={max_l_dist}")
+
+    def _process_data(self, con: sqlite3.Connection, df: pd.DataFrame) -> pd.DataFrame:
+        """Fuzzy-match known author names against posts that have no author mapping yet.
+
+        Args:
+            con: Database connection
+            df: Input dataframe from context
+
+        Returns:
+            Processed dataframe
+        """
+        logger.info(f"Processing {len(df)} rows")
+
+        # Retrieve all known authors from the authors table as 'rules'
+        authors_df = pd.read_sql("SELECT id, name FROM authors", con)
+
+        if authors_df.empty:
+            logger.warning("No authors found in database for fuzzy matching")
+            return pd.DataFrame(columns=['post_id', 'author_id'])
+
+        # Get existing post-author mappings to avoid duplicates
+        existing_mappings = pd.read_sql(
+            "SELECT post_id, author_id FROM post_authors", con
+        )
+        existing_post_ids = set(existing_mappings['post_id'].unique())
+
+        logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
+        logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
+
+        # Filter to posts without author mappings and with non-null author field
+        if 'author' not in df.columns or 'id' not in df.columns:
+            logger.warning("Missing 'author' or 'id' column in input dataframe")
+            return pd.DataFrame(columns=['post_id', 'author_id'])
+
+        posts_to_process = df[
+            (df['id'].notna()) &
+            (df['author'].notna()) &
+            (~df['id'].isin(existing_post_ids))
+        ]
+
+        logger.info(f"Processing {len(posts_to_process)} posts for fuzzy matching")
+
+        # Perform fuzzy matching
+        mappings = []
+        for _, post_row in posts_to_process.iterrows():
+            post_id = post_row['id']
+            post_author = str(post_row['author'])
+
+            # Try to find matches against all known author names
+            for _, author_row in authors_df.iterrows():
+                author_id = author_row['id']
+                author_name = str(author_row['name'])
+                # for author names of 2 characters or fewer, allow no errors at all
+                l_dist = self.max_l_dist if len(author_name) > 2 else 0
+
+                # Use fuzzysearch to find matches with allowed errors
+                matches = fuzzysearch.find_near_matches(
+                    author_name,
+                    post_author,
+                    max_l_dist=l_dist,
+                )
+
+                if matches:
+                    logger.debug(f"Found fuzzy match: '{author_name}' in '{post_author}' for post {post_id}")
+                    mappings.append({
+                        'post_id': post_id,
+                        'author_id': author_id
+                    })
+                    # Only take the first match per post to avoid multiple mappings
+                    break
+
+        # Create result dataframe
+        result_df = pd.DataFrame(mappings, columns=['post_id', 'author_id']) if mappings else pd.DataFrame(columns=['post_id', 'author_id'])
+
+        logger.info(f"Processing complete. Found {len(result_df)} fuzzy matches")
+        return result_df
+
+    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
+        """Store results back to the database.
+
+        Uses INSERT OR IGNORE to avoid inserting duplicates.
+
+        Args:
+            con: Database connection
+            df: Processed dataframe to store
+        """
+        if df.empty:
+            logger.info("No results to store")
+            return
+
+        logger.info(f"Storing {len(df)} results")
+
+        # Use INSERT OR IGNORE to handle duplicates (respects PRIMARY KEY constraint)
+        cursor = con.cursor()
+        inserted_count = 0
+
+        for _, row in df.iterrows():
+            cursor.execute(
+                "INSERT OR IGNORE INTO post_authors (post_id, author_id) VALUES (?, ?)",
+                (int(row['post_id']), int(row['author_id']))
+            )
+            if cursor.rowcount > 0:
+                inserted_count += 1
+
+        con.commit()
+        logger.info(f"Results stored successfully. Inserted {inserted_count} new mappings, skipped {len(df) - inserted_count} duplicates")
+
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the transformation.
+
+        This is the main entry point called by the pipeline.
+
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing input dataframe
+
+        Returns:
+            TransformContext with processed dataframe
+        """
+        logger.info("Starting FuzzyAuthorNode transformation")
+
+        # Get input dataframe from context
+        input_df = context.get_dataframe()
+
+        # Validate input
+        if input_df.empty:
+            logger.warning("Empty dataframe provided to FuzzyAuthorNode")
+            return context
+
+        # Process the data
+        result_df = self._process_data(con, input_df)
+
+        # Store results
+        self._store_results(con, result_df)
+
+        logger.info("FuzzyAuthorNode transformation complete")
+
+        # Return new context with results
+        return TransformContext(input_df)

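For reference, the `authors` and `post_authors` tables created by `NerAuthorNode` (and reused by `FuzzyAuthorNode`) can be queried back, for example to list the most frequently classified authors. A read-only sketch; the table and column names are the ones defined in `_create_tables` above:

```python
import sqlite3

con = sqlite3.connect("/data/knack.sqlite")  # DB_PATH from transform/.env.example

# Count posts per classified author, resolved through the post_authors mapping table.
rows = con.execute("""
    SELECT a.type, a.name, COUNT(pa.post_id) AS n_posts
    FROM authors AS a
    JOIN post_authors AS pa ON pa.author_id = a.id
    GROUP BY a.id
    ORDER BY n_posts DESC
    LIMIT 20
""").fetchall()

for author_type, name, n_posts in rows:
    print(f"{author_type:12} {name:40} {n_posts}")

con.close()
```
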
transform/embeddings_node.py (445 changes, new file)
@@ -0,0 +1,445 @@
+"""Transform node classes that deal with text processing.
+
+- TextEmbeddingNode calculates text embeddings
+- UmapNode calculates xy coordinates on those vector embeddings
+- SimilarityNode calculates top n similar posts based on those embeddings
+  using the spectral distance.
+"""
+from pipeline import TransformContext
+from transform_node import TransformNode
+import sqlite3
+import pandas as pd
+import logging
+import os
+import numpy as np
+
+logger = logging.getLogger("knack-transform")
+
+try:
+    from sentence_transformers import SentenceTransformer
+    import torch
+    MINILM_AVAILABLE = True
+except ImportError:
+    MINILM_AVAILABLE = False
+    logging.warning("MiniLM not available. Install with pip!")
+
+try:
+    import umap
+    UMAP_AVAILABLE = True
+except ImportError:
+    UMAP_AVAILABLE = False
+    logging.warning("UMAP not available. Install with pip install umap-learn!")
+
+class TextEmbeddingNode(TransformNode):
+    """Calculates vector embeddings based on a dataframe
+    of posts.
+    """
+    def __init__(self,
+                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+                 model_path: str = None,
+                 device: str = "cpu"):
+        """Initialize the TextEmbeddingNode.
+
+        Args:
+            model_name: Name of the ML Model to calculate text embeddings
+            model_path: Optional local path to a downloaded embedding model
+            device: Device to use for computations ('cpu', 'cuda', 'mps')
+        """
+        self.model_name = model_name
+        self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH')
+        self.device = device
+        self.model = None
+        logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
+
+    def _setup_model(self):
+        """Init the Text Embedding Model."""
+        if not MINILM_AVAILABLE:
+            raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.")
+
+        model_source = None
+        if self.model_path:
+            if os.path.exists(self.model_path):
+                model_source = self.model_path
+                logger.info(f"Loading MiniLM model from local path: {self.model_path}")
+            else:
+                logger.warning(f"MINILM_MODEL_PATH '{self.model_path}' not found; falling back to hub model {self.model_name}")
+
+        if model_source is None:
+            model_source = self.model_name
+            logger.info(f"Loading MiniLM model from the hub: {self.model_name}")
+
+        if self.device == "cuda" and torch.cuda.is_available():
+            self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
+        elif self.device == "mps" and torch.backends.mps.is_available():
+            self.model = SentenceTransformer(model_source).to('mps', dtype=torch.float16)
+        else:
+            self.model = SentenceTransformer(model_source)
+
+    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Process the input dataframe.
+
+        Calculates an embedding as a np.array for each post's text.
+        The arrays are later serialized with tobytes() for BLOB
+        storage in the database.
+
+        Args:
+            df: Input dataframe from context
+
+        Returns:
+            Processed dataframe
+        """
+        logger.info(f"Processing {len(df)} rows")
+
+        if self.model is None:
+            self._setup_model()
+
+        result_df = df.copy()
+
+        # Encode on the copy so the embedding column actually ends up in the
+        # returned dataframe and the context dataframe stays untouched.
+        result_df['embedding'] = result_df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
+
+        logger.info("Processing complete")
+        return result_df
+
+    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
+        """Store results back to the database using batch updates."""
+        if df.empty:
+            logger.info("No results to store")
+            return
+
+        logger.info(f"Storing {len(df)} results")
+
+        # Convert numpy arrays to bytes for BLOB storage
+        # Use tobytes() to serialize numpy arrays efficiently
+        updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()]
+        con.executemany(
+            "UPDATE posts SET embedding = ? WHERE id = ?",
+            updates
+        )
+
+        con.commit()
+        logger.info("Results stored successfully")
+
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the transformation.
+
+        This is the main entry point called by the pipeline.
+
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing input dataframe
+
+        Returns:
+            TransformContext with processed dataframe
+        """
+        logger.info("Starting TextEmbeddingNode transformation")
+
+        # Get input dataframe from context
+        input_df = context.get_dataframe()
+
+        # Validate input
+        if input_df.empty:
+            logger.warning("Empty dataframe provided to TextEmbeddingNode")
+            return context
+
+        if 'text' not in input_df.columns:
+            logger.warning("No 'text' column in context dataframe. Skipping TextEmbeddingNode")
+            return context
+
+        # Process the data
+        result_df = self._process_data(input_df)
+
+        # Store results (optional)
+        self._store_results(con, result_df)
+
+        logger.info("TextEmbeddingNode transformation complete")
+
+        # Return new context with results
+        return TransformContext(result_df)
+
+
+class UmapNode(TransformNode):
+    """Calculates 2D coordinates from embeddings using UMAP dimensionality reduction.
+
+    This node takes text embeddings and reduces them to 2D coordinates
+    for visualization purposes.
+    """
+
+    def __init__(self,
+                 n_neighbors: int = 15,
+                 min_dist: float = 0.1,
+                 n_components: int = 2,
+                 metric: str = "cosine",
+                 random_state: int = 42):
+        """Initialize the UmapNode.
+
+        Args:
+            n_neighbors: Number of neighbors to consider for UMAP (default: 15)
+            min_dist: Minimum distance between points in low-dimensional space (default: 0.1)
+            n_components: Number of dimensions to reduce to (default: 2)
+            metric: Distance metric to use (default: 'cosine')
+            random_state: Random seed for reproducibility (default: 42)
+        """
+        self.n_neighbors = n_neighbors
+        self.min_dist = min_dist
+        self.n_components = n_components
+        self.metric = metric
+        self.random_state = random_state
+        self.reducer = None
+        logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
+                    f"n_components={n_components}, metric={metric}, random_state={random_state}")
+
+    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Process the input dataframe.
+
+        Retrieves embeddings from BLOB storage, converts them back to numpy arrays,
+        and applies UMAP dimensionality reduction to create 2D coordinates.
+
+        Args:
+            df: Input dataframe from context
+
+        Returns:
+            Processed dataframe with umap_x and umap_y columns
+        """
+        logger.info(f"Processing {len(df)} rows")
+
+        if not UMAP_AVAILABLE:
+            raise ImportError("UMAP is required for UmapNode. Install with: pip install umap-learn")
+
+        result_df = df.copy()
+
+        # Convert BLOB embeddings back to numpy arrays
+        if 'embedding' not in result_df.columns:
+            logger.error("No 'embedding' column found in dataframe")
+            raise ValueError("Input dataframe must contain 'embedding' column")
+
+        logger.info("Converting embeddings from BLOB to numpy arrays")
+        result_df['embedding'] = result_df['embedding'].apply(
+            lambda x: np.frombuffer(x, dtype=np.float32) if x is not None else None
+        )
+
+        # Filter out rows with None embeddings
+        valid_rows = result_df['embedding'].notna()
+        if not valid_rows.any():
+            logger.error("No valid embeddings found in dataframe")
+            raise ValueError("No valid embeddings to process")
+
+        logger.info(f"Found {valid_rows.sum()} valid embeddings out of {len(result_df)} rows")
+
+        # Stack embeddings into a matrix
+        embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
+        logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
+
+        # Apply UMAP
+        logger.info("Fitting UMAP reducer...")
+        self.reducer = umap.UMAP(
+            n_neighbors=self.n_neighbors,
+            min_dist=self.min_dist,
+            n_components=self.n_components,
+            metric=self.metric,
+            random_state=self.random_state
+        )
+
+        umap_coords = self.reducer.fit_transform(embeddings_matrix)
+        logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
+
+        # Add UMAP coordinates to dataframe
+        result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
+        result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
+
+        # Rows without a valid embedding keep NaN coordinates; _store_results skips them.
+
+        logger.info("Processing complete")
+        return result_df
+
+    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
+        """Store UMAP coordinates back to the database.
+
+        Args:
+            con: Database connection
+            df: Processed dataframe with umap_x and umap_y columns
+        """
+        if df.empty:
+            logger.info("No results to store")
+            return
+
+        logger.info(f"Storing {len(df)} results")
+
+        # Batch update UMAP coordinates
+        updates = [
+            (row['umap_x'], row['umap_y'], row['id'])
+            for _, row in df.iterrows()
+            if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y'))
+        ]
+
+        if updates:
+            con.executemany(
+                "UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?",
+                updates
+            )
+            con.commit()
+            logger.info(f"Stored {len(updates)} UMAP coordinate pairs successfully")
+        else:
+            logger.warning("No valid UMAP coordinates to store")
+
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the transformation.
+
+        This is the main entry point called by the pipeline.
+
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing input dataframe
+
+        Returns:
+            TransformContext with processed dataframe
+        """
+        logger.info("Starting UmapNode transformation")
+
+        # Get input dataframe from context
+        input_df = context.get_dataframe()
+
+        # Validate input
+        if input_df.empty:
+            logger.warning("Empty dataframe provided to UmapNode")
+            return context
+
+        # Process the data
+        result_df = self._process_data(input_df)
+
+        # Store results (optional)
+        self._store_results(con, result_df)
+
+        logger.info("UmapNode transformation complete")
+
+        # Return new context with results
+        return TransformContext(result_df)
+
+
+class SimilarityNode(TransformNode):
+    """Example transform node template.
+
+    This node demonstrates the basic structure for creating
+    new transformation nodes in the pipeline.
+    """
+
+    def __init__(self,
+                 param1: str = "default_value",
+                 param2: int = 42,
+                 device: str = "cpu"):
+        """Initialize the ExampleNode.
+
+        Args:
+            param1: Example string parameter
+            param2: Example integer parameter
+            device: Device to use for computations ('cpu', 'cuda', 'mps')
+        """
+        self.param1 = param1
+        self.param2 = param2
+        self.device = device
+        logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
+
+    def _create_tables(self, con: sqlite3.Connection):
+        """Create any necessary tables in the database.
+
+        This is optional - only needed if your node creates new tables.
+        """
+        logger.info("Creating example tables")
+
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS example_results (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                post_id INTEGER,
+                result_value TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                FOREIGN KEY (post_id) REFERENCES posts(id)
+            )
+        """)
+
+        con.commit()
+
+    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Process the input dataframe.
+
+        This is where your main transformation logic goes.
+
+        Args:
+            df: Input dataframe from context
+
+        Returns:
+            Processed dataframe
+        """
+        logger.info(f"Processing {len(df)} rows")
+
+        # Example: Add a new column based on existing data
+        result_df = df.copy()
+        result_df['processed'] = True
+        result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
+
+        logger.info("Processing complete")
+        return result_df
+
+    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
+        """Store results back to the database.
+
+        This is optional - only needed if you want to persist results.
+
+        Args:
+            con: Database connection
+            df: Processed dataframe to store
+        """
+        if df.empty:
+            logger.info("No results to store")
+            return
+
+        logger.info(f"Storing {len(df)} results")
+
+        # Example: Store to database
+        # df[['post_id', 'result_value']].to_sql(
+        #     'example_results',
+        #     con,
+        #     if_exists='append',
+        #     index=False
+        # )
+
+        con.commit()
+        logger.info("Results stored successfully")
+
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the transformation.
+
+        This is the main entry point called by the pipeline.
+
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing input dataframe
+
+        Returns:
+            TransformContext with processed dataframe
+        """
+        logger.info("Starting ExampleNode transformation")
+
+        # Get input dataframe from context
+        input_df = context.get_dataframe()
+
+        # Validate input
+        if input_df.empty:
+            logger.warning("Empty dataframe provided to ExampleNode")
+            return context
+
+        # Create any necessary tables
+        self._create_tables(con)
+
+        # Process the data
+        result_df = self._process_data(input_df)
+
+        # Store results (optional)
+        self._store_results(con, result_df)
+
+        logger.info("ExampleNode transformation complete")
+
+        # Return new context with results
+        return TransformContext(result_df)

transform/ensure_gliner_model.sh (Normal file, 16 lines)
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

if [ -d "$GLINER_MODEL_PATH" ] && find "$GLINER_MODEL_PATH" -type f | grep -q .; then
    echo "GLiNER model already present at $GLINER_MODEL_PATH"
    exit 0
fi

echo "Downloading GLiNER model to $GLINER_MODEL_PATH"
mkdir -p "$GLINER_MODEL_PATH"
curl -sL "https://huggingface.co/api/models/${GLINER_MODEL_ID}" | jq -r '.siblings[].rfilename' | while read -r file; do
    target="${GLINER_MODEL_PATH}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -sL "https://huggingface.co/${GLINER_MODEL_ID}/resolve/main/${file}" -o "$target"
done
transform/ensure_minilm_model.sh (Normal file, 16 lines)
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

if [ -d "$MINILM_MODEL_PATH" ] && find "$MINILM_MODEL_PATH" -type f | grep -q .; then
    echo "MiniLM model already present at $MINILM_MODEL_PATH"
    exit 0
fi

echo "Downloading MiniLM model to $MINILM_MODEL_PATH"
mkdir -p "$MINILM_MODEL_PATH"
curl -sL "https://huggingface.co/api/models/${MINILM_MODEL_ID}" | jq -r '.siblings[].rfilename' | while read -r file; do
    target="${MINILM_MODEL_PATH}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -sL "https://huggingface.co/${MINILM_MODEL_ID}/resolve/main/${file}" -o "$target"
done
transform/entrypoint.sh (Normal file, 10 lines)
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail

# Run model download with output to stdout/stderr
/usr/local/bin/ensure_minilm_model.sh 2>&1
/usr/local/bin/ensure_gliner_model.sh 2>&1

# Start cron in foreground with logging
exec cron -f -L 2
# cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1
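Both ensure_*_model.sh scripts expect `curl` and `jq` on the PATH and read the Hugging Face repo id plus the local target directory from the environment. A minimal sketch of the matching transform/.env entries; only the variable names come from the scripts above, the model ids and paths shown here are illustrative assumptions and are not pinned anywhere in this diff:

```
# Hypothetical values; only the variable names are taken from the scripts
GLINER_MODEL_ID='urchade/gliner_multi-v2.1'
GLINER_MODEL_PATH='/models/gliner'
MINILM_MODEL_ID='sentence-transformers/all-MiniLM-L6-v2'
MINILM_MODEL_PATH='/models/minilm'
```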
transform/example_node.py (Normal file, 170 lines)
@@ -0,0 +1,170 @@
"""Example template node for the transform pipeline.
|
||||||
|
|
||||||
|
This is a template showing how to create new transform nodes.
|
||||||
|
Copy this file and modify it for your specific transformation needs.
|
||||||
|
"""
|
||||||
|
from pipeline import TransformContext
|
||||||
|
from transform_node import TransformNode
|
||||||
|
import sqlite3
|
||||||
|
import pandas as pd
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger("knack-transform")
|
||||||
|
|
||||||
|
|
||||||
|
class ExampleNode(TransformNode):
|
||||||
|
"""Example transform node template.
|
||||||
|
|
||||||
|
This node demonstrates the basic structure for creating
|
||||||
|
new transformation nodes in the pipeline.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
param1: str = "default_value",
|
||||||
|
param2: int = 42,
|
||||||
|
device: str = "cpu"):
|
||||||
|
"""Initialize the ExampleNode.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
param1: Example string parameter
|
||||||
|
param2: Example integer parameter
|
||||||
|
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||||
|
"""
|
||||||
|
self.param1 = param1
|
||||||
|
self.param2 = param2
|
||||||
|
self.device = device
|
||||||
|
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
|
||||||
|
|
||||||
|
def _create_tables(self, con: sqlite3.Connection):
|
||||||
|
"""Create any necessary tables in the database.
|
||||||
|
|
||||||
|
This is optional - only needed if your node creates new tables.
|
||||||
|
"""
|
||||||
|
logger.info("Creating example tables")
|
||||||
|
|
||||||
|
con.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS example_results (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
post_id INTEGER,
|
||||||
|
result_value TEXT,
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
FOREIGN KEY (post_id) REFERENCES posts(id)
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
con.commit()
|
||||||
|
|
||||||
|
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
"""Process the input dataframe.
|
||||||
|
|
||||||
|
This is where your main transformation logic goes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df: Input dataframe from context
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Processed dataframe
|
||||||
|
"""
|
||||||
|
logger.info(f"Processing {len(df)} rows")
|
||||||
|
|
||||||
|
# Example: Add a new column based on existing data
|
||||||
|
result_df = df.copy()
|
||||||
|
result_df['processed'] = True
|
||||||
|
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
|
||||||
|
|
||||||
|
logger.info("Processing complete")
|
||||||
|
return result_df
|
||||||
|
|
||||||
|
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
|
||||||
|
"""Store results back to the database.
|
||||||
|
|
||||||
|
This is optional - only needed if you want to persist results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
con: Database connection
|
||||||
|
df: Processed dataframe to store
|
||||||
|
"""
|
||||||
|
if df.empty:
|
||||||
|
logger.info("No results to store")
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info(f"Storing {len(df)} results")
|
||||||
|
|
||||||
|
# Example: Store to database
|
||||||
|
# df[['post_id', 'result_value']].to_sql(
|
||||||
|
# 'example_results',
|
||||||
|
# con,
|
||||||
|
# if_exists='append',
|
||||||
|
# index=False
|
||||||
|
# )
|
||||||
|
|
||||||
|
con.commit()
|
||||||
|
logger.info("Results stored successfully")
|
||||||
|
|
||||||
|
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||||
|
"""Execute the transformation.
|
||||||
|
|
||||||
|
This is the main entry point called by the pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
con: SQLite database connection
|
||||||
|
context: TransformContext containing input dataframe
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
TransformContext with processed dataframe
|
||||||
|
"""
|
||||||
|
logger.info("Starting ExampleNode transformation")
|
||||||
|
|
||||||
|
# Get input dataframe from context
|
||||||
|
input_df = context.get_dataframe()
|
||||||
|
|
||||||
|
# Validate input
|
||||||
|
if input_df.empty:
|
||||||
|
logger.warning("Empty dataframe provided to ExampleNode")
|
||||||
|
return context
|
||||||
|
|
||||||
|
# Create any necessary tables
|
||||||
|
self._create_tables(con)
|
||||||
|
|
||||||
|
# Process the data
|
||||||
|
result_df = self._process_data(input_df)
|
||||||
|
|
||||||
|
# Store results (optional)
|
||||||
|
self._store_results(con, result_df)
|
||||||
|
|
||||||
|
logger.info("ExampleNode transformation complete")
|
||||||
|
|
||||||
|
# Return new context with results
|
||||||
|
return TransformContext(result_df)
|
||||||
|
|
||||||
|
|
||||||
|
# Example usage:
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# This allows you to test your node independently
|
||||||
|
import os
|
||||||
|
os.chdir('/Users/linussilberstein/Documents/Knack-Scraper/transform')
|
||||||
|
|
||||||
|
from pipeline import TransformContext
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
# Create test data
|
||||||
|
test_df = pd.DataFrame({
|
||||||
|
'id': [1, 2, 3],
|
||||||
|
'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
|
||||||
|
})
|
||||||
|
|
||||||
|
# Create test database connection
|
||||||
|
test_con = sqlite3.connect(':memory:')
|
||||||
|
|
||||||
|
# Create and run node
|
||||||
|
node = ExampleNode(param1="test", param2=100)
|
||||||
|
context = TransformContext(test_df)
|
||||||
|
result_context = node.run(test_con, context)
|
||||||
|
|
||||||
|
# Check results
|
||||||
|
result_df = result_context.get_dataframe()
|
||||||
|
print("\nResult DataFrame:")
|
||||||
|
print(result_df)
|
||||||
|
|
||||||
|
test_con.close()
|
||||||
|
print("\n✓ ExampleNode test completed successfully!")
|
||||||
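Besides the built-in `__main__` self-test, a node copied from this template would be registered with the pipeline through `NodeConfig`. A short sketch using only the API defined in pipeline.py; the node name and kwargs here are illustrative:

```python
# Hypothetical registration of a node based on this template
from pipeline import ParallelPipeline, NodeConfig
from example_node import ExampleNode

pipeline = ParallelPipeline(max_workers=2, use_processes=False)
pipeline.add_node(NodeConfig(
    node_class=ExampleNode,
    node_kwargs={'param1': 'demo', 'device': 'cpu'},
    dependencies=[],  # no upstream nodes, so it runs in the first stage
    name='ExampleNode'
))
```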
transform/main.py (Normal file, 102 lines)
@@ -0,0 +1,102 @@
#! python3
import logging
import os
import sqlite3
import sys
from dotenv import load_dotenv

load_dotenv()

if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-transform")


def setup_database_connection():
    """Create connection to the SQLite database."""
    db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


def table_exists(tablename: str, con: sqlite3.Connection):
    """Check if a table exists in the database."""
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def main():
    """Main entry point for the transform pipeline."""
    logger.info("Starting transform pipeline")

    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Check if posts table exists
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform components
        from pipeline import create_default_pipeline, TransformContext
        import pandas as pd

        # Load posts data
        logger.info("Loading posts from database")
        sql = "SELECT * FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0)"
        # MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 100)
        df = pd.read_sql(sql, con)
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create initial context
        context = TransformContext(df)

        # Create and run parallel pipeline
        device = os.environ.get('COMPUTE_DEVICE', 'cpu')
        max_workers = int(os.environ.get('MAX_WORKERS', 4))

        pipeline = create_default_pipeline(device=device, max_workers=max_workers)
        results = pipeline.run(
            db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
            initial_context=context,
            fail_fast=False  # Continue even if some nodes fail
        )

        logger.info(f"Pipeline completed. Processed {len(results)} node(s)")

        # Mark all processed posts as cleaned
        post_ids = df['id'].tolist()
        if post_ids:
            placeholders = ','.join('?' * len(post_ids))
            con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
            con.commit()
            logger.info(f"Marked {len(post_ids)} posts as cleaned")

    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        sys.exit(1)
    finally:
        if 'con' in locals():
            con.close()
            logger.info("Database connection closed")


if __name__ == "__main__":
    main()
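main.py reads its runtime settings from the environment (loaded via python-dotenv) and falls back to /data/knack.sqlite, 'cpu' and 4 workers when a variable is missing. A sketch of a matching transform/.env with the variables referenced in this file; the values shown are illustrative:

```
DB_PATH='/data/knack.sqlite'
COMPUTE_DEVICE='cpu'
MAX_WORKERS=4
LOGGING_LEVEL='INFO'
```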
transform/pipeline.py (Normal file, 266 lines)
@@ -0,0 +1,266 @@
"""Parallel pipeline orchestration for transform nodes."""
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import multiprocessing as mp
|
||||||
|
|
||||||
|
logger = logging.getLogger("knack-transform")
|
||||||
|
|
||||||
|
class TransformContext:
|
||||||
|
"""Context object containing the dataframe for transformation."""
|
||||||
|
# Possibly add a dict for the context to give more Information
|
||||||
|
|
||||||
|
def __init__(self, df: pd.DataFrame):
|
||||||
|
self.df = df
|
||||||
|
|
||||||
|
def get_dataframe(self) -> pd.DataFrame:
|
||||||
|
"""Get the pandas dataframe from the context."""
|
||||||
|
return self.df
|
||||||
|
|
||||||
|
class NodeConfig:
|
||||||
|
"""Configuration for a transform node."""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
node_class: type,
|
||||||
|
node_kwargs: Dict = None,
|
||||||
|
dependencies: List[str] = None,
|
||||||
|
name: str = None):
|
||||||
|
"""Initialize node configuration.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
node_class: The TransformNode class to instantiate
|
||||||
|
node_kwargs: Keyword arguments to pass to node constructor
|
||||||
|
dependencies: List of node names that must complete before this one
|
||||||
|
name: Optional name for the node (defaults to class name)
|
||||||
|
"""
|
||||||
|
self.node_class = node_class
|
||||||
|
self.node_kwargs = node_kwargs or {}
|
||||||
|
self.dependencies = dependencies or []
|
||||||
|
self.name = name or node_class.__name__
|
||||||
|
|
||||||
|
class ParallelPipeline:
|
||||||
|
"""Pipeline for executing transform nodes in parallel where possible.
|
||||||
|
|
||||||
|
The pipeline analyzes dependencies between nodes and executes
|
||||||
|
independent nodes concurrently using multiprocessing or threading.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
max_workers: Optional[int] = None,
|
||||||
|
use_processes: bool = False):
|
||||||
|
"""Initialize the parallel pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_workers: Maximum number of parallel workers (defaults to CPU count)
|
||||||
|
use_processes: If True, use ProcessPoolExecutor; if False, use ThreadPoolExecutor
|
||||||
|
"""
|
||||||
|
self.max_workers = max_workers or mp.cpu_count()
|
||||||
|
self.use_processes = use_processes
|
||||||
|
self.nodes: Dict[str, NodeConfig] = {}
|
||||||
|
logger.info(f"Initialized ParallelPipeline with {self.max_workers} workers "
|
||||||
|
f"({'processes' if use_processes else 'threads'})")
|
||||||
|
|
||||||
|
def add_node(self, config: NodeConfig):
|
||||||
|
"""Add a node to the pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: NodeConfig with node details and dependencies
|
||||||
|
"""
|
||||||
|
self.nodes[config.name] = config
|
||||||
|
logger.info(f"Added node '{config.name}' with dependencies: {config.dependencies}")
|
||||||
|
|
||||||
|
def _get_execution_stages(self) -> List[List[str]]:
|
||||||
|
"""Determine execution stages based on dependencies.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of stages, where each stage contains node names that can run in parallel
|
||||||
|
"""
|
||||||
|
stages = []
|
||||||
|
completed = set()
|
||||||
|
remaining = set(self.nodes.keys())
|
||||||
|
|
||||||
|
while remaining:
|
||||||
|
# Find nodes whose dependencies are all completed
|
||||||
|
ready = []
|
||||||
|
for node_name in remaining:
|
||||||
|
config = self.nodes[node_name]
|
||||||
|
if all(dep in completed for dep in config.dependencies):
|
||||||
|
ready.append(node_name)
|
||||||
|
|
||||||
|
if not ready:
|
||||||
|
# Circular dependency or missing dependency
|
||||||
|
raise ValueError(f"Cannot resolve dependencies. Remaining nodes: {remaining}")
|
||||||
|
|
||||||
|
stages.append(ready)
|
||||||
|
completed.update(ready)
|
||||||
|
remaining -= set(ready)
|
||||||
|
|
||||||
|
return stages
|
||||||
|
|
||||||
|
def _execute_node(self,
|
||||||
|
node_name: str,
|
||||||
|
db_path: str,
|
||||||
|
context: TransformContext) -> tuple:
|
||||||
|
"""Execute a single node.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
node_name: Name of the node to execute
|
||||||
|
db_path: Path to the SQLite database
|
||||||
|
context: TransformContext for the node
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (node_name, result_context, error)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Create fresh database connection (not shared across processes/threads)
|
||||||
|
con = sqlite3.connect(db_path)
|
||||||
|
|
||||||
|
config = self.nodes[node_name]
|
||||||
|
node = config.node_class(**config.node_kwargs)
|
||||||
|
|
||||||
|
logger.info(f"Executing node: {node_name}")
|
||||||
|
result_context = node.run(con, context)
|
||||||
|
|
||||||
|
con.close()
|
||||||
|
logger.info(f"Node '{node_name}' completed successfully")
|
||||||
|
|
||||||
|
return node_name, result_context, None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error executing node '{node_name}': {e}", exc_info=True)
|
||||||
|
return node_name, None, str(e)
|
||||||
|
|
||||||
|
def run(self,
|
||||||
|
db_path: str,
|
||||||
|
initial_context: TransformContext,
|
||||||
|
fail_fast: bool = False) -> Dict[str, TransformContext]:
|
||||||
|
"""Execute the pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
db_path: Path to the SQLite database
|
||||||
|
initial_context: Initial TransformContext for the pipeline
|
||||||
|
fail_fast: If True, stop execution on first error
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping node names to their output TransformContext
|
||||||
|
"""
|
||||||
|
logger.info("Starting parallel pipeline execution")
|
||||||
|
|
||||||
|
stages = self._get_execution_stages()
|
||||||
|
logger.info(f"Pipeline has {len(stages)} execution stage(s)")
|
||||||
|
|
||||||
|
results = {}
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
ExecutorClass = ProcessPoolExecutor if self.use_processes else ThreadPoolExecutor
|
||||||
|
|
||||||
|
for stage_num, stage_nodes in enumerate(stages, 1):
|
||||||
|
logger.info(f"Stage {stage_num}/{len(stages)}: Executing {len(stage_nodes)} node(s) in parallel: {stage_nodes}")
|
||||||
|
|
||||||
|
# For nodes in this stage, use the context from their dependencies
|
||||||
|
# If multiple dependencies, we'll use the most recent one (or could merge)
|
||||||
|
stage_futures = {}
|
||||||
|
|
||||||
|
with ExecutorClass(max_workers=min(self.max_workers, len(stage_nodes))) as executor:
|
||||||
|
for node_name in stage_nodes:
|
||||||
|
config = self.nodes[node_name]
|
||||||
|
|
||||||
|
# Get context from dependencies (use the last dependency's output)
|
||||||
|
if config.dependencies:
|
||||||
|
context = results.get(config.dependencies[-1], initial_context)
|
||||||
|
else:
|
||||||
|
context = initial_context
|
||||||
|
|
||||||
|
future = executor.submit(self._execute_node, node_name, db_path, context)
|
||||||
|
stage_futures[future] = node_name
|
||||||
|
|
||||||
|
# Wait for all nodes in this stage to complete
|
||||||
|
for future in as_completed(stage_futures):
|
||||||
|
node_name = stage_futures[future]
|
||||||
|
name, result_context, error = future.result()
|
||||||
|
|
||||||
|
if error:
|
||||||
|
errors.append((name, error))
|
||||||
|
if fail_fast:
|
||||||
|
logger.error(f"Pipeline failed at node '{name}': {error}")
|
||||||
|
raise RuntimeError(f"Node '{name}' failed: {error}")
|
||||||
|
else:
|
||||||
|
results[name] = result_context
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
logger.warning(f"Pipeline completed with {len(errors)} error(s)")
|
||||||
|
for name, error in errors:
|
||||||
|
logger.error(f" - {name}: {error}")
|
||||||
|
else:
|
||||||
|
logger.info("Pipeline completed successfully")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def create_default_pipeline(device: str = "cpu",
|
||||||
|
max_workers: Optional[int] = None) -> ParallelPipeline:
|
||||||
|
"""Create a pipeline with default transform nodes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: Device to use for compute-intensive nodes ('cpu', 'cuda', 'mps')
|
||||||
|
max_workers: Maximum number of parallel workers
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Configured ParallelPipeline
|
||||||
|
"""
|
||||||
|
from author_node import NerAuthorNode, FuzzyAuthorNode
|
||||||
|
from embeddings_node import TextEmbeddingNode, UmapNode
|
||||||
|
|
||||||
|
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
||||||
|
|
||||||
|
# Add AuthorNode (no dependencies)
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=NerAuthorNode,
|
||||||
|
node_kwargs={
|
||||||
|
'device': device,
|
||||||
|
'model_path': os.environ.get('GLINER_MODEL_PATH')
|
||||||
|
},
|
||||||
|
dependencies=[],
|
||||||
|
name='AuthorNode'
|
||||||
|
))
|
||||||
|
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=FuzzyAuthorNode,
|
||||||
|
node_kwargs={
|
||||||
|
'max_l_dist': 1
|
||||||
|
},
|
||||||
|
dependencies=['AuthorNode'],
|
||||||
|
name='FuzzyAuthorNode'
|
||||||
|
))
|
||||||
|
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=TextEmbeddingNode,
|
||||||
|
node_kwargs={
|
||||||
|
'device': device,
|
||||||
|
'model_path': os.environ.get('MINILM_MODEL_PATH')
|
||||||
|
},
|
||||||
|
dependencies=[],
|
||||||
|
name='TextEmbeddingNode'
|
||||||
|
))
|
||||||
|
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=UmapNode,
|
||||||
|
node_kwargs={},
|
||||||
|
dependencies=['TextEmbeddingNode'],
|
||||||
|
name='UmapNode'
|
||||||
|
))
|
||||||
|
|
||||||
|
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||||
|
|
||||||
|
# pipeline.add_node(NodeConfig(
|
||||||
|
# node_class=UMAPNode,
|
||||||
|
# node_kwargs={'device': device},
|
||||||
|
# dependencies=['EmbeddingNode'], # Runs after EmbeddingNode
|
||||||
|
# name='UMAPNode'
|
||||||
|
# ))
|
||||||
|
|
||||||
|
return pipeline
|
||||||
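create_default_pipeline wires the nodes into two independent chains (AuthorNode followed by FuzzyAuthorNode, and TextEmbeddingNode followed by UmapNode), so the stage resolver can run the two chains side by side. A minimal usage sketch against the API defined above, assuming the model environment variables are set and using an illustrative dataframe in place of the real posts table:

```python
import pandas as pd
from pipeline import create_default_pipeline, TransformContext

# Illustrative input; the real caller loads uncleaned posts from SQLite
df = pd.DataFrame({'id': [1, 2],
                   'author': ['A. Author', None],
                   'content': ['first post', 'second post']})

pipeline = create_default_pipeline(device='cpu', max_workers=4)
results = pipeline.run(
    db_path='/data/knack.sqlite',      # each node opens its own connection
    initial_context=TransformContext(df),
    fail_fast=False,                   # collect per-node errors instead of aborting
)
for name, ctx in results.items():
    print(name, len(ctx.get_dataframe()))
```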
transform/requirements.txt (Normal file, 7 lines)
@@ -0,0 +1,7 @@
pandas
python-dotenv
gliner
torch
fuzzysearch
sentence_transformers
umap-learn
transform/transform_node.py (Normal file, 26 lines)
@@ -0,0 +1,26 @@
"""Base transform node for data pipeline."""
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from pipeline import TransformContext
|
||||||
|
|
||||||
|
class TransformNode(ABC):
|
||||||
|
"""Abstract base class for transformation nodes.
|
||||||
|
|
||||||
|
Each transform node implements a single transformation step
|
||||||
|
that takes data from the database, transforms it, and
|
||||||
|
potentially writes results back.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||||
|
"""Execute the transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
con: SQLite database connection
|
||||||
|
context: TransformContext containing the input dataframe
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
TransformContext with the transformed dataframe
|
||||||
|
"""
|
||||||
|
pass
|
||||||
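For comparison with the full template in example_node.py, the smallest concrete node only needs a run method that hands back a TransformContext. A toy sketch, not part of this diff:

```python
import sqlite3

from pipeline import TransformContext
from transform_node import TransformNode


class LowercaseAuthorNode(TransformNode):
    """Toy node: normalizes the author column to lower case."""

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        df = context.get_dataframe().copy()
        df['author'] = df['author'].str.lower()  # purely in-memory, no DB writes
        return TransformContext(df)
```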