Implements Feature to cleanup authors freetext field

2025-12-21 21:18:05 +01:00 · 2025-12-21 21:18:05 +01:00 · 64df8fb328
commit 64df8fb328
parent bcd210ce01
14 changed files with 804 additions and 310 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
 data/
 venv/
+experiment/
 .DS_STORE
+.env
--- a/14
+++ b/14
@ -1,2 +1,12 @@
-build:
-	docker build -t knack-scraper .
+volume:
+	docker volume create knack_data
+
+stop:
+	docker stop knack-scraper || true
+	docker rm knack-scraper || true
+
+up:
+	docker compose up -d
+
+down:
+	docker compose down
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,27 @@
+services:
+  scraper:
+    build:
+      context: ./scrape
+      dockerfile: Dockerfile
+    image: knack-scraper
+    container_name: knack-scraper
+    env_file:
+      - scrape/.env
+    volumes:
+      - knack_data:/data
+    restart: unless-stopped
+
+  transform:
+    build:
+      context: ./transform
+      dockerfile: Dockerfile
+    image: knack-transform
+    container_name: knack-transform
+    env_file:
+      - transform/.env
+    volumes:
+      - knack_data:/data
+    restart: unless-stopped
+
+volumes:
+  knack_data:
--- a/main.py
+++ b/main.py
@ -1,306 +0,0 @@
-#! python3
-import logging
-import os
-import sqlite3
-import time
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
-import sys
-
-from dotenv import load_dotenv
-import pandas as pd
-import requests
-import tqdm
-from bs4 import BeautifulSoup
-
-load_dotenv()
-
-if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
-    logging_level = logging.INFO
-else:
-    logging_level = logging.DEBUG
-
-logging.basicConfig(
-    level=logging_level,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler("app.log"),
-        logging.StreamHandler(sys.stdout)
-    ]
-)
-logger = logging.getLogger("knack-scraper")
-
-def table_exists(tablename: str, con: sqlite3.Connection):
-    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
-    return len(con.execute(query, [tablename]).fetchall()) > 0
-
-
-def download(id: int):
-    if id == 0:
-        return
-    base_url = "https://knack.news/"
-    url = f"{base_url}{id}"
-    res = requests.get(url)
-
-    # make sure we don't dos knack
-    time.sleep(2)
-
-    if not (200 <= res.status_code <= 300):
-        return
-
-    logger.debug("Found promising page with id %d!", id)
-
-    content = res.content
-    soup = BeautifulSoup(content, "html.parser")
-
-    pC = soup.find("div", {"class": "postContent"})
-
-    if pC is None:
-        # not a normal post
-        logger.debug(
-            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
-        )
-        return
-
-    # every post has these fields
-    title = pC.find("h3", {"class": "postTitle"}).text
-    postText = pC.find("div", {"class": "postText"})
-
-    # these fields are possible but not required
-    # TODO: cleanup
-    try:
-        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
-        day = int(date_parts[0][:-1])
-        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
-        month = months[date_parts[1]]
-        year = int(date_parts[2])
-        parsed_date = datetime(year, month, day)
-    except Exception:
-        parsed_date = None
-
-    try:
-        author = pC.find("span", {"class": "author"}).text
-    except AttributeError:
-        author = None
-
-    try:
-        category = pC.find("span", {"class": "categoryInfo"}).find_all()
-        category = [c.text for c in category if c.text != 'Alle Artikel']
-        category = ";".join(category)
-    except AttributeError:
-        category = None
-
-    try:
-        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
-        tags = ";".join(tags)
-    except AttributeError:
-        tags = None
-
-    img = pC.find("img", {"class": "postImage"})
-    if img is not None:
-        img = img["src"]
-
-    res_dict = {
-        "id": id,
-        "title": title,
-        "author": author,
-        "date": parsed_date,
-        "category": category,
-        "url": url,
-        "img_link": img,
-        "tags": tags,
-        "text": postText.text,
-        "html": str(postText),
-        "scraped_at": datetime.now(),
-    }
-
-    return res_dict
-
-
-def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
-    res = []
-
-    logger.info(
-        "Started parallel scrape of posts from id %d to id %d using %d threads.",
-        min_id,
-        max_id - 1,
-        num_threads,
-    )
-    with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        # Use a list comprehension to create a list of futures
-        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
-
-        for future in tqdm.tqdm(
-            futures, total=max_id - min_id
-        ):  # tqdm to track progress
-            post = future.result()
-            if post is not None:
-                res.append(post)
-
-    # sqlite can't handle lists so let's convert them to a single row csv
-    # TODO: make sure our database is properly normalized
-    postdf = pd.DataFrame(res)
-    tagdf = None
-    posttotagdf = None
-    categorydf = None
-    postcategorydf = None
-
-    # Extract and create tags dataframe
-    if not postdf.empty and 'tags' in postdf.columns:
-        # Collect all unique tags
-        all_tags = set()
-        for tags_str in postdf['tags']:
-            if pd.notna(tags_str):
-                tags_list = [tag.strip() for tag in tags_str.split(';')]
-                all_tags.update(tags_list)
-        
-        # Create tagdf with id and text columns
-        if all_tags:
-            all_tags = sorted(list(all_tags))
-            tagdf = pd.DataFrame({
-                'id': range(len(all_tags)),
-                'tag': all_tags
-            })
-            
-            # Create posttotagdf mapping table
-            rows = []
-            for post_id, tags_str in zip(postdf['id'], postdf['tags']):
-                if pd.notna(tags_str):
-                    tags_list = [tag.strip() for tag in tags_str.split(';')]
-                    for tag_text in tags_list:
-                        tag_id = tagdf[tagdf['tag'] == tag_text]['id'].values[0]
-                        rows.append({'post_id': post_id, 'tag_id': tag_id})
-            
-            if rows:
-                posttotagdf = pd.DataFrame(rows)
-
-    # Extract and create categories dataframe
-    if not postdf.empty and 'category' in postdf.columns:
-        # Collect all unique categories
-        all_categories = set()
-        for category_str in postdf['category']:
-            if pd.notna(category_str):
-                category_list = [cat.strip() for cat in category_str.split(';')]
-                all_categories.update(category_list)
-        
-        # Create categorydf with id and category columns
-        if all_categories:
-            all_categories = sorted(list(all_categories))
-            categorydf = pd.DataFrame({
-                'id': range(len(all_categories)),
-                'category': all_categories
-            })
-            
-            # Create postcategorydf mapping table
-            rows = []
-            for post_id, category_str in zip(postdf['id'], postdf['category']):
-                if pd.notna(category_str):
-                    category_list = [cat.strip() for cat in category_str.split(';')]
-                    for category_text in category_list:
-                        category_id = categorydf[categorydf['category'] == category_text]['id'].values[0]
-                        rows.append({'post_id': post_id, 'category_id': category_id})
-            
-            if rows:
-                postcategorydf = pd.DataFrame(rows)
-
-    return postdf, tagdf, posttotagdf, categorydf, postcategorydf
-
-
-def main():
-    num_threads = int(os.environ.get("NUM_THREADS", 8))
-    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
-    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
-
-    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")
-
-    con = sqlite3.connect(database_location)
-    with con:
-        post_table_exists = table_exists("posts", con)
-
-        if post_table_exists:
-            logger.info("found posts retrieved earlier")
-            # retrieve max post id from db so
-            # we can skip retrieving known posts
-            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
-            logger.info("Got max id %d!", max_id_in_db)
-        else:
-            logger.info("no posts scraped so far - starting from 0")
-            # retrieve from 0 onwards
-            max_id_in_db = -1
-
-    con = sqlite3.connect(database_location)
-    postdf, tagdf, posttotagdf, categorydf, postcategorydf = run_downloads(
-        min_id=max_id_in_db + 1,
-        max_id=max_id_in_db + n_scrapes,
-        num_threads=num_threads,
-    )
-    postdf.to_sql("posts", con, if_exists="append")
-    
-    # Handle tags dataframe merging and storage
-    if tagdf is not None and not tagdf.empty:
-        # Check if tags table already exists
-        if table_exists("tags", con):
-            # Read existing tags from database
-            existing_tagdf = pd.read_sql("SELECT id, tag FROM tags", con)
-            
-            # Merge new tags with existing tags, avoiding duplicates
-            merged_tagdf = pd.concat([existing_tagdf, tagdf], ignore_index=False)
-            merged_tagdf = merged_tagdf.drop_duplicates(subset=['tag'], keep='first')
-            merged_tagdf = merged_tagdf.reset_index(drop=True)
-            merged_tagdf['id'] = range(len(merged_tagdf))
-            
-            # Drop the old table and insert the merged data
-            con.execute("DROP TABLE tags")
-            con.commit()
-            merged_tagdf.to_sql("tags", con, if_exists="append", index=False)
-            
-            # Update tag_id references in posttotagdf
-            if posttotagdf is not None and not posttotagdf.empty:
-                #tag_mapping = dict(zip(tagdf['tag'], tagdf['id']))
-                posttotagdf['tag_id'] = posttotagdf['tag_id'].map(
-                    lambda old_id: merged_tagdf[merged_tagdf['tag'] == tagdf.loc[old_id, 'tag']]['id'].values[0]
-                )
-        else:
-            # First time creating tags table
-            tagdf.to_sql("tags", con, if_exists="append", index=False)
-    
-    # Store posttags (post to tags mapping)
-    if posttotagdf is not None and not posttotagdf.empty:
-        posttotagdf.to_sql("posttags", con, if_exists="append", index=False)
-    
-    # Handle categories dataframe merging and storage
-    if categorydf is not None and not categorydf.empty:
-        # Check if categories table already exists
-        if table_exists("categories", con):
-            # Read existing categories from database
-            existing_categorydf = pd.read_sql("SELECT id, category FROM categories", con)
-            
-            # Merge new categories with existing categories, avoiding duplicates
-            merged_categorydf = pd.concat([existing_categorydf, categorydf], ignore_index=False)
-            merged_categorydf = merged_categorydf.drop_duplicates(subset=['category'], keep='first')
-            merged_categorydf = merged_categorydf.reset_index(drop=True)
-            merged_categorydf['id'] = range(len(merged_categorydf))
-            
-            # Drop the old table and insert the merged data
-            con.execute("DROP TABLE categories")
-            con.commit()
-            merged_categorydf.to_sql("categories", con, if_exists="append", index=False)
-            
-            # Update category_id references in postcategorydf
-            if postcategorydf is not None and not postcategorydf.empty:
-                postcategorydf['category_id'] = postcategorydf['category_id'].map(
-                    lambda old_id: merged_categorydf[merged_categorydf['category'] == categorydf.loc[old_id, 'category']]['id'].values[0]
-                )
-        else:
-            # First time creating categories table
-            categorydf.to_sql("categories", con, if_exists="append", index=False)
-    
-    # Store postcategories (post to categories mapping)
-    if postcategorydf is not None and not postcategorydf.empty:
-        postcategorydf.to_sql("postcategories", con, if_exists="append", index=False)
-
-    logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")
-
-
-if __name__ == "__main__":
-    main()
--- a/scrape/Dockerfile
+++ b/scrape/Dockerfile
@ -3,6 +3,8 @@ FROM python:slim
 RUN mkdir /app
 RUN mkdir /data

+#COPY /data/knack.sqlite /data
+
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@ -19,7 +21,7 @@ ENV LANG=de_DE.UTF-8
 ENV LC_ALL=de_DE.UTF-8

 # Create cron job that runs every 15 minutes with environment variables
-RUN echo "*/10 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
+RUN echo "5 4 * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-scraper
 RUN chmod 0644 /etc/cron.d/knack-scraper
 RUN crontab /etc/cron.d/knack-scraper

--- a/scrape/main.py
+++ b/scrape/main.py
@ -0,0 +1,260 @@
+#! python3
+import logging
+import os
+import sqlite3
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+import sys
+
+from dotenv import load_dotenv
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+load_dotenv()
+
+if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
+    logging_level = logging.INFO
+else:
+    logging_level = logging.DEBUG
+
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-scraper")
+
+def table_exists(tablename: str, con: sqlite3.Connection):
+    """Return True if a table named `tablename` exists in the database behind `con`."""
+    # The table name is bound as a query parameter rather than formatted into the SQL.
+    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
+    return len(con.execute(query, [tablename]).fetchall()) > 0
+
+
+def split_semicolon_list(value: str):
+    """Split a ';'-separated value into a list of stripped, non-empty items.
+
+    Returns an empty list when `value` is missing according to `pd.isna`.
+    Any other value is converted with `str()` before splitting.
+    """
+    if pd.isna(value):
+        return []
+    return [item.strip() for item in str(value).split(';') if item.strip()]
+
+
+def build_dimension_and_mapping(postdf: pd.DataFrame, field_name: str, dim_col: str):
+    """Extract unique dimension values and post-to-dimension mappings from a column.
+
+    Args:
+        postdf: Posts dataframe; must have an 'id' column and, to produce
+            anything, a `field_name` column holding ';'-separated strings.
+        field_name: Name of the column to split (e.g. 'tags').
+        dim_col: Column name used for the value in both returned frames.
+
+    Returns:
+        A tuple (dim_df, map_df). dim_df has columns 'id' and `dim_col`, with
+        ids 0..n-1 assigned in sorted value order. map_df has columns
+        'post_id' and `dim_col`; it carries the text value, not an id.
+        Returns (None, None) when `postdf` is empty, the column is missing,
+        or no value was found.
+    """
+    if postdf.empty or field_name not in postdf.columns:
+        return None, None
+
+    values = set()
+    mapping_rows = []
+
+    # One mapping row per (post, item); the set collects the distinct items.
+    for post_id, raw in zip(postdf['id'], postdf[field_name]):
+        items = split_semicolon_list(raw)
+        for item in items:
+            values.add(item)
+            mapping_rows.append({'post_id': post_id, dim_col: item})
+
+    if not values:
+        return None, None
+
+    dim_df = pd.DataFrame({
+        'id': range(len(values)),
+        dim_col: sorted(values),
+    })
+    map_df = pd.DataFrame(mapping_rows)
+    return dim_df, map_df
+
+
+def store_dimension_and_mapping(
+    con: sqlite3.Connection,
+    dim_df: pd.DataFrame | None,
+    map_df: pd.DataFrame | None,
+    table_name: str,
+    dim_col: str,
+    mapping_table: str,
+    mapping_id_col: str,
+):
+    """Persist a dimension table and its mapping table, merging with existing values.
+
+    Args:
+        con: Open SQLite connection.
+        dim_df: Frame with 'id' and `dim_col` columns, or None.
+        map_df: Frame with 'post_id' and `dim_col` columns, or None.
+        table_name: Dimension table to (re)write.
+        dim_col: Name of the value column in both frames and in the table.
+        mapping_table: Table that receives the (post_id, id) rows.
+        mapping_id_col: Name of the id column written to `mapping_table`.
+
+    Does nothing when `dim_df` is None or empty. The dimension table is
+    rewritten in full on every call; mapping rows are appended.
+    """
+    if dim_df is None or dim_df.empty:
+        return
+
+    if table_exists(table_name, con):
+        # NOTE(review): table and column names are formatted into the SQL text;
+        # the callers visible in this file pass literals only.
+        existing = pd.read_sql(f"SELECT id, {dim_col} FROM {table_name}", con)
+        # Existing rows come first so keep='first' prefers them over new duplicates.
+        merged = pd.concat([existing, dim_df], ignore_index=True)
+        merged = merged.drop_duplicates(subset=[dim_col], keep='first').reset_index(drop=True)
+        # NOTE(review): ids are renumbered by row position. Ids already referenced
+        # by earlier mapping rows stay valid only if the existing rows come back
+        # in id order with ids 0..n-1 - the SELECT has no ORDER BY; confirm.
+        merged['id'] = range(len(merged))
+    else:
+        merged = dim_df.copy()
+
+    # Replace table with merged content
+    merged.to_sql(table_name, con, if_exists="replace", index=False)
+
+    if map_df is None or map_df.empty:
+        return
+
+    # Translate each mapping row's text value into the id it has after the merge.
+    value_to_id = dict(zip(merged[dim_col], merged['id']))
+    map_df = map_df.copy()
+    map_df[mapping_id_col] = map_df[dim_col].map(value_to_id)
+    # Rows whose value has no id end up as NaN and are dropped here.
+    map_df = map_df[['post_id', mapping_id_col]].dropna()
+    map_df.to_sql(mapping_table, con, if_exists="append", index=False)
+
+
+def download(id: int):
+    """Fetch the knack.news page for `id` and extract the post fields.
+
+    Returns:
+        A dict with the keys id, title, author, date, category, url,
+        img_link, tags, text, html, scraped_at and is_cleaned, or None when
+        `id` is 0, the status code is outside the accepted range, or the
+        page has no div with class "postContent".
+    """
+    if id == 0:
+        return
+    base_url = "https://knack.news/"
+    url = f"{base_url}{id}"
+    # NOTE(review): no timeout is passed; a stalled connection would presumably
+    # block this worker indefinitely - confirm and consider adding one.
+    res = requests.get(url)
+
+    # make sure we don't dos knack
+    time.sleep(2)
+
+    # NOTE(review): the upper bound is inclusive, so status 300 is accepted too.
+    if not (200 <= res.status_code <= 300):
+        return
+
+    logger.debug("Found promising page with id %d!", id)
+
+    content = res.content
+    soup = BeautifulSoup(content, "html.parser")
+
+    pC = soup.find("div", {"class": "postContent"})
+
+    if pC is None:
+        # not a normal post
+        # NOTE(review): the message below says ".pageContent" while the lookup
+        # above is for class "postContent".
+        logger.debug(
+            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
+        )
+        return
+
+    # every post has these fields
+    # NOTE(review): neither lookup is guarded; a page missing either element
+    # would raise AttributeError here or at postText.text below.
+    title = pC.find("h3", {"class": "postTitle"}).text
+    postText = pC.find("div", {"class": "postText"})
+
+    # these fields are possible but not required
+    # TODO: cleanup
+    try:
+        # Expects three space-separated parts: day, German month name, year.
+        date_parts = pC.find("span", {"class": "singledate"}).text.split(' ')
+        # The last character of the day part is dropped (presumably a trailing '.').
+        day = int(date_parts[0][:-1])
+        months = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
+        month = months[date_parts[1]]
+        year = int(date_parts[2])
+        parsed_date = datetime(year, month, day)
+    except Exception:
+        # Any failure (missing element, unknown month, bad number) leaves the date unset.
+        parsed_date = None
+
+    try:
+        author = pC.find("span", {"class": "author"}).text
+    except AttributeError:
+        author = None
+
+    try:
+        # All descendant elements of the category span, minus the 'Alle Artikel' entry.
+        category = pC.find("span", {"class": "categoryInfo"}).find_all()
+        category = [c.text for c in category if c.text != 'Alle Artikel']
+        category = ";".join(category)
+    except AttributeError:
+        category = None
+
+    try:
+        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
+        tags = ";".join(tags)
+    except AttributeError:
+        tags = None
+
+    # `img` ends up as the src attribute value, or None when there is no image.
+    img = pC.find("img", {"class": "postImage"})
+    if img is not None:
+        img = img["src"]
+
+    res_dict = {
+        "id": id,
+        "title": title,
+        "author": author,
+        "date": parsed_date,
+        "category": category,
+        "url": url,
+        "img_link": img,
+        "tags": tags,
+        "text": postText.text,
+        "html": str(postText),
+        "scraped_at": datetime.now(),
+        # New posts start out uncleaned; AuthorNode in transform/ sets this to 1.
+        "is_cleaned": False
+    }
+
+    return res_dict
+
+
+def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
+    """Download posts for ids in range(min_id, max_id) using a thread pool.
+
+    Args:
+        min_id: First id to fetch (inclusive).
+        max_id: End of the id range (exclusive).
+        num_threads: Size of the thread pool.
+
+    Returns:
+        A DataFrame with one row per post dict returned by `download`;
+        ids for which `download` returned None are left out.
+    """
+    res = []
+
+    logger.info(
+        "Started parallel scrape of posts from id %d to id %d using %d threads.",
+        min_id,
+        max_id - 1,
+        num_threads,
+    )
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        # Use a list comprehension to create a list of futures
+        futures = [executor.submit(download, i) for i in range(min_id, max_id)]
+
+        # Results are collected in submission (id) order; future.result()
+        # re-raises any exception raised inside download.
+        for future in futures:
+            post = future.result()
+            if post is not None:
+                res.append(post)
+
+    postdf = pd.DataFrame(res)
+    return postdf
+
+
+def main():
+    """Scrape the next window of post ids and persist posts, tags and categories.
+
+    Configuration is read from the environment: NUM_THREADS (default 8),
+    NUM_SCRAPES (default 100) and DATABASE_LOCATION
+    (default "../data/knack.sqlite").
+    """
+    num_threads = int(os.environ.get("NUM_THREADS", 8))
+    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
+    database_location = os.environ.get("DATABASE_LOCATION", "../data/knack.sqlite")
+
+    logger.debug(f"Started Knack Scraper: \nNUM_THREADS: {num_threads}\nN_SCRAPES: {n_scrapes}\nDATABASE_LOCATION: {database_location}")
+
+    con = sqlite3.connect(database_location)
+    # NOTE(review): per the sqlite3 docs, `with con` commits or rolls back the
+    # transaction on exit but does not close the connection - confirm intent.
+    with con:
+        if table_exists("posts", con):
+            logger.info("found posts retrieved earlier")
+            # NOTE(review): MAX(id) is NULL (None) for an existing but empty
+            # posts table; the %d log and the `+ 1` below assume an int.
+            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
+            logger.info("Got max id %d!", max_id_in_db)
+        else:
+            logger.info("no posts scraped so far - starting from 0")
+            max_id_in_db = -1
+
+        # NOTE(review): run_downloads iterates range(min_id, max_id), so this
+        # window covers n_scrapes - 1 ids rather than n_scrapes.
+        postdf = run_downloads(
+            min_id=max_id_in_db + 1,
+            max_id=max_id_in_db + n_scrapes,
+            num_threads=num_threads,
+        )
+
+        # `index` is left at its default here, unlike the index=False used
+        # when the dimension and mapping tables are written.
+        postdf.to_sql("posts", con, if_exists="append")
+
+        # Tags
+        tag_dim, tag_map = build_dimension_and_mapping(postdf, 'tags', 'tag')
+        store_dimension_and_mapping(
+            con,
+            tag_dim,
+            tag_map,
+            table_name="tags",
+            dim_col="tag",
+            mapping_table="posttags",
+            mapping_id_col="tag_id",
+        )
+
+        # Categories
+        category_dim, category_map = build_dimension_and_mapping(postdf, 'category', 'category')
+        store_dimension_and_mapping(
+            con,
+            category_dim,
+            category_map,
+            table_name="categories",
+            dim_col="category",
+            mapping_table="postcategories",
+            mapping_id_col="category_id",
+        )
+
+        logger.info(f"scraped new entries. number of new posts: {len(postdf.index)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scrape/requirements.txt
+++ b/scrape/requirements.txt
@ -1,5 +1,4 @@
 pandas
 requests
-tqdm
 bs4
 dotenv
--- a/transform/.env.example
+++ b/transform/.env.example
@ -0,0 +1,4 @@
+LOGGING_LEVEL=INFO
+DB_PATH=/data/knack.sqlite
+MAX_CLEANED_POSTS=1000
+COMPUTE_DEVICE=mps
--- a/transform/Dockerfile
+++ b/transform/Dockerfile
@ -0,0 +1,41 @@
+FROM python:3.12-slim
+
+RUN mkdir /app
+RUN mkdir /data
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    gfortran \
+    libopenblas-dev \
+    liblapack-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+#COPY /data/knack.sqlite /data
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY .env .
+
+RUN apt update -y
+RUN apt install -y cron locales
+
+COPY *.py .
+
+ENV PYTHONUNBUFFERED=1
+ENV LANG=de_DE.UTF-8
+ENV LC_ALL=de_DE.UTF-8
+
+# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
+# Testing every 30 Minutes */30 * * * *
+RUN echo "0 3 * * 0 cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
+RUN chmod 0644 /etc/cron.d/knack-transform
+RUN crontab /etc/cron.d/knack-transform
+
+# Start cron in foreground
+CMD ["cron", "-f"]
+#CMD ["python", "main.py"]
--- a/transform/README.md
+++ b/transform/README.md
@ -0,0 +1,62 @@
+# Knack Transform
+
+Data transformation pipeline for the Knack scraper project.
+
+## Overview
+
+This folder contains the transformation logic that processes data from the SQLite database. It runs on a scheduled basis (every weekend) via cron.
+
+## Structure
+
+- `author_node.py` - Author classification transform node (NER-based)
+- `base.py` - Abstract base class for transform nodes
+- `main.py` - Main entry point and pipeline orchestration
+- `Dockerfile` - Docker image configuration with cron setup
+- `requirements.txt` - Python dependencies
+
+## Transform Nodes
+
+Transform nodes inherit from `TransformNode` and implement the `run` method:
+
+```python
+from base import TransformNode, TransformContext
+import sqlite3
+
+class MyTransform(TransformNode):
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        df = context.get_dataframe()
+        
+        # Transform logic here
+        transformed_df = df.copy()
+        # ... your transformations ...
+        
+        # Optionally write back to database
+        transformed_df.to_sql("my_table", con, if_exists="replace", index=False)
+        
+        return TransformContext(transformed_df)
+```
+
+## Configuration
+
+Copy `.env.example` to `.env` and configure:
+
+- `LOGGING_LEVEL` - Log level (INFO or DEBUG)
+- `DB_PATH` - Path to SQLite database
+- `MAX_CLEANED_POSTS`, `COMPUTE_DEVICE` - additional settings listed in `.env.example`
+
+## Running
+
+### With Docker
+
+```bash
+docker build -t knack-transform .
+docker run -v $(pwd)/data:/data knack-transform
+```
+
+### Locally
+
+```bash
+python main.py
+```
+
+## Cron Schedule
+
+The Docker container runs the transform pipeline every Sunday at 3 AM.
--- a/transform/author_node.py
+++ b/transform/author_node.py
@ -0,0 +1,263 @@
+"""Author classification transform node using NER."""
+from base import TransformNode, TransformContext
+import sqlite3
+import pandas as pd
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+
+try:
+    from gliner import GLiNER
+    import torch
+    GLINER_AVAILABLE = True
+except ImportError:
+    GLINER_AVAILABLE = False
+    logging.warning("GLiNER not available. Install with: pip install gliner")
+
+logger = logging.getLogger("knack-transform")
+
+
+class AuthorNode(TransformNode):
+    """Transform node that extracts and classifies authors using NER.
+    
+    Creates two tables:
+    - authors: stores unique authors with their type (Person, Organisation, etc.)
+    - post_authors: maps posts to their authors
+
+    ``run`` flags posts as ``is_cleaned = 1`` in the ``posts`` table both when
+    author entities were stored for them and when no entity was found.
+    """
+    
+    def __init__(self, model_name: str = "urchade/gliner_medium-v2.1", 
+                 threshold: float = 0.5, 
+                 max_workers: int = 64,
+                 device: str = "cpu"):
+        """Initialize the AuthorNode.
+        
+        Args:
+            model_name: GLiNER model to use
+            threshold: Confidence threshold for entity predictions
+            max_workers: Number of parallel workers for prediction
+            device: Device to run model on ('cpu', 'cuda', 'mps')
+        """
+        self.model_name = model_name
+        self.threshold = threshold
+        self.max_workers = max_workers
+        self.device = device
+        # The model is loaded lazily by _setup_model on first classification.
+        self.model = None
+        # Entity labels passed to the model for every prediction.
+        self.labels = ["Person", "Organisation", "Email", "Newspaper", "NGO"]
+    
+    def _setup_model(self):
+        """Initialize the NER model.
+
+        Raises:
+            ImportError: If the gliner/torch import at module load failed.
+        """
+        if not GLINER_AVAILABLE:
+            raise ImportError("GLiNER is required for AuthorNode. Install with: pip install gliner")
+        
+        logger.info(f"Loading GLiNER model: {self.model_name}")
+        
+        # The requested device is used only if torch reports it as available;
+        # otherwise the model is loaded without an explicit device move.
+        # float16 is requested only on the cuda and mps paths.
+        if self.device == "cuda" and torch.cuda.is_available():
+            self.model = GLiNER.from_pretrained(
+                self.model_name, 
+                max_length=255
+            ).to('cuda', dtype=torch.float16)
+        elif self.device == "mps" and torch.backends.mps.is_available():
+            self.model = GLiNER.from_pretrained(
+                self.model_name, 
+                max_length=255
+            ).to('mps', dtype=torch.float16)
+        else:
+            self.model = GLiNER.from_pretrained(
+                self.model_name, 
+                max_length=255
+            )
+        
+        logger.info("Model loaded successfully")
+    
+    def _predict(self, text_data: dict):
+        """Predict entities for a single author text.
+        
+        Args:
+            text_data: Dict with 'author' and 'id' keys
+            
+        Returns:
+            Tuple of (predictions, post_id) or None
+        """
+        if text_data is None or text_data.get('author') is None:
+            return None
+        
+        predictions = self.model.predict_entities(
+            text_data['author'], 
+            self.labels, 
+            threshold=self.threshold
+        )
+        return predictions, text_data['id']
+    
+    def _classify_authors(self, posts_df: pd.DataFrame):
+        """Classify all authors in the posts dataframe.
+        
+        Args:
+            posts_df: DataFrame with 'id' and 'author' columns
+            
+        Returns:
+            List of dicts with 'text', 'label', 'id' keys
+        """
+        if self.model is None:
+            self._setup_model()
+        
+        # Prepare input data
+        # Rows whose author is missing (pd.notna is False) are never sent to the model.
+        authors_data = []
+        for idx, row in posts_df.iterrows():
+            if pd.notna(row['author']):
+                authors_data.append({
+                    'author': row['author'],
+                    'id': row['id']
+                })
+        
+        logger.info(f"Classifying {len(authors_data)} authors")
+        
+        results = []
+        # NOTE(review): all worker threads call the single shared self.model;
+        # whether predict_entities is safe (or faster) under concurrent calls
+        # is not visible here - confirm.
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = [executor.submit(self._predict, data) for data in authors_data]
+            
+            for future in futures:
+                result = future.result()
+                if result is not None:
+                    predictions, post_id = result
+                    # One result row per predicted entity; a post can yield several.
+                    for pred in predictions:
+                        results.append({
+                            'text': pred['text'],
+                            'label': pred['label'],
+                            'id': post_id
+                        })
+        
+        logger.info(f"Classification complete. Found {len(results)} author entities")
+        return results
+    
+    def _create_tables(self, con: sqlite3.Connection):
+        """Create authors and post_authors tables if they don't exist."""
+        logger.info("Creating authors tables")
+        
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS authors (
+                id INTEGER PRIMARY KEY,
+                name TEXT,
+                type TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """)
+        
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS post_authors (
+                post_id INTEGER,
+                author_id INTEGER,
+                PRIMARY KEY (post_id, author_id),
+                FOREIGN KEY (post_id) REFERENCES posts(id),
+                FOREIGN KEY (author_id) REFERENCES authors(id)
+            )
+        """)
+        
+        con.commit()
+    
+    def _store_authors(self, con: sqlite3.Connection, results: list):
+        """Store classified authors and their mappings.
+        
+        Args:
+            con: Database connection
+            results: List of classification results
+        """
+        if not results:
+            logger.info("No authors to store")
+            return
+        
+        # Convert results to DataFrame
+        results_df = pd.DataFrame(results)
+        
+        # Get unique authors with their types
+        unique_authors = results_df[['text', 'label']].drop_duplicates()
+        unique_authors.columns = ['name', 'type']
+        
+        # Get existing authors
+        existing_authors = pd.read_sql("SELECT id, name FROM authors", con)
+        
+        # Find new authors to insert
+        # NOTE(review): existing authors are matched by name only, while the
+        # dedup above is per (name, type). A name predicted with two labels in
+        # one batch would be inserted twice, and name_to_id below keeps a
+        # single id per name.
+        if not existing_authors.empty:
+            new_authors = unique_authors[~unique_authors['name'].isin(existing_authors['name'])]
+        else:
+            new_authors = unique_authors
+        
+        if not new_authors.empty:
+            logger.info(f"Inserting {len(new_authors)} new authors")
+            new_authors.to_sql('authors', con, if_exists='append', index=False)
+        
+        # Get all authors with their IDs
+        all_authors = pd.read_sql("SELECT id, name FROM authors", con)
+        name_to_id = dict(zip(all_authors['name'], all_authors['id']))
+        
+        # Create post_authors mappings
+        mappings = []
+        for _, row in results_df.iterrows():
+            author_id = name_to_id.get(row['text'])
+            # A name without an id yields None and is skipped.
+            # NOTE(review): the truthiness test would also skip an id of 0.
+            if author_id:
+                mappings.append({
+                    'post_id': row['id'],
+                    'author_id': author_id
+                })
+        
+        if mappings:
+            mappings_df = pd.DataFrame(mappings).drop_duplicates()
+            
+            # Clear existing mappings for these posts (optional, depends on your strategy)
+            # post_ids = tuple(mappings_df['post_id'].unique())
+            # con.execute(f"DELETE FROM post_authors WHERE post_id IN ({','.join('?' * len(post_ids))})", post_ids)
+            
+            # NOTE(review): post_authors has PRIMARY KEY (post_id, author_id);
+            # appending a pair that is already stored would presumably be
+            # rejected by SQLite - confirm that reruns cannot hit this.
+            logger.info(f"Creating {len(mappings_df)} post-author mappings")
+            mappings_df.to_sql('post_authors', con, if_exists='append', index=False)
+            
+            # Mark posts as cleaned
+            processed_post_ids = mappings_df['post_id'].unique().tolist()
+            if processed_post_ids:
+                placeholders = ','.join('?' * len(processed_post_ids))
+                con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", processed_post_ids)
+                logger.info(f"Marked {len(processed_post_ids)} posts as cleaned")
+        
+        con.commit()
+        logger.info("Authors and mappings stored successfully")
+    
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Execute the author classification transformation.
+        
+        Args:
+            con: SQLite database connection
+            context: TransformContext containing posts dataframe
+            
+        Returns:
+            TransformContext with classified authors dataframe
+        """
+        logger.info("Starting AuthorNode transformation")
+        
+        posts_df = context.get_dataframe()
+        
+        # Ensure required columns exist
+        if 'author' not in posts_df.columns:
+            logger.warning("No 'author' column in dataframe. Skipping AuthorNode.")
+            return context
+        
+        # Create tables
+        self._create_tables(con)
+        
+        # Classify authors
+        results = self._classify_authors(posts_df)
+        
+        # Store results
+        self._store_authors(con, results)
+        
+        # Mark posts without author entities as cleaned too (no authors found)
+        # This also covers posts whose author field is missing, since those
+        # never reach the model and so never appear in `results`.
+        # NOTE(review): a post that appears in `results` but got no mapping in
+        # _store_authors is flagged by neither step - confirm this is intended.
+        processed_ids = set([r['id'] for r in results]) if results else set()
+        unprocessed_ids = [pid for pid in posts_df['id'].tolist() if pid not in processed_ids]
+        if unprocessed_ids:
+            # NOTE(review): one bound parameter per id; SQLite caps the number
+            # of parameters per statement, so very large batches may need
+            # chunking - confirm against the configured batch size.
+            placeholders = ','.join('?' * len(unprocessed_ids))
+            con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", unprocessed_ids)
+            con.commit()
+            logger.info(f"Marked {len(unprocessed_ids)} posts without author entities as cleaned")
+        
+        # Return context with results
+        results_df = pd.DataFrame(results) if results else pd.DataFrame()
+        logger.info("AuthorNode transformation complete")
+        
+        return TransformContext(results_df)
--- a/transform/base.py
+++ b/transform/base.py
@ -0,0 +1,37 @@
+"""Base transform node for data pipeline."""
+from abc import ABC, abstractmethod
+import sqlite3
+import pandas as pd
+
+
+class TransformContext:
+    """Holder for the dataframe that is handed to a transform node."""
+
+    def __init__(self, df: pd.DataFrame) -> None:
+        # The dataframe is kept by reference; no copy is made.
+        self.df = df
+
+    def get_dataframe(self) -> pd.DataFrame:
+        """Return the dataframe held by this context."""
+        return self.df
+
+
+class TransformNode(ABC):
+    """Interface that every step of the transform pipeline implements.
+
+    A node represents one transformation step: it receives data, transforms
+    it, and may write results back to the database. Concrete nodes must
+    override ``run``.
+    """
+
+    @abstractmethod
+    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
+        """Run this transformation step.
+
+        Args:
+            con: Open SQLite database connection
+            context: TransformContext holding the input dataframe
+
+        Returns:
+            TransformContext holding the transformed dataframe
+        """
--- a/transform/main.py
+++ b/transform/main.py
@ -0,0 +1,89 @@
+#! python3
+import logging
+import os
+import sqlite3
+import sys
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Resolve LOGGING_LEVEL by name so that values such as WARNING or ERROR are
+# honoured. Previously every value other than the exact string 'INFO' was
+# treated as DEBUG. Unrecognised values still fall back to DEBUG, which keeps
+# the earlier behaviour for anything that is not a standard level name.
+_LOG_LEVELS = {
+    'CRITICAL': logging.CRITICAL,
+    'ERROR': logging.ERROR,
+    'WARNING': logging.WARNING,
+    'INFO': logging.INFO,
+    'DEBUG': logging.DEBUG,
+}
+logging_level = _LOG_LEVELS.get(
+    os.environ.get('LOGGING_LEVEL', 'INFO').strip().upper(),
+    logging.DEBUG,
+)
+
+# Log both to a file in the working directory and to stdout.
+logging.basicConfig(
+    level=logging_level,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger("knack-transform")
+
+
+def setup_database_connection():
+    """Open the SQLite database named by DB_PATH and return the connection.
+
+    Falls back to '/data/knack.sqlite' when DB_PATH is not set.
+    """
+    database_file = os.environ.get('DB_PATH', '/data/knack.sqlite')
+    logger.info(f"Connecting to database: {database_file}")
+    connection = sqlite3.connect(database_file)
+    return connection
+
+
+def table_exists(tablename: str, con: sqlite3.Connection):
+    """Return True when sqlite_master lists a table called *tablename*."""
+    cursor = con.execute(
+        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1",
+        (tablename,),
+    )
+    # The query yields at most one row; its presence is the answer.
+    return cursor.fetchone() is not None
+
+
+def main():
+    """Main entry point for the transform pipeline.
+
+    Loads up to MAX_CLEANED_POSTS posts (environment variable, default 500)
+    that have an author and are not yet flagged as cleaned, then runs
+    AuthorNode over them. Returns early when the posts table does not exist
+    or when no such posts are found. Any exception is logged and the
+    process exits with status 1; an opened database connection is closed in
+    every case.
+    """
+    logger.info("Starting transform pipeline")
+
+    # Bound before the try block so the finally clause can tell whether a
+    # connection was actually opened.
+    con = None
+    try:
+        con = setup_database_connection()
+        logger.info("Database connection established")
+
+        # Check if posts table exists
+        if not table_exists('posts', con):
+            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
+            logger.info("Transform pipeline skipped - no data available")
+            return
+
+        # Import transform nodes only once there is a posts table to work on
+        from author_node import AuthorNode
+        from base import TransformContext
+        import pandas as pd
+
+        # Load posts data
+        logger.info("Loading posts from database")
+        sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
+        # Environment values are strings while the default is an int.
+        # Convert explicitly so LIMIT is always bound as an integer and a
+        # malformed value fails here with a clear message.
+        max_cleaned_posts = int(os.environ.get("MAX_CLEANED_POSTS", 500))
+        df = pd.read_sql(sql, con, params=[max_cleaned_posts])
+        logger.info(f"Loaded {len(df)} uncleaned posts with authors")
+
+        if df.empty:
+            logger.info("No uncleaned posts found. Transform pipeline skipped.")
+            return
+
+        # Create context and run author classification
+        context = TransformContext(df)
+        author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu'))  # Change to "cuda" or "mps" if available
+        author_transform.run(con, context)
+
+        # TODO: Create Node to compute Text Embeddings and UMAP.
+        # TODO: Create Node to pre-compute data based on visuals to reduce load time.
+
+        logger.info("Transform pipeline completed successfully")
+
+    except Exception as e:
+        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
+        sys.exit(1)
+    finally:
+        if con is not None:
+            con.close()
+            logger.info("Database connection closed")
+
+
+if __name__ == "__main__":
+    main()
--- a/transform/requirements.txt
+++ b/transform/requirements.txt
@ -0,0 +1,4 @@
+pandas
+python-dotenv
+gliner
+torch