From d9d0441ddd912515e55098e3dbadb56a2b41ad0b Mon Sep 17 00:00:00 2001
From: quorploop <>
Date: Thu, 29 Jan 2026 22:08:01 +0100
Subject: [PATCH] Add node to precalculate JSON files for visualisations

---
 transform/pipeline.py      |  16 ++++++
 transform/to_d3_node.py    | 111 +++++++++++++++++++++++++++++++++++++
 visualisation/tojson.ipynb |  94 +++++++++++++++++++++++++++-------
 3 files changed, 203 insertions(+), 18 deletions(-)
 create mode 100644 transform/to_d3_node.py

diff --git a/transform/pipeline.py b/transform/pipeline.py
index 9edcaf3..8572344 100644
--- a/transform/pipeline.py
+++ b/transform/pipeline.py
@@ -215,6 +215,7 @@ def create_default_pipeline(device: str = "cpu",
     from author_node import NerAuthorNode, FuzzyAuthorNode
     from embeddings_node import TextEmbeddingNode, UmapNode
     from url_node import URLNode
+    from to_d3_node import ToD3Node
 
     pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
 
@@ -261,6 +262,21 @@ def create_default_pipeline(device: str = "cpu",
         name='UmapNode'
     ))
 
+    pipeline.add_node(NodeConfig(
+        node_class=ToD3Node,
+        dependencies=[
+            'UmapNode',
+            'TextEmbeddingNode',
+            'FuzzyAuthorNode',
+            'AuthorNode',
+            'URLNode'
+        ],
+        node_kwargs={
+            'output_path': './data/json/'
+        },
+        name='ToD3Node'
+    ))
+
     # TODO: Create Node to compute Text Embeddings and UMAP.
 
     # pipeline.add_node(NodeConfig(
diff --git a/transform/to_d3_node.py b/transform/to_d3_node.py
new file mode 100644
index 0000000..7f58b1c
--- /dev/null
+++ b/transform/to_d3_node.py
@@ -0,0 +1,111 @@
+"""Node that queries data from the database and generates individual JSON files
+for visualisations in the d3.js framework."""
+import sqlite3
+import logging
+import json
+import os
+
+from pipeline import TransformContext
+from transform_node import TransformNode
+
+logger = logging.getLogger("knack-transform")
+
+class ToD3Node(TransformNode):
+    """Node that takes the data in a sqlite3 database and generates visualisation
+    data as JSON files in a given folder.
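+
+    Illustrative usage (assuming an open sqlite3 connection ``con`` and a
+    ``TransformContext`` ``context``, as in ``main()`` below):
+
+        node = ToD3Node(output_path='./data/json/')
+        context = node.run(con, context)  # writes authors.json, tags.json, ...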
+    """
+
+    def __init__(self, output_path: str):
+        self.output_path = output_path
+        self.queries = {
+            'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
+            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
+            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
+            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
+            'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
+            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
+            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
+        }
+        super().__init__()
+        logger.info(f"Init ToD3Node, storing files to {self.output_path}")
+
+    def _query_db(self, con: sqlite3.Connection, query: str):
+        cursor = con.cursor()
+        cursor.execute(query)
+        r = [dict((cursor.description[i][0], value)
+             for i, value in enumerate(row)) for row in cursor.fetchall()]
+        return r
+
+    def _calculate_files(self, con: sqlite3.Connection):
+        for key, query in self.queries.items():
+            q = self._query_db(con, query)
+            with open(f'{self.output_path}{key}.json', 'w') as f:
+                f.write(json.dumps(q))
+
+        return len(self.queries)
+
+
+    def run(self, con: sqlite3.Connection, context: TransformContext):
+        """Executes the ToD3 node.
+        Writes one JSON file per query into the output folder.
+
+        Args:
+            con (sqlite3.Connection): SQLite database connection
+            context (TransformContext): TransformContext, containing the input
+                dataframe of all posts.
+
+        Returns:
+            The TransformContext, passed through unchanged.
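+
+        Each file holds the query result serialised as a JSON array of row
+        objects, e.g. authors.json (illustrative): [{"name": ..., "type": ..., "count": ...}]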
+        """
+        logger.info("Starting ToD3Node transformation")
+
+        if not os.path.isdir(self.output_path):
+            logger.warning(f"Output path {self.output_path} does not exist, creating it...")
+            os.makedirs(self.output_path, exist_ok=True)
+
+        count = self._calculate_files(con)
+
+        logger.info(f"Successfully generated {count} JSON files.")
+
+        return context
+
+def main():
+    import sys
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+    logger = logging.getLogger("knack-transform")
+
+    # Connect to database
+    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
+    con = sqlite3.connect(db_path)
+
+    try:
+        context = TransformContext(None)
+
+        node = ToD3Node('/Users/linussilberstein/Documents/Knack-Scraper/data/json/')
+
+        context = node.run(con, context)
+
+    except Exception as e:
+        logger.error(f"Error during transformation: {e}", exc_info=True)
+        raise
+    finally:
+        con.close()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/visualisation/tojson.ipynb b/visualisation/tojson.ipynb
index d38755f..790fbab 100644
--- a/visualisation/tojson.ipynb
+++ b/visualisation/tojson.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 12,
    "id": "0ab5f064",
    "metadata": {},
    "outputs": [
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "id": "94b2e3d9",
    "metadata": {},
    "outputs": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
    "id": "b3924728",
    "metadata": {},
    "outputs": [],
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 15,
    "id": "c0fdb0ba",
    "metadata": {},
    "outputs": [],
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 16,
    "id": "df5c31b3",
    "metadata": {},
    "outputs": [],
@@ -102,7 +102,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
    "id": "101b971d",
    "metadata": {},
    "outputs": [],
@@ -124,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 18,
    "id": "2f23046d",
    "metadata": {},
    "outputs": [],
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 19,
    "id": "d4ae65f1",
    "metadata": {
     "vscode": {
@@ -171,7 +171,7 @@
     "    JOIN tags t1 ON t1.id = pt1.tag_id\n",
     "    JOIN tags t2 ON t2.id = pt2.tag_id\n",
     "    GROUP BY t1.tag, t2.tag\n",
-    "    HAVING weight > 3\n",
+    "    HAVING weight > 1\n",
     "    ORDER BY weight DESC;\n",
     "\"\"\")\n",
     "\n",
@@ -181,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 20,
    "id": "13062474",
    "metadata": {
     "vscode": {
@@ -192,24 +192,27 @@
    "source": [
     "q = query_db(\"\"\"\n",
     "select\n",
-    "round(umap_x, 3) as umap_x,\n",
-    "round(umap_y, 3) as umap_y,\n",
-    "round(umap_z, 3) as umap_z,\n",
-    "posts.id, title\n",
+    "cast(umap_x*10 as int) as x,\n",
+    "cast(umap_y*10 as int) as y,\n",
+    "cast(umap_z*10 as int) as z,\n",
+    "posts.id as id, category_id as c,\n",
+    "SUBSTRING(title, 1, 12) as t\n",
     "\n",
     "from posts\n",
     "inner join postcategories on post_id = posts.id\n",
     "inner join categories on category_id = categories.id\n",
-    "where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
+    "\n",
     "\"\"\")\n",
     "\n",
+    "#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
+    "\n",
     "with open('json/umap_embeddings.json', 'w') as f:\n",
     "    f.write(json.dumps(q))"
    ]
   },
   {
"cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "e5378b17", "metadata": { "vscode": { @@ -239,7 +242,7 @@ "SELECT \n", " tld AS source, \n", " CASE \n", - " WHEN host_count < 15 THEN 'other'\n", + " WHEN host_count < 10 THEN 'other'\n", " ELSE host \n", " END AS target, \n", " SUM(host_count) AS value\n", @@ -249,7 +252,7 @@ " WHERE tld IS NOT NULL AND host IS NOT NULL \n", " GROUP BY tld, host\n", ")\n", - "WHERE source != \"\"\n", + "WHERE source != \"\" AND target != 'other'\n", "GROUP BY tld, target\n", "\"\"\")\n", "\n", @@ -259,6 +262,61 @@ "with open('json/urls_l2.json', 'w') as f:\n", " f.write(json.dumps(q2))" ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1501cb06", + "metadata": { + "vscode": { + "languageId": "ruby" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n", + " {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n", + " {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n", + " {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = query_db(\"\"\"\n", + " SELECT \n", + " a.name AS author_name,\n", + " t.tag,\n", + " COUNT(*) AS tag_count\n", + "FROM authors a\n", + "JOIN post_authors pa ON a.id = pa.author_id\n", + "JOIN posttags pt ON pa.post_id = pt.post_id\n", + "JOIN tags t ON pt.tag_id = t.id\n", + "WHERE a.name = 'Antifa'\n", + "GROUP BY a.id, a.name, t.id, t.tag\n", + "ORDER BY tag_count DESC;\n", + "\"\"\")\n", + "\n", + "q" + ] } ], "metadata": {