forked from lukaszett/Knack-Scraper
Adds a node to precalculate the JSON files used by the visualisations
This commit is contained in:
parent
7c2e34906e
commit
d9d0441ddd
3 changed files with 194 additions and 18 deletions
|
|
@ -215,6 +215,7 @@ def create_default_pipeline(device: str = "cpu",
|
||||||
from author_node import NerAuthorNode, FuzzyAuthorNode
|
from author_node import NerAuthorNode, FuzzyAuthorNode
|
||||||
from embeddings_node import TextEmbeddingNode, UmapNode
|
from embeddings_node import TextEmbeddingNode, UmapNode
|
||||||
from url_node import URLNode
|
from url_node import URLNode
|
||||||
|
from to_d3_node import ToD3Node
|
||||||
|
|
||||||
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
||||||
|
|
||||||
|
|
@ -261,6 +262,21 @@ def create_default_pipeline(device: str = "cpu",
|
||||||
name='UmapNode'
|
name='UmapNode'
|
||||||
))
|
))
|
||||||
|
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=ToD3Node,
|
||||||
|
dependencies=[
|
||||||
|
'UmapNode',
|
||||||
|
'TextEmbeddingNode',
|
||||||
|
'FuzzyAuthorNode',
|
||||||
|
'AuthorNode',
|
||||||
|
'URLNode'
|
||||||
|
],
|
||||||
|
node_kwargs={
|
||||||
|
'output_path': './data/json/'
|
||||||
|
},
|
||||||
|
name='ToD3Node'
|
||||||
|
))
|
||||||
|
|
||||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||||
|
|
||||||
# pipeline.add_node(NodeConfig(
|
# pipeline.add_node(NodeConfig(
|
||||||
|
|
|
||||||
102
transform/to_d3_node.py
Normal file
102
transform/to_d3_node.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
"""Node to query data from the database and generate individual json file
|
||||||
|
for visualisations in the d3.js framework"""
|
||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pipeline import TransformContext
|
||||||
|
from transform_node import TransformNode
|
||||||
|
|
||||||
|
logger = logging.getLogger("knack-transform")
|
||||||
|
|
||||||
|
class ToD3Node(TransformNode):
    """Node that takes the data in a sqlite3 database and generates visualisation data
    as json files in a specific folder.
    """

    def __init__(self, output_path: str):
        """Initialize the node.

        Args:
            output_path (str): Directory the generated ``<name>.json`` files are
                written to. Created by ``run()`` if it does not exist.
        """
        self.output_path = output_path
        # One SQL query per visualisation; the dict key doubles as the output file name.
        self.queries = {
            'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
            'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
        }
        super().__init__()
        logger.info(f"Init ToD3Node, Storing files to {self.output_path}")

    def _query_db(self, con: sqlite3.Connection, query: str):
        """Run *query* on *con* and return the rows as a list of
        column-name -> value dicts (one dict per row)."""
        cursor = con.cursor()
        try:
            cursor.execute(query)
            # cursor.description holds one 7-tuple per column; index 0 is the name.
            columns = [desc[0] for desc in cursor.description]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
        finally:
            # Explicitly release the cursor (the original leaked it).
            cursor.close()

    def _calculate_files(self, con: sqlite3.Connection):
        """Execute every configured query and dump the result to
        ``<output_path>/<key>.json``.

        Returns:
            int: Number of json files written.
        """
        for key, query in self.queries.items():
            result = self._query_db(con, query)
            # os.path.join works whether or not output_path carries a trailing
            # slash (the original string concatenation required one).
            with open(os.path.join(self.output_path, f'{key}.json'), 'w') as f:
                json.dump(result, f)
        return len(self.queries)

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the toD3 Node
        Writes to a bunch of files, each for each query.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): TransformContext, containing the input
                dataframe of all post.

        Returns:
            TransformContext with processed dataframe.
        """
        logger.info("Starting ToD3Node transformation")

        if not os.path.isdir(self.output_path):
            logger.warning("output_dir does not exist, creating dir...")
            # makedirs also creates missing parent dirs (os.mkdir would raise).
            os.makedirs(self.output_path, exist_ok=True)

        count = self._calculate_files(con)

        logger.info(f"Successfully generated {count} json files.")

        return context
||||||
|
|
||||||
|
def main():
    """Standalone entry point: run the ToD3Node against a local database.

    Usage: ``python to_d3_node.py [db_path] [output_path]`` — both arguments
    are optional and fall back to the original hard-coded developer paths.
    """
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Allow overriding the hard-coded developer paths from the command line;
    # defaults preserve the original behavior.
    db_path = sys.argv[1] if len(sys.argv) > 1 else \
        "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    output_path = sys.argv[2] if len(sys.argv) > 2 else \
        "/Users/linussilberstein/Documents/Knack-Scraper/data/json/"

    # Connect to database
    con = sqlite3.connect(db_path)

    try:
        context = TransformContext(None)

        node = ToD3Node(output_path)

        context = node.run(con, context)

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        # Always release the connection, even when the transformation fails.
        con.close()


if __name__ == '__main__':
    main()
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 12,
|
||||||
"id": "0ab5f064",
|
"id": "0ab5f064",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 13,
|
||||||
"id": "94b2e3d9",
|
"id": "94b2e3d9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -62,7 +62,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 14,
|
||||||
"id": "b3924728",
|
"id": "b3924728",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -76,7 +76,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 15,
|
||||||
"id": "c0fdb0ba",
|
"id": "c0fdb0ba",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -89,7 +89,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 16,
|
||||||
"id": "df5c31b3",
|
"id": "df5c31b3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -102,7 +102,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 17,
|
||||||
"id": "101b971d",
|
"id": "101b971d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -124,7 +124,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 18,
|
||||||
"id": "2f23046d",
|
"id": "2f23046d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -151,7 +151,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 19,
|
||||||
"id": "d4ae65f1",
|
"id": "d4ae65f1",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -171,7 +171,7 @@
|
||||||
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
|
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
|
||||||
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
|
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
|
||||||
" GROUP BY t1.tag, t2.tag\n",
|
" GROUP BY t1.tag, t2.tag\n",
|
||||||
" HAVING weight > 3\n",
|
" HAVING weight > 1\n",
|
||||||
" ORDER BY weight DESC;\n",
|
" ORDER BY weight DESC;\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -181,7 +181,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 20,
|
||||||
"id": "13062474",
|
"id": "13062474",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -192,24 +192,27 @@
|
||||||
"source": [
|
"source": [
|
||||||
"q = query_db(\"\"\"\n",
|
"q = query_db(\"\"\"\n",
|
||||||
"select\n",
|
"select\n",
|
||||||
"round(umap_x, 3) as umap_x,\n",
|
"cast(umap_x*10 as int) as x,\n",
|
||||||
"round(umap_y, 3) as umap_y,\n",
|
"cast(umap_y*10 as int) as y,\n",
|
||||||
"round(umap_z, 3) as umap_z,\n",
|
"cast(umap_z*10 as int) as z,\n",
|
||||||
"posts.id, title\n",
|
"posts.id as id, category_id as c,\n",
|
||||||
|
"SUBSTRING(title, 1, 12) as t\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from posts\n",
|
"from posts\n",
|
||||||
"inner join postcategories on post_id = posts.id\n",
|
"inner join postcategories on post_id = posts.id\n",
|
||||||
"inner join categories on category_id = categories.id\n",
|
"inner join categories on category_id = categories.id\n",
|
||||||
"where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
|
"\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
|
||||||
|
"\n",
|
||||||
"with open('json/umap_embeddings.json', 'w') as f:\n",
|
"with open('json/umap_embeddings.json', 'w') as f:\n",
|
||||||
" f.write(json.dumps(q))"
|
" f.write(json.dumps(q))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 21,
|
||||||
"id": "e5378b17",
|
"id": "e5378b17",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -239,7 +242,7 @@
|
||||||
"SELECT \n",
|
"SELECT \n",
|
||||||
" tld AS source, \n",
|
" tld AS source, \n",
|
||||||
" CASE \n",
|
" CASE \n",
|
||||||
" WHEN host_count < 15 THEN 'other'\n",
|
" WHEN host_count < 10 THEN 'other'\n",
|
||||||
" ELSE host \n",
|
" ELSE host \n",
|
||||||
" END AS target, \n",
|
" END AS target, \n",
|
||||||
" SUM(host_count) AS value\n",
|
" SUM(host_count) AS value\n",
|
||||||
|
|
@ -249,7 +252,7 @@
|
||||||
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
|
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
|
||||||
" GROUP BY tld, host\n",
|
" GROUP BY tld, host\n",
|
||||||
")\n",
|
")\n",
|
||||||
"WHERE source != \"\"\n",
|
"WHERE source != \"\" AND target != 'other'\n",
|
||||||
"GROUP BY tld, target\n",
|
"GROUP BY tld, target\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -259,6 +262,61 @@
|
||||||
"with open('json/urls_l2.json', 'w') as f:\n",
|
"with open('json/urls_l2.json', 'w') as f:\n",
|
||||||
" f.write(json.dumps(q2))"
|
" f.write(json.dumps(q2))"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "1501cb06",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "ruby"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"q = query_db(\"\"\"\n",
|
||||||
|
" SELECT \n",
|
||||||
|
" a.name AS author_name,\n",
|
||||||
|
" t.tag,\n",
|
||||||
|
" COUNT(*) AS tag_count\n",
|
||||||
|
"FROM authors a\n",
|
||||||
|
"JOIN post_authors pa ON a.id = pa.author_id\n",
|
||||||
|
"JOIN posttags pt ON pa.post_id = pt.post_id\n",
|
||||||
|
"JOIN tags t ON pt.tag_id = t.id\n",
|
||||||
|
"WHERE a.name = 'Antifa'\n",
|
||||||
|
"GROUP BY a.id, a.name, t.id, t.tag\n",
|
||||||
|
"ORDER BY tag_count DESC;\n",
|
||||||
|
"\"\"\")\n",
|
||||||
|
"\n",
|
||||||
|
"q"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue