From d9d0441ddd912515e55098e3dbadb56a2b41ad0b Mon Sep 17 00:00:00 2001
From: quorploop <>
Date: Thu, 29 Jan 2026 22:08:01 +0100
Subject: [PATCH] Add node to precalculate JSON files for visualisations

---
 transform/pipeline.py      |  16 ++++++
 transform/to_d3_node.py    | 111 +++++++++++++++++++++++++++++++++++++
 visualisation/tojson.ipynb |  94 +++++++++++++++++++++++++++-------
 3 files changed, 203 insertions(+), 18 deletions(-)
 create mode 100644 transform/to_d3_node.py

diff --git a/transform/pipeline.py b/transform/pipeline.py
index 9edcaf3..8572344 100644
--- a/transform/pipeline.py
+++ b/transform/pipeline.py
@@ -215,6 +215,7 @@ def create_default_pipeline(device: str = "cpu",
     from author_node import NerAuthorNode, FuzzyAuthorNode
     from embeddings_node import TextEmbeddingNode, UmapNode
     from url_node import URLNode
+    from to_d3_node import ToD3Node
 
     pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
 
@@ -261,6 +262,21 @@ def create_default_pipeline(device: str = "cpu",
         name='UmapNode'
     ))
 
+    pipeline.add_node(NodeConfig(
+        node_class=ToD3Node,
+        dependencies=[
+            'UmapNode',
+            'TextEmbeddingNode',
+            'FuzzyAuthorNode',
+            'AuthorNode',
+            'URLNode'
+        ],
+        node_kwargs={
+            'output_path': './data/json/'
+        },
+        name='ToD3Node'
+    ))
+
     # TODO: Create Node to compute Text Embeddings and UMAP.
 
     # pipeline.add_node(NodeConfig(
diff --git a/transform/to_d3_node.py b/transform/to_d3_node.py
new file mode 100644
index 0000000..7f58b1c
--- /dev/null
+++ b/transform/to_d3_node.py
@@ -0,0 +1,111 @@
+"""Node that queries data from the database and generates individual JSON files
+for visualisations in the d3.js framework."""
+import sqlite3
+import logging
+import json
+import os
+
+from pipeline import TransformContext
+from transform_node import TransformNode
+
+logger = logging.getLogger("knack-transform")
+
+class ToD3Node(TransformNode):
+    """Node that takes the data in a sqlite3 database and generates visualisation
+    data as JSON files in a given folder.
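+
+    Illustrative usage (assuming an open sqlite3 connection ``con`` and a
+    ``TransformContext`` ``context``, as in ``main()`` below):
+
+        node = ToD3Node(output_path='./data/json/')
+        context = node.run(con, context)  # writes authors.json, tags.json, ...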
+    """
+
+    def __init__(self, output_path: str):
+        self.output_path = output_path
+        self.queries = {
+            'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
+            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
+            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
+            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
+            'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
+            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
+            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
+        }
+        super().__init__()
+        logger.info(f"Init ToD3Node, storing files to {self.output_path}")
+
+    def _query_db(self, con: sqlite3.Connection, query: str):
+        cursor = con.cursor()
+        cursor.execute(query)
+        r = [dict((cursor.description[i][0], value)
+             for i, value in enumerate(row)) for row in cursor.fetchall()]
+        return r
+
+    def _calculate_files(self, con: sqlite3.Connection):
+        for key, query in self.queries.items():
+            q = self._query_db(con, query)
+            with open(f'{self.output_path}{key}.json', 'w') as f:
+                f.write(json.dumps(q))
+
+        return len(self.queries)
+
+
+    def run(self, con: sqlite3.Connection, context: TransformContext):
+        """Executes the ToD3 node.
+        Writes one JSON file per query into the output folder.
+
+        Args:
+            con (sqlite3.Connection): SQLite database connection
+            context (TransformContext): TransformContext, containing the input
+                dataframe of all posts.
+
+        Returns:
+            The TransformContext, passed through unchanged.
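+
+        Each file holds the query result serialised as a JSON array of row
+        objects, e.g. authors.json (illustrative): [{"name": ..., "type": ..., "count": ...}]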
+        """
+        logger.info("Starting ToD3Node transformation")
+
+        if not os.path.isdir(self.output_path):
+            logger.warning(f"Output path {self.output_path} does not exist, creating it...")
+            os.makedirs(self.output_path, exist_ok=True)
+
+        count = self._calculate_files(con)
+
+        logger.info(f"Successfully generated {count} JSON files.")
+
+        return context
+
+def main():
+    import sys
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+    logger = logging.getLogger("knack-transform")
+
+    # Connect to database
+    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
+    con = sqlite3.connect(db_path)
+
+    try:
+        context = TransformContext(None)
+
+        node = ToD3Node('/Users/linussilberstein/Documents/Knack-Scraper/data/json/')
+
+        context = node.run(con, context)
+
+    except Exception as e:
+        logger.error(f"Error during transformation: {e}", exc_info=True)
+        raise
+    finally:
+        con.close()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/visualisation/tojson.ipynb b/visualisation/tojson.ipynb
index d38755f..790fbab 100644
--- a/visualisation/tojson.ipynb
+++ b/visualisation/tojson.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 12,
    "id": "0ab5f064",
    "metadata": {},
    "outputs": [
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "id": "94b2e3d9",
    "metadata": {},
    "outputs": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
    "id": "b3924728",
    "metadata": {},
    "outputs": [],
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 15,
    "id": "c0fdb0ba",
    "metadata": {},
    "outputs": [],
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 16,
    "id": "df5c31b3",
    "metadata": {},
    "outputs": [],
@@ -102,7 +102,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
    "id": "101b971d",
    "metadata": {},
    "outputs": [],
@@ -124,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 18,
    "id": "2f23046d",
    "metadata": {},
    "outputs": [],
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 19,
    "id": "d4ae65f1",
    "metadata": {
     "vscode": {
@@ -171,7 +171,7 @@
     "    JOIN tags t1 ON t1.id = pt1.tag_id\n",
     "    JOIN tags t2 ON t2.id = pt2.tag_id\n",
     "    GROUP BY t1.tag, t2.tag\n",
-    "    HAVING weight > 3\n",
+    "    HAVING weight > 1\n",
     "    ORDER BY weight DESC;\n",
     "\"\"\")\n",
     "\n",
@@ -181,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 20,
    "id": "13062474",
    "metadata": {
     "vscode": {
@@ -192,24 +192,27 @@
    "source": [
     "q = query_db(\"\"\"\n",
     "select\n",
-    "round(umap_x, 3) as umap_x,\n",
-    "round(umap_y, 3) as umap_y,\n",
-    "round(umap_z, 3) as umap_z,\n",
-    "posts.id, title\n",
+    "cast(umap_x*10 as int) as x,\n",
+    "cast(umap_y*10 as int) as y,\n",
+    "cast(umap_z*10 as int) as z,\n",
+    "posts.id as id, category_id as c,\n",
+    "SUBSTRING(title, 1, 12) as t\n",
     "\n",
     "from posts\n",
     "inner join postcategories on post_id = posts.id\n",
     "inner join categories on category_id = categories.id\n",
-    "where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
+    "\n",
     "\"\"\")\n",
     "\n",
+    "#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
+    "\n",
     "with open('json/umap_embeddings.json', 'w') as f:\n",
     "    f.write(json.dumps(q))"
    ]
   },
   {
"cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "e5378b17", "metadata": { "vscode": { @@ -239,7 +242,7 @@ "SELECT \n", " tld AS source, \n", " CASE \n", - " WHEN host_count < 15 THEN 'other'\n", + " WHEN host_count < 10 THEN 'other'\n", " ELSE host \n", " END AS target, \n", " SUM(host_count) AS value\n", @@ -249,7 +252,7 @@ " WHERE tld IS NOT NULL AND host IS NOT NULL \n", " GROUP BY tld, host\n", ")\n", - "WHERE source != \"\"\n", + "WHERE source != \"\" AND target != 'other'\n", "GROUP BY tld, target\n", "\"\"\")\n", "\n", @@ -259,6 +262,61 @@ "with open('json/urls_l2.json', 'w') as f:\n", " f.write(json.dumps(q2))" ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1501cb06", + "metadata": { + "vscode": { + "languageId": "ruby" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n", + " {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n", + " {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n", + " {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n", + " {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n", + " {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n", + " {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = query_db(\"\"\"\n", + " SELECT \n", + " a.name AS author_name,\n", + " t.tag,\n", + " COUNT(*) AS tag_count\n", + "FROM authors a\n", + "JOIN post_authors pa ON a.id = pa.author_id\n", + "JOIN posttags pt ON pa.post_id = pt.post_id\n", + "JOIN tags t ON pt.tag_id = t.id\n", + "WHERE a.name = 'Antifa'\n", + "GROUP BY a.id, a.name, t.id, t.tag\n", + "ORDER BY tag_count DESC;\n", + "\"\"\")\n", + "\n", + "q" + ] } ], "metadata": {