Adds node to precompute JSON files for visualisations

This commit is contained in:
quorploop 2026-01-29 22:08:01 +01:00
parent 7c2e34906e
commit d9d0441ddd
3 changed files with 194 additions and 18 deletions


@@ -215,6 +215,7 @@ def create_default_pipeline(device: str = "cpu",
 from author_node import NerAuthorNode, FuzzyAuthorNode
 from embeddings_node import TextEmbeddingNode, UmapNode
 from url_node import URLNode
+from to_d3_node import ToD3Node
 pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
@@ -261,6 +262,21 @@ def create_default_pipeline(device: str = "cpu",
         name='UmapNode'
     ))
+    pipeline.add_node(NodeConfig(
+        node_class=ToD3Node,
+        dependencies=[
+            'UmapNode',
+            'TextEmbeddingNode',
+            'FuzzyAuthorNode',
+            'AuthorNode',
+            'URLNode'
+        ],
+        node_kwargs={
+            'output_path': './data/json/'
+        },
+        name='ToD3Node'
+    ))
+
     # TODO: Create Node to compute Text Embeddings and UMAP.
     # pipeline.add_node(NodeConfig(
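
Presumably the dependencies list is what makes ParallelPipeline schedule ToD3Node after the nodes that fill the tables its queries read; the scheduler itself is not in this diff, so the sketch below only illustrates that general contract with the standard library's topological sorter. Node names come from the diff, everything else is illustrative.

# Illustration only, not the ParallelPipeline implementation: a topological
# order over the declared dependencies always puts ToD3Node last.
from graphlib import TopologicalSorter

deps = {'ToD3Node': {'UmapNode', 'TextEmbeddingNode', 'FuzzyAuthorNode', 'AuthorNode', 'URLNode'}}
print(list(TopologicalSorter(deps).static_order()))
# prints the five producer nodes (in some order), then 'ToD3Node'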

transform/to_d3_node.py (new file, 102 additions)

@@ -0,0 +1,102 @@
"""Node to query data from the database and generate individual json file
for visualisations in the d3.js framework"""
import sqlite3
import logging
import json
import os
from pipeline import TransformContext
from transform_node import TransformNode
logger = logging.getLogger("knack-transform")


class ToD3Node(TransformNode):
    """Node that takes the data in a sqlite3 database and generates
    visualisation data as JSON files in a given folder.
    """

    def __init__(self, output_path: str):
        self.output_path = output_path
        # One SQL query per visualisation; the dict key doubles as the output file name.
        self.queries = {
            'authors': "select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;",
            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
            'tags': "select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;",
            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld) GROUP BY target;",
            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target;"
        }
        super().__init__()
        logger.info(f"Init ToD3Node, storing files to {self.output_path}")

    def _query_db(self, con: sqlite3.Connection, query: str):
        """Runs a query and returns the rows as a list of dicts keyed by column name."""
        cursor = con.cursor()
        cursor.execute(query)
        return [
            {cursor.description[i][0]: value for i, value in enumerate(row)}
            for row in cursor.fetchall()
        ]
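
The same column-name mapping could also come from the standard library's row factory; a possible simplification, not what the node does, and the database path here is an assumption:

# Design alternative, not used by the node: let sqlite3 build the dicts.
import sqlite3

con = sqlite3.connect('./data/knack.sqlite')  # assumed local path
con.row_factory = sqlite3.Row
rows = [dict(row) for row in con.execute('select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag limit 5')]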

    def _calculate_files(self, con: sqlite3.Connection):
        """Writes one JSON file per query and returns the number of files written."""
        for key, query in self.queries.items():
            result = self._query_db(con, query)
            with open(os.path.join(self.output_path, f'{key}.json'), 'w') as f:
                f.write(json.dumps(result))
        return len(self.queries)

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the ToD3 node.

        Writes one JSON file per configured query.

        Args:
            con (sqlite3.Connection): SQLite database connection.
            context (TransformContext): TransformContext containing the input
                dataframe of all posts.

        Returns:
            TransformContext, passed through unchanged.
        """
        logger.info("Starting ToD3Node transformation")
        if not os.path.isdir(self.output_path):
            logger.warning(f"Output path {self.output_path} does not exist, creating it...")
            os.makedirs(self.output_path, exist_ok=True)
        count = self._calculate_files(con)
        logger.info(f"Successfully generated {count} JSON files.")
        return context


def main():
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler(sys.stdout)]
    )

    # Connect to the database
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)
    try:
        context = TransformContext(None)
        node = ToD3Node('/Users/linussilberstein/Documents/Knack-Scraper/data/json/')
        context = node.run(con, context)
    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()
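
For a quick check without the scraper's real database, the node can be pointed at an in-memory SQLite instance. A minimal smoke-test sketch, assuming transform/to_d3_node.py is importable; the schema, rows, and the single simplified query are invented for the test:

# Smoke-test sketch, not part of the commit.
import json
import os
import sqlite3
import tempfile

from to_d3_node import ToD3Node

con = sqlite3.connect(':memory:')
con.execute('create table tags (id integer primary key, tag text)')
con.executemany('insert into tags (tag) values (?)', [('demo',), ('demo',)])

node = ToD3Node(tempfile.mkdtemp())
node.queries = {'tags': 'select tag, count(id) as count from tags group by tag'}
node.run(con, None)

with open(os.path.join(node.output_path, 'tags.json')) as f:
    assert json.load(f) == [{'tag': 'demo', 'count': 2}]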


@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 12,
 "id": "0ab5f064",
 "metadata": {},
 "outputs": [
@@ -24,7 +24,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 13,
 "id": "94b2e3d9",
 "metadata": {},
 "outputs": [
@@ -62,7 +62,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 14,
 "id": "b3924728",
 "metadata": {},
 "outputs": [],
@@ -76,7 +76,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 15,
 "id": "c0fdb0ba",
 "metadata": {},
 "outputs": [],
@@ -89,7 +89,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 16,
 "id": "df5c31b3",
 "metadata": {},
 "outputs": [],
@@ -102,7 +102,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 17,
 "id": "101b971d",
 "metadata": {},
 "outputs": [],
@@ -124,7 +124,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 18,
 "id": "2f23046d",
 "metadata": {},
 "outputs": [],
@@ -151,7 +151,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 19,
 "id": "d4ae65f1",
 "metadata": {
 "vscode": {
@ -171,7 +171,7 @@
" JOIN tags t1 ON t1.id = pt1.tag_id\n", " JOIN tags t1 ON t1.id = pt1.tag_id\n",
" JOIN tags t2 ON t2.id = pt2.tag_id\n", " JOIN tags t2 ON t2.id = pt2.tag_id\n",
" GROUP BY t1.tag, t2.tag\n", " GROUP BY t1.tag, t2.tag\n",
" HAVING weight > 3\n", " HAVING weight > 1\n",
" ORDER BY weight DESC;\n", " ORDER BY weight DESC;\n",
"\"\"\")\n", "\"\"\")\n",
"\n", "\n",
@@ -181,7 +181,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 20,
 "id": "13062474",
 "metadata": {
 "vscode": {
@@ -192,24 +192,27 @@
 "source": [
 "q = query_db(\"\"\"\n",
 "select\n",
-"round(umap_x, 3) as umap_x,\n",
-"round(umap_y, 3) as umap_y,\n",
-"round(umap_z, 3) as umap_z,\n",
-"posts.id, title\n",
+"cast(umap_x*10 as int) as x,\n",
+"cast(umap_y*10 as int) as y,\n",
+"cast(umap_z*10 as int) as z,\n",
+"posts.id as id, category_id as c,\n",
+"SUBSTRING(title, 1, 12) as t\n",
 "\n",
 "from posts\n",
 "inner join postcategories on post_id = posts.id\n",
 "inner join categories on category_id = categories.id\n",
-"where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
+"\n",
 "\"\"\")\n",
 "\n",
+"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
+"\n",
 "with open('json/umap_embeddings.json', 'w') as f:\n",
 " f.write(json.dumps(q))"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 21,
 "id": "e5378b17",
 "metadata": {
 "vscode": {
@ -239,7 +242,7 @@
"SELECT \n", "SELECT \n",
" tld AS source, \n", " tld AS source, \n",
" CASE \n", " CASE \n",
" WHEN host_count < 15 THEN 'other'\n", " WHEN host_count < 10 THEN 'other'\n",
" ELSE host \n", " ELSE host \n",
" END AS target, \n", " END AS target, \n",
" SUM(host_count) AS value\n", " SUM(host_count) AS value\n",
@@ -249,7 +252,7 @@
 " WHERE tld IS NOT NULL AND host IS NOT NULL \n",
 " GROUP BY tld, host\n",
 ")\n",
-"WHERE source != \"\"\n",
+"WHERE source != \"\" AND target != 'other'\n",
 "GROUP BY tld, target\n",
 "\"\"\")\n",
 "\n",
@@ -259,6 +262,61 @@
 "with open('json/urls_l2.json', 'w') as f:\n",
 " f.write(json.dumps(q2))"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": 22,
+"id": "1501cb06",
+"metadata": {
+"vscode": {
+"languageId": "ruby"
+}
+},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
+" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
+" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
+" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
+" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
+" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
+" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
+" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
+" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
+" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
+" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
+]
+},
+"execution_count": 22,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"q = query_db(\"\"\"\n",
+" SELECT \n",
+" a.name AS author_name,\n",
+" t.tag,\n",
+" COUNT(*) AS tag_count\n",
+"FROM authors a\n",
+"JOIN post_authors pa ON a.id = pa.author_id\n",
+"JOIN posttags pt ON pa.post_id = pt.post_id\n",
+"JOIN tags t ON pt.tag_id = t.id\n",
+"WHERE a.name = 'Antifa'\n",
+"GROUP BY a.id, a.name, t.id, t.tag\n",
+"ORDER BY tag_count DESC;\n",
+"\"\"\")\n",
+"\n",
+"q"
+]
 },
 ],
 "metadata": {