forked from lukaszett/Knack-Scraper
Adds a node to precalculate the JSON files used by the visualisations
This commit is contained in:
parent
7c2e34906e
commit
d9d0441ddd
3 changed files with 194 additions and 18 deletions
|
|
@ -215,6 +215,7 @@ def create_default_pipeline(device: str = "cpu",
|
||||||
from author_node import NerAuthorNode, FuzzyAuthorNode
|
from author_node import NerAuthorNode, FuzzyAuthorNode
|
||||||
from embeddings_node import TextEmbeddingNode, UmapNode
|
from embeddings_node import TextEmbeddingNode, UmapNode
|
||||||
from url_node import URLNode
|
from url_node import URLNode
|
||||||
|
from to_d3_node import ToD3Node
|
||||||
|
|
||||||
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
pipeline = ParallelPipeline(max_workers=max_workers, use_processes=False)
|
||||||
|
|
||||||
|
|
@ -261,6 +262,21 @@ def create_default_pipeline(device: str = "cpu",
|
||||||
name='UmapNode'
|
name='UmapNode'
|
||||||
))
|
))
|
||||||
|
|
||||||
|
pipeline.add_node(NodeConfig(
|
||||||
|
node_class=ToD3Node,
|
||||||
|
dependencies=[
|
||||||
|
'UmapNode',
|
||||||
|
'TextEmbeddingNode',
|
||||||
|
'FuzzyAuthorNode',
|
||||||
|
'AuthorNode',
|
||||||
|
'URLNode'
|
||||||
|
],
|
||||||
|
node_kwargs={
|
||||||
|
'output_path': './data/json/'
|
||||||
|
},
|
||||||
|
name='ToD3Node'
|
||||||
|
))
|
||||||
|
|
||||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||||
|
|
||||||
# pipeline.add_node(NodeConfig(
|
# pipeline.add_node(NodeConfig(
|
||||||
|
|
|
||||||
102
transform/to_d3_node.py
Normal file
102
transform/to_d3_node.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
"""Node to query data from the database and generate individual json file
|
||||||
|
for visualisations in the d3.js framework"""
|
||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pipeline import TransformContext
|
||||||
|
from transform_node import TransformNode
|
||||||
|
|
||||||
|
logger = logging.getLogger("knack-transform")
|
||||||
|
|
||||||
|
class ToD3Node(TransformNode):
    """Node that takes the data in a sqlite3 database and generates visualisation data
    as json files in a specific folder.
    """

    def __init__(self, output_path: str):
        """Initialize the node.

        Args:
            output_path (str): Directory the generated ``<name>.json`` files are
                written to. Created by ``run()`` if it does not exist.
        """
        self.output_path = output_path
        # One SQL query per visualisation; the dict key doubles as the output file name.
        self.queries = {
            'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
            'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
        }
        super().__init__()
        logger.info(f"Init ToD3Node, Storing files to {self.output_path}")

    def _query_db(self, con: sqlite3.Connection, query: str):
        """Run *query* on *con* and return the rows as a list of
        column-name -> value dicts (one dict per row)."""
        cursor = con.cursor()
        try:
            cursor.execute(query)
            # cursor.description holds one 7-tuple per column; index 0 is the name.
            columns = [desc[0] for desc in cursor.description]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
        finally:
            # Explicitly release the cursor (the original leaked it).
            cursor.close()

    def _calculate_files(self, con: sqlite3.Connection):
        """Execute every configured query and dump the result to
        ``<output_path>/<key>.json``.

        Returns:
            int: Number of json files written.
        """
        for key, query in self.queries.items():
            result = self._query_db(con, query)
            # os.path.join works whether or not output_path carries a trailing
            # slash (the original string concatenation required one).
            with open(os.path.join(self.output_path, f'{key}.json'), 'w') as f:
                json.dump(result, f)
        return len(self.queries)

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the toD3 Node
        Writes to a bunch of files, each for each query.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): TransformContext, containing the input
                dataframe of all post.

        Returns:
            TransformContext with processed dataframe.
        """
        logger.info("Starting ToD3Node transformation")

        if not os.path.isdir(self.output_path):
            logger.warning("output_dir does not exist, creating dir...")
            # makedirs also creates missing parent dirs (os.mkdir would raise).
            os.makedirs(self.output_path, exist_ok=True)

        count = self._calculate_files(con)

        logger.info(f"Successfully generated {count} json files.")

        return context
||||||
|
|
||||||
|
def main():
    """Standalone entry point: run the ToD3Node against a local database.

    Usage: ``python to_d3_node.py [db_path] [output_path]`` — both arguments
    are optional and fall back to the original hard-coded developer paths.
    """
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Allow overriding the hard-coded developer paths from the command line;
    # defaults preserve the original behavior.
    db_path = sys.argv[1] if len(sys.argv) > 1 else \
        "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    output_path = sys.argv[2] if len(sys.argv) > 2 else \
        "/Users/linussilberstein/Documents/Knack-Scraper/data/json/"

    # Connect to database
    con = sqlite3.connect(db_path)

    try:
        context = TransformContext(None)

        node = ToD3Node(output_path)

        context = node.run(con, context)

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        # Always release the connection, even when the transformation fails.
        con.close()


if __name__ == '__main__':
    main()
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 12,
|
||||||
"id": "0ab5f064",
|
"id": "0ab5f064",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 13,
|
||||||
"id": "94b2e3d9",
|
"id": "94b2e3d9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -62,7 +62,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 14,
|
||||||
"id": "b3924728",
|
"id": "b3924728",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -76,7 +76,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 15,
|
||||||
"id": "c0fdb0ba",
|
"id": "c0fdb0ba",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -89,7 +89,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 16,
|
||||||
"id": "df5c31b3",
|
"id": "df5c31b3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -102,7 +102,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 17,
|
||||||
"id": "101b971d",
|
"id": "101b971d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -124,7 +124,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 18,
|
||||||
"id": "2f23046d",
|
"id": "2f23046d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -151,7 +151,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 19,
|
||||||
"id": "d4ae65f1",
|
"id": "d4ae65f1",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -171,7 +171,7 @@
|
||||||
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
|
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
|
||||||
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
|
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
|
||||||
" GROUP BY t1.tag, t2.tag\n",
|
" GROUP BY t1.tag, t2.tag\n",
|
||||||
" HAVING weight > 3\n",
|
" HAVING weight > 1\n",
|
||||||
" ORDER BY weight DESC;\n",
|
" ORDER BY weight DESC;\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -181,7 +181,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 20,
|
||||||
"id": "13062474",
|
"id": "13062474",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -192,24 +192,27 @@
|
||||||
"source": [
|
"source": [
|
||||||
"q = query_db(\"\"\"\n",
|
"q = query_db(\"\"\"\n",
|
||||||
"select\n",
|
"select\n",
|
||||||
"round(umap_x, 3) as umap_x,\n",
|
"cast(umap_x*10 as int) as x,\n",
|
||||||
"round(umap_y, 3) as umap_y,\n",
|
"cast(umap_y*10 as int) as y,\n",
|
||||||
"round(umap_z, 3) as umap_z,\n",
|
"cast(umap_z*10 as int) as z,\n",
|
||||||
"posts.id, title\n",
|
"posts.id as id, category_id as c,\n",
|
||||||
|
"SUBSTRING(title, 1, 12) as t\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from posts\n",
|
"from posts\n",
|
||||||
"inner join postcategories on post_id = posts.id\n",
|
"inner join postcategories on post_id = posts.id\n",
|
||||||
"inner join categories on category_id = categories.id\n",
|
"inner join categories on category_id = categories.id\n",
|
||||||
"where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
|
"\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
|
||||||
|
"\n",
|
||||||
"with open('json/umap_embeddings.json', 'w') as f:\n",
|
"with open('json/umap_embeddings.json', 'w') as f:\n",
|
||||||
" f.write(json.dumps(q))"
|
" f.write(json.dumps(q))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 21,
|
||||||
"id": "e5378b17",
|
"id": "e5378b17",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"vscode": {
|
"vscode": {
|
||||||
|
|
@ -239,7 +242,7 @@
|
||||||
"SELECT \n",
|
"SELECT \n",
|
||||||
" tld AS source, \n",
|
" tld AS source, \n",
|
||||||
" CASE \n",
|
" CASE \n",
|
||||||
" WHEN host_count < 15 THEN 'other'\n",
|
" WHEN host_count < 10 THEN 'other'\n",
|
||||||
" ELSE host \n",
|
" ELSE host \n",
|
||||||
" END AS target, \n",
|
" END AS target, \n",
|
||||||
" SUM(host_count) AS value\n",
|
" SUM(host_count) AS value\n",
|
||||||
|
|
@ -249,7 +252,7 @@
|
||||||
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
|
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
|
||||||
" GROUP BY tld, host\n",
|
" GROUP BY tld, host\n",
|
||||||
")\n",
|
")\n",
|
||||||
"WHERE source != \"\"\n",
|
"WHERE source != \"\" AND target != 'other'\n",
|
||||||
"GROUP BY tld, target\n",
|
"GROUP BY tld, target\n",
|
||||||
"\"\"\")\n",
|
"\"\"\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -259,6 +262,61 @@
|
||||||
"with open('json/urls_l2.json', 'w') as f:\n",
|
"with open('json/urls_l2.json', 'w') as f:\n",
|
||||||
" f.write(json.dumps(q2))"
|
" f.write(json.dumps(q2))"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "1501cb06",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "ruby"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
|
||||||
|
" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"q = query_db(\"\"\"\n",
|
||||||
|
" SELECT \n",
|
||||||
|
" a.name AS author_name,\n",
|
||||||
|
" t.tag,\n",
|
||||||
|
" COUNT(*) AS tag_count\n",
|
||||||
|
"FROM authors a\n",
|
||||||
|
"JOIN post_authors pa ON a.id = pa.author_id\n",
|
||||||
|
"JOIN posttags pt ON pa.post_id = pt.post_id\n",
|
||||||
|
"JOIN tags t ON pt.tag_id = t.id\n",
|
||||||
|
"WHERE a.name = 'Antifa'\n",
|
||||||
|
"GROUP BY a.id, a.name, t.id, t.tag\n",
|
||||||
|
"ORDER BY tag_count DESC;\n",
|
||||||
|
"\"\"\")\n",
|
||||||
|
"\n",
|
||||||
|
"q"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue