Knack-Scraper/transform/to_d3_node.py

102 lines
No EOL
4.4 KiB
Python

"""Node to query data from the database and generate individual json file
for visualisations in the d3.js framework"""
import sqlite3
import logging
import json
import os
from pipeline import TransformContext
from transform_node import TransformNode
logger = logging.getLogger("knack-transform")
class ToD3Node(TransformNode):
    """Transform node that exports SQL query results as JSON files for d3.js.

    ``self.queries`` maps an output name to a SQL statement. ``run`` executes
    every statement against the supplied SQLite connection and writes the
    rows to ``<output_path>/<name>.json`` as a JSON array of objects keyed by
    column name.
    """

    def __init__(self, output_path: str):
        """Store the output directory and define the export queries.

        Args:
            output_path: Directory that receives the generated JSON files.
                A trailing path separator is optional.
        """
        self.output_path = output_path
        # One JSON file is produced per entry; the key becomes the file name.
        self.queries = {
            'authors': 'select name, min(type) as type, count(posts.id) as count from authors inner join post_authors on authors.id = author_id inner join posts on posts.id = post_id group by name order by count desc limit 25;',
            'categories': "select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;",
            'posts_per_month': "SELECT strftime('%Y-%m', date) AS month, category, COUNT(*) AS count FROM posts WHERE date > '2020-01-01' AND category NOT NULL GROUP BY strftime('%Y-%m', date), category ORDER BY month;",
            'tag_chords': "SELECT t1.tag AS source, t2.tag AS target, COUNT(*) AS weight FROM posttags pt1 JOIN posttags pt2 ON pt1.post_id = pt2.post_id AND pt1.tag_id < pt2.tag_id JOIN tags t1 ON t1.id = pt1.tag_id JOIN tags t2 ON t2.id = pt2.tag_id GROUP BY t1.tag, t2.tag HAVING weight > 1 ORDER BY weight DESC;",
            'tags': 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35;',
            'urls_l1': "SELECT 'knack[punkt]news' AS source, CASE WHEN tld_count < 10 THEN 'other' ELSE tld END AS target, SUM(tld_count) AS value FROM (SELECT tld, COUNT(*) as tld_count FROM urls WHERE tld IS NOT NULL GROUP BY tld ) GROUP BY target;",
            'urls_l2': "SELECT tld AS source, CASE WHEN host_count < 10 THEN 'other' ELSE host END AS target, SUM(host_count) AS value FROM (SELECT tld, host, COUNT(*) as host_count FROM urls WHERE tld IS NOT NULL AND host IS NOT NULL GROUP BY tld, host) WHERE source != '' AND target != 'other' GROUP BY tld, target"
        }
        super().__init__()
        logger.info(f"Init ToD3Node, Storing files to {self.output_path}")

    def _query_db(self, con: sqlite3.Connection, query: str) -> list:
        """Run ``query`` and return its rows as dicts keyed by column name.

        Args:
            con: Open SQLite connection to execute against.
            query: SQL statement to execute.

        Returns:
            A list with one ``{column_name: value}`` dict per result row.
        """
        cursor = con.cursor()
        try:
            cursor.execute(query)
            # description is None for statements that return no rows; fall
            # back to an empty tuple so such statements still yield [].
            columns = [column[0] for column in cursor.description or ()]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
        finally:
            # Release the cursor even when execute() raises.
            cursor.close()

    def _calculate_files(self, con: sqlite3.Connection) -> int:
        """Execute every configured query and write each result to a file.

        Args:
            con: Open SQLite connection to execute against.

        Returns:
            The number of configured queries, i.e. files written.
        """
        for name, query in self.queries.items():
            rows = self._query_db(con, query)
            # os.path.join keeps the file inside output_path whether or not
            # the caller supplied a trailing separator.
            target = os.path.join(self.output_path, f'{name}.json')
            with open(target, 'w', encoding='utf-8') as f:
                f.write(json.dumps(rows))
        return len(self.queries)

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Generate one JSON file per configured query.

        Creates the output directory (including missing parents) when it
        does not exist yet, then writes the query results into it.

        Args:
            con (sqlite3.Connection): SQLite database connection.
            context (TransformContext): Pipeline context. This node neither
                reads nor modifies it.

        Returns:
            The ``context`` object that was passed in, unchanged.
        """
        logger.info("Starting ToD3Node transformation")
        if not os.path.isdir(self.output_path):
            logger.warning("output_dir does not exist, creating dir...")
            # makedirs also creates missing parent directories; exist_ok
            # tolerates another process creating the directory meanwhile.
            os.makedirs(self.output_path, exist_ok=True)
        count = self._calculate_files(con)
        logger.info(f"Successfully generated {count} json files.")
        return context
def main(
    db_path: str = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite",
    output_path: str = '/Users/linussilberstein/Documents/Knack-Scraper/data/json/',
):
    """Run the ToD3Node once against a SQLite database as a standalone script.

    Configures logging to stdout, opens the database, runs the node with an
    empty ``TransformContext`` and always closes the connection afterwards.

    Args:
        db_path: Path of the SQLite database to read. Defaults to the
            original developer-machine location.
        output_path: Directory that receives the JSON files. Defaults to
            the original developer-machine location.

    Raises:
        Exception: Any error raised during the transformation is logged
            with its traceback and then re-raised.
    """
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )

    # Connect to database
    con = sqlite3.connect(db_path)
    try:
        context = TransformContext(None)
        node = ToD3Node(output_path)
        node.run(con, context)
    except Exception as e:
        # Log with traceback, then propagate so the failure stays visible.
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()
# Only run the export when executed as a script, not when imported.
if __name__ == "__main__":
    main()