{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0ab5f064",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Libraries imported successfully!\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import sqlite3\n",
    "from pathlib import Path\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "94b2e3d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tables in the database:\n",
      " - posttags\n",
      " - postcategories\n",
      " - tags\n",
      " - categories\n",
      " - posts\n",
      " - authors\n",
      " - post_authors\n",
      " - sqlite_sequence\n",
      " - urls\n"
     ]
    }
   ],
   "source": [
    "# Configuration: input database and output directory for the exported JSON files.\n",
    "db_path = Path('../data/knack.sqlite')\n",
    "out_dir = Path('json')\n",
    "\n",
    "# sqlite3.connect() creates a new, empty database when the file is missing,\n",
    "# which would only surface later as \"no such table\" errors. Fail early instead.\n",
    "if not db_path.is_file():\n",
    "    raise FileNotFoundError(f\"SQLite database not found: {db_path}\")\n",
    "\n",
    "# Every export cell below writes into out_dir, so make sure it exists.\n",
    "out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Connect to the database\n",
    "conn = sqlite3.connect(db_path)\n",
    "cursor = conn.cursor()\n",
    "\n",
    "# Get all table names as a quick sanity check of the schema\n",
    "cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
    "tables = cursor.fetchall()\n",
    "\n",
    "print(\"Tables in the database:\")\n",
    "for table in tables:\n",
    "    print(f\" - {table[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b3924728",
   "metadata": {},
   "outputs": [],
   "source": [
    "def query_db(query, args=(), one=False):\n",
    "    \"\"\"Run a SQL statement and return its rows as dicts keyed by column name.\n",
    "\n",
    "    Args:\n",
    "        query: SQL text; may contain ``?`` placeholders.\n",
    "        args: Values bound to the placeholders.\n",
    "        one: If true, return only the first row, or None when there are no rows.\n",
    "\n",
    "    Returns:\n",
    "        A list of ``{column: value}`` dicts, or a single dict / None when\n",
    "        ``one`` is set.\n",
    "    \"\"\"\n",
    "    # A fresh cursor per call keeps queries from clobbering the shared `cursor`.\n",
    "    cur = conn.execute(query, args)\n",
    "    # description is None for statements that produce no result set.\n",
    "    columns = [col[0] for col in cur.description or ()]\n",
    "    rows = [dict(zip(columns, row)) for row in cur.fetchall()]\n",
    "    if one:\n",
    "        return rows[0] if rows else None\n",
    "    return rows\n",
    "\n",
    "\n",
    "def write_json(filename, rows):\n",
    "    \"\"\"Serialise ``rows`` as JSON into ``out_dir / filename``.\"\"\"\n",
    "    with open(out_dir / filename, 'w', encoding='utf-8') as file:\n",
    "        file.write(json.dumps(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c0fdb0ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The 35 most used tags, by number of tagged posts.\n",
    "top_tags = query_db(\"\"\"\n",
    "select tag, count(id) as count\n",
    "from tags\n",
    "inner join posttags on id = tag_id\n",
    "group by tag\n",
    "order by count desc\n",
    "limit 35\n",
    "\"\"\")\n",
    "\n",
    "write_json('tags.json', top_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "df5c31b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The 35 most used categories, by number of categorised posts.\n",
    "top_categories = query_db(\"\"\"\n",
    "select category, count(id) as count\n",
    "from categories\n",
    "inner join postcategories on id = category_id\n",
    "group by category\n",
    "order by count desc\n",
    "limit 35;\n",
    "\"\"\")\n",
    "\n",
    "write_json('categories.json', top_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "101b971d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Posts per month and category, for posts dated after 2020-01-01.\n",
    "posts_per_month = query_db(\"\"\"\n",
    "SELECT\n",
    "    strftime('%Y-%m', date) AS month,\n",
    "    category,\n",
    "    COUNT(*) AS count\n",
    "FROM posts\n",
    "WHERE date > '2020-01-01' AND category IS NOT NULL\n",
    "GROUP BY strftime('%Y-%m', date), category\n",
    "ORDER BY month;\n",
    "\"\"\")\n",
    "\n",
    "write_json('posts_per_month.json', posts_per_month)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2f23046d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The 25 authors with the most posts, ignoring posts whose category\n",
    "# contains 'Presseartikel'.\n",
    "top_authors = query_db(\"\"\"\n",
    "select name,\n",
    "       min(type) as type,\n",
    "       count(posts.id) as count\n",
    "from authors\n",
    "inner join post_authors on authors.id = author_id\n",
    "inner join posts on posts.id = post_id\n",
    "where category NOT like '%Presseartikel%'\n",
    "group by name\n",
    "order by count desc\n",
    "limit 25\n",
    "\"\"\")\n",
    "\n",
    "write_json('authors.json', top_authors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d4ae65f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag co-occurrence: each unordered pair of tags that share more than one post.\n",
    "# `pt1.tag_id < pt2.tag_id` counts every pair once and skips self-pairs.\n",
    "tag_pairs = query_db(\"\"\"\n",
    "SELECT t1.tag AS source,\n",
    "       t2.tag AS target,\n",
    "       COUNT(*) AS weight\n",
    "FROM posttags pt1\n",
    "JOIN posttags pt2\n",
    "  ON pt1.post_id = pt2.post_id\n",
    " AND pt1.tag_id < pt2.tag_id\n",
    "JOIN tags t1 ON t1.id = pt1.tag_id\n",
    "JOIN tags t2 ON t2.id = pt2.tag_id\n",
    "GROUP BY t1.tag, t2.tag\n",
    "HAVING weight > 1\n",
    "ORDER BY weight DESC;\n",
    "\"\"\")\n",
    "\n",
    "write_json('tag_chords.json', tag_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "13062474",
   "metadata": {},
   "outputs": [],
   "source": [
    "# UMAP coordinates per post, scaled by 10 and truncated to integers, plus the\n",
    "# category id and the first 12 characters of the title.\n",
    "umap_points = query_db(\"\"\"\n",
    "select\n",
    "    cast(umap_x*10 as int) as x,\n",
    "    cast(umap_y*10 as int) as y,\n",
    "    cast(umap_z*10 as int) as z,\n",
    "    posts.id as id, category_id as c,\n",
    "    substr(title, 1, 12) as t\n",
    "from posts\n",
    "inner join postcategories on post_id = posts.id\n",
    "inner join categories on category_id = categories.id\n",
    "\"\"\")\n",
    "\n",
    "# Optional filter, currently disabled:\n",
    "# where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
    "\n",
    "write_json('umap_embeddings.json', umap_points)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e5378b17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Level 1: link counts from the site to each TLD; TLDs with fewer than\n",
    "# 10 links are merged into 'other'.\n",
    "url_tlds = query_db(\"\"\"\n",
    "SELECT\n",
    "    'knack[punkt]news' AS source,\n",
    "    CASE\n",
    "        WHEN tld_count < 10 THEN 'other'\n",
    "        ELSE tld\n",
    "    END AS target,\n",
    "    SUM(tld_count) AS value\n",
    "FROM (\n",
    "    SELECT tld, COUNT(*) as tld_count\n",
    "    FROM urls\n",
    "    WHERE tld IS NOT NULL\n",
    "    GROUP BY tld\n",
    ")\n",
    "GROUP BY target\n",
    "\"\"\")\n",
    "\n",
    "# Level 2: link counts from each TLD to its hosts; hosts with fewer than\n",
    "# 10 links would be labelled 'other' and are filtered out again.\n",
    "url_hosts = query_db(\"\"\"\n",
    "SELECT\n",
    "    tld AS source,\n",
    "    CASE\n",
    "        WHEN host_count < 10 THEN 'other'\n",
    "        ELSE host\n",
    "    END AS target,\n",
    "    SUM(host_count) AS value\n",
    "FROM (\n",
    "    SELECT tld, host, COUNT(*) as host_count\n",
    "    FROM urls\n",
    "    WHERE tld IS NOT NULL AND host IS NOT NULL\n",
    "    GROUP BY tld, host\n",
    ")\n",
    "WHERE source != '' AND target != 'other'\n",
    "GROUP BY tld, target\n",
    "\"\"\")\n",
    "\n",
    "write_json('urls_l1.json', url_tlds)\n",
    "write_json('urls_l2.json', url_hosts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1501cb06",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
       " {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
       " {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
       " {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
       " {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
       " {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
       " {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
       " {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
       " {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
       " {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
       " {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tag usage of a single author, most frequent first (inspection only, no export).\n",
    "author_tags = query_db(\"\"\"\n",
    "SELECT\n",
    "    a.name AS author_name,\n",
    "    t.tag,\n",
    "    COUNT(*) AS tag_count\n",
    "FROM authors a\n",
    "JOIN post_authors pa ON a.id = pa.author_id\n",
    "JOIN posttags pt ON pa.post_id = pt.post_id\n",
    "JOIN tags t ON pt.tag_id = t.id\n",
    "WHERE a.name = 'Antifa'\n",
    "GROUP BY a.id, a.name, t.id, t.tag\n",
    "ORDER BY tag_count DESC;\n",
    "\"\"\")\n",
    "\n",
    "author_tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f3a7c21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release the database handle once all exports are done.\n",
    "# Re-run the connection cell before re-running any query cell.\n",
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "knack-viz",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}