forked from lukaszett/Knack-Scraper
343 lines
9.2 KiB
Text
343 lines
9.2 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "0ab5f064",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Libraries imported successfully!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import sqlite3\n",
|
|
"from pathlib import Path\n",
|
|
"import json\n",
|
|
"\n",
|
|
"print(\"Libraries imported successfully!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "94b2e3d9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Tables in the database:\n",
|
|
" - posttags\n",
|
|
" - postcategories\n",
|
|
" - tags\n",
|
|
" - categories\n",
|
|
" - posts\n",
|
|
" - authors\n",
|
|
" - post_authors\n",
|
|
" - sqlite_sequence\n",
|
|
" - urls\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Open the scraped SQLite database and list its tables as a quick sanity check.\n",
"db_path = Path('../data/knack.sqlite')\n",
"\n",
"# sqlite3.connect() creates a new, empty database file when the path does not\n",
"# exist, which would only show up later as 'no such table' errors in the\n",
"# query cells. Fail fast with an explicit message instead.\n",
"if not db_path.is_file():\n",
"    raise FileNotFoundError(f\"SQLite database not found: {db_path.resolve()}\")\n",
"\n",
"conn = sqlite3.connect(db_path)\n",
"cursor = conn.cursor()  # module-level cursor; query_db() in a later cell executes on it\n",
"\n",
"# sqlite_master has one row per schema object; keep only the tables.\n",
"cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
"tables = cursor.fetchall()\n",
"\n",
"print(\"Tables in the database:\")\n",
"for table in tables:\n",
"    print(f\" - {table[0]}\")"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "b3924728",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def query_db(query, args=(), one=False):\n",
"    \"\"\"Execute `query` on the notebook-level `cursor` and return rows as dicts.\n",
"\n",
"    Args:\n",
"        query: SQL statement to execute.\n",
"        args: Values bound to the statement's placeholders.\n",
"        one: When true, return only the first row, or None if there is none.\n",
"\n",
"    Returns:\n",
"        A list of dicts keyed by result column name; with `one` set, a single\n",
"        dict or None.\n",
"    \"\"\"\n",
"    cursor.execute(query, args)\n",
"    fetched = cursor.fetchall()\n",
"    # description may be None (presumably for statements without a result set),\n",
"    # so fall back to an empty column list instead of iterating over None.\n",
"    column_names = [column[0] for column in cursor.description or ()]\n",
"    records = [dict(zip(column_names, row)) for row in fetched]\n",
"    if not one:\n",
"        return records\n",
"    return records[0] if records else None"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "c0fdb0ba",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Export the 35 tags with the most post associations, with their counts.\n",
"top_tags_sql = 'select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35'\n",
"q = query_db(top_tags_sql)\n",
"\n",
"with open('json/tags.json', 'w') as tags_file:\n",
"    json.dump(q, tags_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "df5c31b3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Export the 35 categories with the most post associations, with their counts.\n",
"top_categories_sql = 'select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;'\n",
"q = query_db(top_categories_sql)\n",
"\n",
"with open('json/categories.json', 'w') as categories_file:\n",
"    json.dump(q, categories_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "101b971d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Count posts per (month, category) for rows whose date compares greater than\n",
"# '2020-01-01' and whose category is set, ordered by month.\n",
"posts_per_month_sql = \"\"\"\n",
"SELECT\n",
" strftime('%Y-%m', date) AS month,\n",
" category,\n",
" COUNT(*) AS count\n",
"FROM posts\n",
"WHERE date > '2020-01-01' AND category NOT NULL\n",
"GROUP BY strftime('%Y-%m', date), category\n",
"ORDER BY month;\n",
" \"\"\"\n",
"q = query_db(posts_per_month_sql)\n",
"\n",
"with open('json/posts_per_month.json', 'w') as out_file:\n",
"    json.dump(q, out_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "2f23046d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Export the 25 author names with the most joined posts, skipping posts whose\n",
"# category contains 'Presseartikel'. min(type) picks one type value per name.\n",
"top_authors_sql = \"\"\"\n",
"select name,\n",
" min(type) as type,\n",
" count(posts.id) as count\n",
"from authors\n",
"inner join post_authors on authors.id = author_id\n",
"inner join posts on posts.id = post_id\n",
" \n",
"where category NOT like '%Presseartikel%'\n",
" \n",
"group by name\n",
" \n",
"order by count desc\n",
"limit 25\n",
"\"\"\"\n",
"q = query_db(top_authors_sql)\n",
"\n",
"with open('json/authors.json', 'w') as authors_file:\n",
"    json.dump(q, authors_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "d4ae65f1",
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "ruby"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
"# Build weighted tag pairs: two tags are linked when they appear on the same\n",
"# post. The pt1.tag_id < pt2.tag_id condition keeps each unordered pair once,\n",
"# and HAVING drops pairs that co-occur only a single time.\n",
"tag_pairs_sql = \"\"\"\n",
" SELECT t1.tag AS source,\n",
" t2.tag AS target,\n",
" COUNT(*) AS weight\n",
" FROM posttags pt1\n",
" JOIN posttags pt2\n",
" ON pt1.post_id = pt2.post_id\n",
" AND pt1.tag_id < pt2.tag_id\n",
" JOIN tags t1 ON t1.id = pt1.tag_id\n",
" JOIN tags t2 ON t2.id = pt2.tag_id\n",
" GROUP BY t1.tag, t2.tag\n",
" HAVING weight > 1\n",
" ORDER BY weight DESC;\n",
"\"\"\"\n",
"tag_pairs = query_db(tag_pairs_sql)\n",
"\n",
"with open('json/tag_chords.json', 'w') as chords_file:\n",
"    json.dump(tag_pairs, chords_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "13062474",
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "ruby"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
"# Export one record per (post, category) row: UMAP coordinates scaled by 10 and\n",
"# cast to int, the post id, the category id and the first 12 title characters.\n",
"umap_sql = \"\"\"\n",
"select\n",
"cast(umap_x*10 as int) as x,\n",
"cast(umap_y*10 as int) as y,\n",
"cast(umap_z*10 as int) as z,\n",
"posts.id as id, category_id as c,\n",
"SUBSTRING(title, 1, 12) as t\n",
"\n",
"from posts\n",
"inner join postcategories on post_id = posts.id\n",
"inner join categories on category_id = categories.id\n",
"\n",
"\"\"\"\n",
"q = query_db(umap_sql)\n",
"\n",
"# Filter that is currently disabled (kept for reference):\n",
"#where date > '2020-01-01' and categories.category NOT IN ('Presseartikel')\n",
"\n",
"with open('json/umap_embeddings.json', 'w') as embeddings_file:\n",
"    json.dump(q, embeddings_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "e5378b17",
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "ruby"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
"# Level 1 links: from the site to each TLD found in the urls table. TLDs seen\n",
"# fewer than 10 times are folded into a single 'other' target.\n",
"q = query_db(\"\"\"\n",
"SELECT \n",
"'knack[punkt]news' AS source, \n",
"CASE \n",
" WHEN tld_count < 10 THEN 'other'\n",
" ELSE tld \n",
"END AS target, \n",
"SUM(tld_count) AS value\n",
"FROM (\n",
" SELECT tld, COUNT(*) as tld_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL \n",
" GROUP BY tld\n",
")\n",
"GROUP BY target\n",
"\"\"\")\n",
"\n",
"# Level 2 links: from each TLD to its hosts. Hosts seen fewer than 10 times\n",
"# map to 'other' and are then filtered out, as are rows with an empty TLD.\n",
"# The empty string is written as '' because SQLite treats \"\" as an identifier\n",
"# and only accepts it as a string through a legacy fallback that some builds\n",
"# disable.\n",
"q2 = query_db(\"\"\"\n",
"SELECT \n",
" tld AS source, \n",
" CASE \n",
" WHEN host_count < 10 THEN 'other'\n",
" ELSE host \n",
" END AS target, \n",
" SUM(host_count) AS value\n",
"FROM (\n",
" SELECT tld, host, COUNT(*) as host_count\n",
" FROM urls \n",
" WHERE tld IS NOT NULL AND host IS NOT NULL \n",
" GROUP BY tld, host\n",
")\n",
"WHERE source != '' AND target != 'other'\n",
"GROUP BY tld, target\n",
"\"\"\")\n",
"\n",
"with open('json/urls_l1.json', 'w') as f:\n",
"    f.write(json.dumps(q))\n",
"\n",
"with open('json/urls_l2.json', 'w') as f:\n",
"    f.write(json.dumps(q2))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "1501cb06",
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "ruby"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[{'author_name': 'Antifa', 'tag': 'Antifaschismus', 'tag_count': 9},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Antirassismus', 'tag_count': 4},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Antisemitismus', 'tag_count': 4},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Feminismus', 'tag_count': 3},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Recherche', 'tag_count': 3},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Antisexismus', 'tag_count': 3},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Repression', 'tag_count': 2},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Diskussion', 'tag_count': 2},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Medien', 'tag_count': 2},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Solidarität', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Freiräume', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Knast', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Militanz', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Nationalsozialismus', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Auswertung', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Demonstration', 'tag_count': 1},\n",
|
|
" {'author_name': 'Antifa', 'tag': 'Krieg', 'tag_count': 1}]"
|
|
]
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
"# Tag usage counts for posts attributed to the author named 'Antifa', most\n",
"# frequent first. The result is only displayed, not written to disk.\n",
"author_tags_sql = \"\"\"\n",
" SELECT \n",
" a.name AS author_name,\n",
" t.tag,\n",
" COUNT(*) AS tag_count\n",
"FROM authors a\n",
"JOIN post_authors pa ON a.id = pa.author_id\n",
"JOIN posttags pt ON pa.post_id = pt.post_id\n",
"JOIN tags t ON pt.tag_id = t.id\n",
"WHERE a.name = 'Antifa'\n",
"GROUP BY a.id, a.name, t.id, t.tag\n",
"ORDER BY tag_count DESC;\n",
"\"\"\"\n",
"q = query_db(author_tags_sql)\n",
"\n",
"q"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "knack-viz",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.14"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|