Knack-Scraper/visualisation/tojson.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0ab5f064",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Libraries imported successfully!\n"
     ]
    }
   ],
   "source": [
    "import sqlite3\n",
    "from pathlib import Path\n",
    "import json\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "94b2e3d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tables in the database:\n",
      "  - posttags\n",
      "  - postcategories\n",
      "  - tags\n",
      "  - categories\n",
      "  - posts\n",
      "  - authors\n",
      "  - post_authors\n",
      "  - sqlite_sequence\n",
      "  - urls\n"
     ]
    }
   ],
   "source": [
    "# Connect to the database\n",
    "db_path = Path('../data/knack.sqlite')\n",
    "conn = sqlite3.connect(db_path)\n",
    "cursor = conn.cursor()\n",
    "\n",
    "# Get all table names\n",
    "cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
    "tables = cursor.fetchall()\n",
    "\n",
    "print(\"Tables in the database:\")\n",
    "for table in tables:\n",
    "    print(f\"  - {table[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b3924728",
   "metadata": {},
   "outputs": [],
   "source": [
    "def query_db(query, args=(), one=False):\n",
    "    cursor.execute(query, args)\n",
    "    r = [dict((cursor.description[i][0], value) \\\n",
    "               for i, value in enumerate(row)) for row in cursor.fetchall()]\n",
    "    return (r[0] if r else None) if one else r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c0fdb0ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = query_db('select tag, count(id) as count from tags inner join posttags on id = tag_id group by tag order by count desc limit 35')\n",
    "\n",
    "with open('json/tags.json', 'w') as file:\n",
    "    file.write(json.dumps(q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "df5c31b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = query_db('select category, count(id) as count from categories inner join postcategories on id = category_id group by category order by count desc limit 35;')\n",
    "\n",
    "with open('json/categories.json', 'w') as file:\n",
    "    file.write(json.dumps(q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "101b971d",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = query_db(\"\"\"\n",
    "SELECT\n",
    "             strftime('%Y-%m', date) AS month,\n",
    "             category,\n",
    "             COUNT(*) AS count\n",
    "FROM posts\n",
    "WHERE date > '2020-01-01' AND category NOT NULL\n",
    "GROUP BY strftime('%Y-%m', date), category\n",
    "ORDER BY month;\n",
    "             \"\"\")\n",
    "\n",
    "with open('json/posts_per_month.json', 'w') as file:\n",
    "    file.write(json.dumps(q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2f23046d",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = query_db(\"\"\"\n",
    "select name,\n",
    "    min(type) as type,\n",
    "    count(posts.id) as count\n",
    "from authors\n",
    "inner join post_authors on authors.id = author_id\n",
    "inner join posts on posts.id = post_id\n",
    "             \n",
    "where category NOT like '%Presseartikel%'\n",
    "             \n",
    "group by name\n",
    "             \n",
    "order by count desc\n",
    "limit 25\n",
    "\"\"\")\n",
    "\n",
    "with open('json/authors.json', 'w') as file:\n",
    "    file.write(json.dumps(q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d4ae65f1",
   "metadata": {
    "vscode": {
     "languageId": "ruby"
    }
   },
   "outputs": [],
   "source": [
    "tag_pairs = query_db(\"\"\"\n",
    "    SELECT t1.tag AS source,\n",
    "           t2.tag AS target,\n",
    "           COUNT(*) AS weight\n",
    "    FROM posttags pt1\n",
    "    JOIN posttags pt2\n",
    "        ON pt1.post_id = pt2.post_id\n",
    "       AND pt1.tag_id < pt2.tag_id\n",
    "    JOIN tags t1 ON t1.id = pt1.tag_id\n",
    "    JOIN tags t2 ON t2.id = pt2.tag_id\n",
    "    GROUP BY t1.tag, t2.tag\n",
    "    HAVING weight > 3\n",
    "    ORDER BY weight DESC;\n",
    "\"\"\")\n",
    "\n",
    "with open('json/tag_chords.json', 'w') as f:\n",
    "    f.write(json.dumps(tag_pairs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "13062474",
   "metadata": {
    "vscode": {
     "languageId": "ruby"
    }
   },
   "outputs": [],
   "source": [
    "q = query_db(\"\"\"\n",
    "select\n",
    "round(umap_x, 3) as umap_x,\n",
    "round(umap_y, 3) as umap_y,\n",
    "round(umap_z, 3) as umap_z,\n",
    "posts.id, title\n",
    "\n",
    "from posts\n",
    "inner join postcategories on post_id = posts.id\n",
    "inner join categories on category_id = categories.id\n",
    "where date > '2020-01-01' and categories.category IN ('Theorie und Diskussion', 'Praxis')\n",
    "\"\"\")\n",
    "\n",
    "with open('json/umap_embeddings.json', 'w') as f:\n",
    "    f.write(json.dumps(q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5378b17",
   "metadata": {
    "vscode": {
     "languageId": "ruby"
    }
   },
   "outputs": [],
   "source": [
    "q = query_db(\"\"\"\n",
    "SELECT \n",
    "'knack[punkt]news' AS source, \n",
    "CASE \n",
    "    WHEN tld_count < 10 THEN 'other'\n",
    "    ELSE tld \n",
    "END AS target, \n",
    "SUM(tld_count) AS value\n",
    "FROM (\n",
    "    SELECT tld, COUNT(*) as tld_count\n",
    "    FROM urls \n",
    "    WHERE tld IS NOT NULL \n",
    "    GROUP BY tld\n",
    ")\n",
    "GROUP BY target\n",
    "\"\"\")\n",
    "\n",
    "q2 = query_db(\"\"\"\n",
    "SELECT \n",
    "    tld AS source, \n",
    "    CASE \n",
    "        WHEN host_count < 15 THEN 'other'\n",
    "        ELSE host \n",
    "    END AS target, \n",
    "    SUM(host_count) AS value\n",
    "FROM (\n",
    "    SELECT tld, host, COUNT(*) as host_count\n",
    "    FROM urls \n",
    "    WHERE tld IS NOT NULL AND host IS NOT NULL \n",
    "    GROUP BY tld, host\n",
    ")\n",
    "WHERE source != \"\"\n",
    "GROUP BY tld, target\n",
    "\"\"\")\n",
    "\n",
    "with open('json/urls_l1.json', 'w') as f:\n",
    "    f.write(json.dumps(q))\n",
    "\n",
    "with open('json/urls_l2.json', 'w') as f:\n",
    "    f.write(json.dumps(q2))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "knack-viz",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}