491 lines
14 KiB
Text
491 lines
14 KiB
Text
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "cce66876",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Interface Presseportal"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "f12d7022",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Das Presseportal bietet eine Plattform, bei der mittels GET-Requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...) in bestimmten Zeiträumen und gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "b07aef9f",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "258338d0",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Da eine große Menge an Pressemitteilungen angefragt wird und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"id": "b07fac3c",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import requests\n",
|
|||
|
"import calendar\n",
|
|||
|
"import time\n",
|
|||
|
"import os\n",
|
|||
|
"import csv\n",
|
|||
|
"\n",
|
|||
|
"from tqdm.notebook import tqdm\n",
|
|||
|
"from datetime import datetime\n",
|
|||
|
"from bs4 import BeautifulSoup"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "0dfce15a",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"id": "6c0b30a8",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
class Pressemitteilung:
    """A single press release as extracted from a result page.

    Attributes:
        article_id: Identifier of the article.
        timestamp: Publication time (may be None).
        location: Location / topic label (may be None).
        text: Teaser text of the press release.
        bundesland: Federal state the search was restricted to (may be None).
    """

    def __init__(self, article_id, timestamp, location, text, bundesland):
        """Store the given values unchanged under attributes of the same name."""
        self.article_id, self.timestamp = article_id, timestamp
        self.location, self.bundesland = location, bundesland
        self.text = text

    def __str__(self):
        """Return a one-line summary ending with the first six words of the text."""
        preview = ' '.join(self.text.split()[:6])
        return f"[{self.article_id}] {self.timestamp} {self.location} | {preview}"

    def to_row(self):
        """Return the attributes as a list in CSV column order."""
        columns = ("article_id", "timestamp", "location", "bundesland", "text")
        return [getattr(self, column) for column in columns]
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "63cceebe",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Konstanten und Pfade**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"id": "8bcc877f",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
# Browser-like User-Agent string sent with every HTTP request.
_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
)

# Headers passed to requests.get by requests_get.
REQUEST_HEADERS = {"User-Agent": _USER_AGENT}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"id": "c637ac38",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
# Folder (relative to the notebook) where the daily CSV files are written and read.
DATA_FOLDER = os.path.join(os.pardir, "data")
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"id": "f094dee0",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
# Region slugs used as the `location` part of the request URL, one per line.
BUNDESLAENDER = """
    baden-wuerttemberg
    bayern
    berlin-brandenburg
    bremen
    hamburg
    hessen
    mecklenburg-vorpommern
    niedersachsen
    nordrhein-westfalen
    rheinland-pfalz
    saarland
    sachsen
    sachsen-anhalt
    schleswig-holstein
    thueringen
""".split()
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"id": "84632391",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def requests_get(request, timeout=30):
    """Send a GET request with the notebook's default headers.

    Args:
        request (str): URL to fetch.
        timeout (float or tuple, default=30): Seconds to wait for the server
            before giving up. Without a timeout a stalled connection would
            block the whole scraping run indefinitely.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    return requests.get(request, headers=REQUEST_HEADERS, timeout=timeout)
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"id": "1af0bdbd",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def extract_response(response, bundesland=None):
    """Extract all press releases from the response of a request.

    Args:
        response: Response object whose ``content`` holds the HTML of a
            result page.
        bundesland (str, default=None): Can be passed along if it was part of
            the search; it is stored unchanged on every press release.

    Returns:
        list of Pressemitteilung: One entry per ``<article>`` element that
        carries a ``data-url`` attribute, in document order.

    Raises:
        ValueError: If a date element is present but its text does not match
            the expected ``DD.MM.YYYY – HH:MM`` format.
    """
    mitteilungen = []

    soup = BeautifulSoup(response.content, 'html.parser')
    for article in soup.find_all('article'):
        data_url = article.get('data-url')
        if not data_url:
            # Without a data-url no article id can be derived; skip the
            # element instead of aborting the whole page with a KeyError.
            continue

        # The id is built from the last two path segments of the article URL.
        article_id = '-'.join(data_url.split('/')[-2:])

        # Date and location are looked up inside the first <div> of the
        # article; an article without any <div> simply has neither.
        meta = article.find('div')
        date_tag = meta.find(class_="date") if meta is not None else None
        topic_tag = meta.find(class_="news-topic") if meta is not None else None

        if date_tag is not None:
            # Strip surrounding whitespace so padded markup still parses.
            timestamp = datetime.strptime(date_tag.text.strip(), '%d.%m.%Y – %H:%M')
        else:
            timestamp = None

        location_str = topic_tag.text if topic_tag is not None else None

        # The second <p> of the article is used as text; fall back to an
        # empty string when there are fewer than two paragraphs.
        p_texts = article.find_all('p')
        text = p_texts[1].text if len(p_texts) > 1 else ''

        mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))

    return mitteilungen
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "c62c06c9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def create_get_request(*, site=1, location=None, start_date=None, end_date=None):
    """Simulate an API: build a Presseportal URL from parameters.

    Args:
        site (int, default=1): 1-based number of the result page. The URL
            carries it as an offset in steps of 30, so page 1 has no offset,
            page 2 is ``/30``, page 3 is ``/60`` and so on. Values below 2
            yield the first page.
        location (str, default=None): Federal state or city slug.
        start_date (str, default=None): Value for the ``startDate`` parameter.
        end_date (str, default=None): Value for the ``endDate`` parameter.

    Returns:
        str: URL
    """
    url = "https://www.presseportal.de/blaulicht/d/polizei"

    if location is not None:
        url += f"/l/{location}"

    if site > 1:
        # Offset of the first result on this page. Using site*30 here would
        # make the /30 page unreachable and skip results 31-60.
        url += f"/{(site - 1) * 30}"

    # Collect only the parameters that were given, then join them once.
    params = []
    if start_date is not None:
        params.append(f"startDate={start_date}")
    if end_date is not None:
        params.append(f"endDate={end_date}")

    if params:
        url += "?" + "&".join(params)

    return url
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "1c67c9bc",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Beispiel: Hamburg "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "aff924d6",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
# Example: URL for result page 3 of Hamburg within a fixed date range.
url = create_get_request(
    site=3,
    location="hamburg",
    start_date="2021-01-13",
    end_date="2021-03-20",
)
url
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "6e2b9091",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n",
|
|||
|
"[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n",
|
|||
|
"[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n",
|
|||
|
"[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n",
|
|||
|
"[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
# Fetch the example page and print the first five press releases found on it.
for mitteilung in extract_response(requests_get(url), bundesland=None)[:5]:
    print(str(mitteilung))
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "e50af557",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Effizientes Einlesen"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "b4a9580a",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Um die Daten sinnvoll zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1 Sek. / Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"id": "da927e30",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):
    """Collect all press releases of one federal state for one specific day.

    Result pages are requested one after another, starting at page 1, until a
    page yields no press releases. Between two requests the function sleeps
    for one second.

    Args:
        year (int): Year.
        month (int): Month.
        day (int): Day.
        bundesland (str): Location slug passed to the URL builder; also used
            as the label of the progress bar.

    Returns:
        list of Pressemitteilung: Press releases of all non-empty pages, in
        page order.
    """
    # Start and end of the search window are the same day.
    date_str = datetime(year, month, day).strftime("%Y-%m-%d")

    meldungen = []
    site = 1

    # The context manager closes the progress bar even if a request or the
    # parsing raises.
    with tqdm(desc=bundesland) as pbar:
        while True:
            request = create_get_request(
                site=site, location=bundesland, start_date=date_str, end_date=date_str,
            )
            new_meldungen = extract_response(requests_get(request), bundesland=bundesland)

            if not new_meldungen:
                # An empty page marks the end of the results.
                break

            meldungen.extend(new_meldungen)
            pbar.update(1)

            site += 1
            time.sleep(1)  # pause before requesting the next page

    return meldungen
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"id": "85508758",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def get_meldungen_for_date(year, month, day):
    """Extract all press releases for one day, grouped by federal state.

    Args:
        year (int): Year
        month (int): Month
        day (int): Day

    Returns:
        dict: Maps every entry of ``BUNDESLAENDER`` (in list order) to the
        list of press releases found for it on that day.
    """
    return {
        land: _get_meldungen_for_date_and_bundesland(year, month, day, land)
        for land in BUNDESLAENDER
    }
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "f938d8a9",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Speichern der Daten in CSV-Dateien"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "67374d3b",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"id": "276e700d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def store_meldungen_in_csv(year, month, day):
    """Store all press releases for one date in a CSV file named after the date.

    The file is written to ``DATA_FOLDER`` as
    ``<year>-<month>-<day>_presseportal.csv`` and starts with a header row.
    An existing file of the same name is overwritten.

    Args:
        year (int): Year.
        month (int): Month.
        day (int): Day.
    """
    filename = f"{year}-{month}-{day}_presseportal.csv"
    path = os.path.join(DATA_FOLDER, filename)

    # Create the target folder before scraping: otherwise a missing folder
    # would only surface after all requests for the day have been made.
    os.makedirs(DATA_FOLDER, exist_ok=True)

    meldungen_per_bundesland = get_meldungen_for_date(year, month, day)

    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])

        # The federal state is already part of each row, so only the values
        # of the mapping are needed.
        for meldungen in meldungen_per_bundesland.values():
            writer.writerows(meldung.to_row() for meldung in meldungen)

    print(f"File '{filename}' created")
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"id": "c5d0bdbd",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
def store_month(year, month):
    """Write one CSV file per day for every day of the given month.

    Args:
        year (int): Year.
        month (int): Month (1-12).
    """
    # monthrange returns (weekday of the first day, number of days in month).
    _, days_in_month = calendar.monthrange(year, month)

    for day in range(1, days_in_month + 1):
        store_meldungen_in_csv(year, month, day)
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "d9f3e24b",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Auswertung: Wie viele Einträge pro Bundesland?"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "9f600d3c",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"id": "b7c85078",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
# Count the stored press releases per federal state across all daily CSV files.
counter = {}

for filename in os.listdir(DATA_FOLDER):
    if not filename.endswith("_presseportal.csv"):
        continue

    path = os.path.join(DATA_FOLDER, filename)

    # newline='' is required so that line breaks inside quoted fields
    # (the content column) are parsed correctly by the csv module.
    with open(path, 'r', newline='', encoding='UTF8') as f_in:
        reader = csv.reader(f_in)
        next(reader, None)  # skip the header row; tolerate an empty file

        for row in reader:
            if len(row) <= 3:
                # Blank or truncated line: no bundesland column to count.
                continue
            bundesland = row[3]
            counter[bundesland] = counter.get(bundesland, 0) + 1

# Show the result as the cell output.
counter
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "python-scientific kernel",
|
|||
|
"language": "python",
|
|||
|
"name": "python-scientific"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.10.9"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|