This commit is contained in:
Peter Kannewitz 2023-03-26 18:36:49 +02:00
commit 8d3c8b3974
77 changed files with 682928 additions and 0 deletions

View file

@ -0,0 +1,20 @@
# CopBird Group 16
## Tools
* [Python](https://www.python.org/) Version >= 3.8
* [Wekan](https://wekan.github.io/) als Arbeitsgrundlage mit einem Gruppen-Wekan-Board sowie Gesamt-Boards
* [Matrix](https://matrix.org/) zur Kommunikation
Jupyter Notebook verwendet Kernels. Um ein Environment als Kernel zu verwenden, gibt es folgende
Anleitung: https://queirozf.com/entries/jupyter-kernels-how-to-add-change-remove
## Daten
Die Tweets können nicht öffentlich gemacht werden, jedoch sind die Pressemitteilungen und Sentiment-Wörter unter data/ zu finden.
## Ergebnisse
Die wichtigsten Ergebnisse befinden sich in der PDF-Datei [Presse-vs.-Twitter](Presse-vs.-Twitter.pdf).
In den Notebooks finden sich zusätzliche Details zur Datenextraktion und Analyse.

Binary file not shown.

View file

@ -0,0 +1,164 @@
Polizei Account Name Typ Bundesland Stadt LAT LONG
bpol_11 Bundespolizei Spezialkräfte Bundespolizei - - -
bpol_bepo Bundesbereitschaftspolizei Bundesbereitschaftspolizei - - - -
bpol_air_fra Bundespolizei Flughafen Frankfurt am Main Bundespolizei Hessen Frankfurt am Main 50.1109221 8.6821267
bpol_b Bundespolizei Berlin Bundespolizei Berlin Berlin 52.520007 13.404954
bpol_b_einsatz Bundespolizei Berlin Einsatz Bundespolizei Berlin Berlin 52.520007 13.404954
bpol_bw Bundespolizei Baden-Württemberg Bundespolizei Baden-Württemberg Böblingen 48.681331 9.008830
bpol_by Bundespolizei Bayern Bundespolizei Bayern München 48.135125 11.581981
bpol_koblenz Bundespolizei Koblenz Bundespolizei Rheinland-Pfalz Koblenz 50.356943 7.588996
bpol_kueste Bundespolizei Küste Bundespolizei Schleswig-Holstein Bad Bramstedt 53.919582 9.882173
bpol_nord Bundespolizei Nord Bundespolizei Niedersachsen Hannover 52.3744779 9.7385532
bpol_nrw Bundespolizei NRW Bundespolizei Nordrhein-Westfalen Sankt Augustin 50.769478 7.187579
bpol_pir Bundespolizei Mitteldeutschland Bundespolizei Sachsen-Anhalt Magdeburg 52.120533 11.627624
bremenpolizei Polizei Bremen Polizei Bremen Bremen 53.079296 8.801694
lkabawue Landeskriminalamt BW Landeskriminalamt Baden-Württemberg Stuttgart 48.775846 9.182932
lka_bayern Bayerisches Landeskriminalamt Landeskriminalamt Bayern München 48.135125 11.581981
lka_hessen Hessisches Landeskriminalamt Landeskriminalamt Hessen Wiesbaden 50.0820384 8.2416556
lka_rlp Landeskriminalamt Rheinland-Pfalz Landeskriminalamt Rheinland-Pfalz Mainz 49.992862 8.247253
pol_grafschaft Polizei Grf Bentheim Polizei Niedersachsen Nordhorn 52.429580 7.068571
polizeiaalen Polizei Aalen Polizei Baden-Württemberg Aalen 48.836689 10.097116
polizei_aur_wtm Polizei Aurich / WTM Polizei Niedersachsen Aurich 53.470839 7.484831
polizei_badn Polizei Bad Nenndorf Polizei Niedersachsen Bad Nenndorf 52.336191 9.374258
polizeibayern Polizei Bayern Polizei Bayern München 48.135125 11.581981
polizeibb Polizei Brandenburg Polizei Brandenburg Potsdam 52.390569 13.064473
polizeibb_e PolizeiBrandenburg_E Polizei Brandenburg Potsdam 52.390569 13.064473
polizei_bbg Polizei Bückeburg Polizei Niedersachsen Bückeburg 52.259276 9.052123
polizeiberlin Polizei Berlin Polizei Berlin Berlin 52.520007 13.404954
polizeiberlin_e Polizei Berlin Einsatz Polizei Berlin Berlin 52.520007 13.404954
polizeibhv Polizei Bremerhaven Polizei Bremen Bremerhaven 53.539584 8.580942
polizei_bs Polizei Braunschweig Polizei Niedersachsen Braunschweig 52.268874 10.526770
polizei_ce Polizei Celle Polizei Niedersachsen Celle 52.617596 10.062852
polizei_clp_vec Polizei Cloppenburg/Vechta Polizei Niedersachsen Cloppenburg 52.844198 8.053016
polizei_cux Polizei Cuxhaven Polizei Niedersachsen Cuxhaven 53.859336 8.687906
polizei_del Polizei Delmenhorst/Oldenburg-Land/Wesermarsch Polizei Niedersachsen Delmenhorst 53.052189 8.635593
polizei_dero Polizei Dessau-Roßlau Polizei Sachsen-Anhalt Dessau-Roßlau 51.842828 12.230393
polizei_dh Polizei Diepholz Polizei Niedersachsen Diepholz 52.605646 8.370788
polizei_el Polizei Emsland Polizei Niedersachsen Lingen 52.540308 7.329286
polizei_ffm Polizei Frankfurt Polizei Hessen Frankfurt am Main 50.110922 8.682127
polizeifr Polizei Freiburg Polizei Baden-Württemberg Freiburg 47.999008 7.842104
polizei_ft Polizei Frankenthal Polizei Rheinland-Pfalz Frankenthal 49.533333 8.350000
polizei_ger Polizei Germersheim Polizei Rheinland-Pfalz Germersheim 49.214024 8.366815
polizei_gf Polizei Gifhorn Polizei Niedersachsen Gifhorn 52.480909 10.550783
polizei_goe Polizei Göttingen Polizei Niedersachsen Göttingen 51.541280 9.915804
polizei_gs Polizei Goslar Polizei Niedersachsen Goslar 51.905953 10.428996
polizei_h Polizei Hannover Polizei Niedersachsen Hannover 52.3744779 9.7385532
polizei_hal Polizei Halle (Saale) Polizei Sachsen-Anhalt Halle (Saale) 51.4825041 11.9705452
polizeihamburg Polizei Hamburg Polizei Hamburg Hamburg 53.550341 10.000654
polizei_hi Polizei Hildesheim Polizei Niedersachsen Hildesheim 52.1521636 9.9513046
polizei_hk Polizei Heidekreis Polizei Niedersachsen Soltau 52.9859666 9.8433909
polizei_hm Polizei Hameln Polizei Niedersachsen Hameln-Pyrmont 52.0895789 9.3875409
polizeihn Polizei Heilbronn Polizei Baden-Württemberg Heilbronn 49.142291 9.218655
polizei_hol Polizei Holzminden Polizei Niedersachsen Holzminden 51.828835 9.4466591
polizei_hst Polizei Stralsund Polizei Mecklenburg-Vorpommern Stralsund 54.3096314 13.0820846
polizei_ka Polizei Karlsruhe Polizei Baden-Württemberg Karlsruhe 49.0068705 8.4034195
polizei_kl Polizei Kaiserslautern Polizei Rheinland-Pfalz Kaiserslautern 49.4432174 7.7689951
polizei_ko Polizei Koblenz Polizei Rheinland-Pfalz Koblenz 50.3533278 7.5943951
polizeikonstanz Polizei Konstanz Polizei Baden-Württemberg Konstanz 47.659216 9.1750718
polizeilb Polizei Ludwigsburg Polizei Baden-Württemberg Ludwigsburg 48.8953937 9.1895147
polizei_ler_emd Polizei Leer / Emden Polizei Niedersachsen Leer 53.2327625 7.4577265
polizei_lg Polizei Lüneburg Polizei Niedersachsen Lüneburg 53.248706 10.407855
polizeimainz Polizei Mainz Polizei Rheinland-Pfalz Mainz 50.0012314 8.2762513
polizeimannheim Polizei Mannheim Polizei Baden-Württemberg Mannheim 49.4892913 8.4673098
polizei_md Polizei Magdeburg Polizei Sachsen-Anhalt Magdeburg 52.1315889 11.6399609
polizeimfr Polizei Mittelfranken Polizei Bayern Nürnberg 49.453872 11.077298
polizei_mh Polizei Mittelhessen Polizei Hessen Gießen 50.5862066 8.6742306
polizei_mse Polizei Mecklenburgische Seenplatte Polizei Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
polizeimuenchen Polizei München Polizei Bayern München 48.135125 11.581981
polizeinb Polizei Niederbayern Polizei Bayern Straubing 48.8819801 12.569716
polizei_nbg Polizei Nienburg Polizei Niedersachsen Nienburg (Weser) 52.6487602 9.2578105
polizeineustadt Polizei Neustadt Polizei Rheinland-Pfalz Neustadt an der Weinstraße 49.3539802 8.1350021
polizei_nh Polizei Nordhessen Polizei Hessen Kassel 51.3154546 9.4924096
polizeini_lka LKA Niedersachsen Landeskriminalamt Niedersachsen Hannover 52.3744779 9.7385532
polizei_nom Polizei Northeim Polizei Niedersachsen Northeim 51.705401 9.9972782
polizei_nrw_ac Polizei NRW AC Polizei Nordrhein-Westfalen Aachen 50.776351 6.083862
polizei_nrw_bi Polizei NRW BI Polizei Nordrhein-Westfalen Bielefeld 52.0191005 8.531007
polizei_nrw_bn Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 7.10066
polizei_nrw_bo Polizei NRW BO Polizei Nordrhein-Westfalen Bochum 51.4818111 7.2196635
polizei_nrw_bor Polizei NRW BOR Polizei Nordrhein-Westfalen Borken 51.8443183 6.8582247
polizei_nrw_coe Polizei NRW COE Polizei Nordrhein-Westfalen Coesfeld 51.9458943 7.1691108
polizei_nrw_d Polizei NRW D Polizei Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
polizei_nrw_dn Polizei NRW DN Polizei Nordrhein-Westfalen Düren 50.8031684 6.4820806
polizei_nrw_do Polizei NRW DO Polizei Nordrhein-Westfalen Dortmund 51.5142273 7.4652789
polizei_nrw_du Polizei NRW DU Polizei Nordrhein-Westfalen Duisburg 51.434999 6.759562
polizei_nrw_e Polizei NRW E Polizei Nordrhein-Westfalen Essen 51.4582235 7.0158171
polizei_nrw_en Polizei NRW EN Polizei Nordrhein-Westfalen Ennepe-Ruhr-Kreis 51.3481444 7.3351844
polizei_nrw_eu Polizei NRW EU Polizei Nordrhein-Westfalen Euskirchen 50.6612623 6.7871219
polizei_nrw_ge Polizei NRW GE Polizei Nordrhein-Westfalen Gelsenkirchen 51.5110321 7.0960124
polizei_nrw_gm Polizei NRW GM Polizei Nordrhein-Westfalen Gummersbach 51.0277658 7.5630545
polizei_nrw_gt Polizei NRW GT Polizei Nordrhein-Westfalen Gütersloh 51.9063997 8.3782078
polizei_nrw_ha Polizei NRW HA Polizei Nordrhein-Westfalen Hagen 51.3582945 7.473296
polizei_nrw_ham Polizei NRW HAM Polizei Nordrhein-Westfalen Hamm 51.6804093 7.815197
polizei_nrw_hf Polizei NRW HF Polizei Nordrhein-Westfalen Herford 52.1152245 8.6711118
polizei_nrw_hs Polizei NRW HS Polizei Nordrhein-Westfalen Heinsberg 51.0654268 6.0984461
polizei_nrw_hsk Polizei NRW HSK Polizei Nordrhein-Westfalen Hochsauerlandkreis 51.3208247 8.2684925
polizei_nrw_hx Polizei NRW HX Polizei Nordrhein-Westfalen Höxter 51.7747369 9.3816877
polizei_nrw_k Polizei NRW K Polizei Nordrhein-Westfalen Köln 50.938361 6.959974
polizei_nrw_kle Polizei NRW KLE Polizei Nordrhein-Westfalen Kleve 51.7854839 6.1313674
polizei_nrw_kr Polizei NRW KR Polizei Nordrhein-Westfalen Krefeld 51.3331205 6.5623343
polizei_nrw_lip Polizei NRW LIP Polizei Nordrhein-Westfalen Detmold 51.936284 8.8791526
polizei_nrw_lka Polizei NRW LKA Landeskriminalamt Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
polizei_nrw_me polizei_nrw_me Polizei Nordrhein-Westfalen Mettmann 51.2527778 6.9777778
polizei_nrw_mg Polizei NRW MG Polizei Nordrhein-Westfalen Mönchengladbach 51.1946983 6.4353641
polizei_nrw_mi Polizei NRW MI Polizei Nordrhein-Westfalen Minden 52.2881045 8.9168852
polizei_nrw_mk Polizei NRW MK Polizei Nordrhein-Westfalen Märkischer Kreis 51.2734857 7.7274266
polizei_nrw_ms Polizei NRW MS Polizei Nordrhein-Westfalen Münster 51.9625101 7.6251879
polizei_nrw_ob Polizei NRW OB Polizei Nordrhein-Westfalen Oberhausen 51.4696137 6.8514435
polizei_nrw_oe Polizei NRW OE Polizei Nordrhein-Westfalen Olpe 51.0297603 7.8424193
polizei_nrw_pb Polizei NRW PB Polizei Nordrhein-Westfalen Paderborn 51.7189596 8.7648698
polizei_nrw_rbk Polizei NRW RBK Polizei Nordrhein-Westfalen Rheinisch-Bergischer-Kreis 51.0139774 7.1715584
polizei_nrw_re Polizei NRW RE Polizei Nordrhein-Westfalen Recklinghausen 51.6143815 7.1978546
polizei_nrw_rek Polizei NRW REK Polizei Nordrhein-Westfalen Rhein-Erft-Kreis 50.90334 6.763334
polizei_nrw_rkn Polizei NRW RKN Polizei Nordrhein-Westfalen Rhein-Kreis Neuss 51.1758799 6.6600606
polizei_nrw_si Polizei NRW SI Polizei Nordrhein-Westfalen Siegen-Wittgenstein 50.97444 8.23972
polizei_nrw_so Polizei NRW SO Polizei Nordrhein-Westfalen Soest 51.5725501 8.1061259
polizei_nrw_st Polizei NRW ST Polizei Nordrhein-Westfalen Steinfurt 52.1294289 7.3903454
polizei_nrw_su Polizei NRW SU Polizei Nordrhein-Westfalen Rhein-Sieg-Kreis 50.7527986 7.3813038
polizei_nrw_un Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 7.689014
polizei_nrw_vie Polizei NRW VIE Polizei Nordrhein-Westfalen Viersen 51.2562118 6.3905476
polizei_nrw_w Polizei NRW W Polizei Nordrhein-Westfalen Wuppertal 51.264018 7.1780374
polizei_nrw_waf Polizei NRW WAF Polizei Nordrhein-Westfalen Warendorf 51.9532449 7.9912335
polizei_nrw_wes Polizei NRW WES Polizei Nordrhein-Westfalen Wesel 51.6576909 6.617087
polizeiobn Polizei Oberbayern N Polizei Bayern Ingolstadt 48.7630165 11.4250395
polizeiobs PolizeiOberbayernSüd Polizei Bayern Rosenheim 47.8539273 12.127262
polizeiofr Polizei Oberfranken Polizei Bayern Oberfranken 50.0553084 11.5455233
polizeiog Polizei Offenburg Polizei Baden-Württemberg Offenburg 48.4716556 7.944378
polizei_oh Polizei Osthessen Polizei Hessen Fulda 50.5521486 9.676511
polizei_oha Polizei Osterode Polizei Niedersachsen Osterode am Harz 51.72784 10.2508204
polizei_ol Polizei Oldenburg-Stadt/Ammerland Polizei Niedersachsen Oldenburg 53.1389753 8.2146017
polizeiopf Polizei Oberpfalz Polizei Bayern Regensburg 49.0195333 12.0974869
polizei_os Polizei Osnabrück Polizei Niedersachsen Osnabrück 52.266837 8.049741
polizei_pf Polizei Pforzheim Polizei Baden-Württemberg Pforzheim 48.8908846 8.7029532
polizei_pp_nb Polizeipräsidium NB Polizeipräsidium Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
polizei_pp_ros Polizeipräsidium Rostock Polizeipräsidium Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
polizei_ps Polizei Pirmasens Polizei Rheinland-Pfalz Pirmasens 49.1996961 7.6087847
polizei_rostock Polizei Rostock Polizei Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
polizei_row Polizei Rotenburg Polizei Niedersachsen Rotenburg (Wümme) 53.2520924 9.3151133
polizeirt Polizei Reutlingen Polizei Baden-Württemberg Reutlingen 48.4919508 9.2114144
polizeirv Polizei Ravensburg Polizei Baden-Württemberg Ravensburg 47.7811014 9.612468
polizeisaarland Polizei Saarland Polizei Saarland Saarbrücken 49.234362 6.996379
polizeisachsen Polizei Sachsen Polizei Sachsen Dresden 51.0493286 13.7381437
polizei_sdl Polizei Stendal Polizei Sachsen-Anhalt Stendal 52.6050782 11.8594279
polizei_sn Polizei Schwerin Polizei Mecklenburg-Vorpommern Schwerin 53.6288297 11.4148038
polizei_soh Polizei Südosthessen Polizei Hessen Offenbach am Main 50.1055002 8.7610698
polizei_std Polizei Stade Polizei Niedersachsen Stade 53.599794 9.475438
polizei_sth Polizei Stadthagen Polizei Niedersachsen Stadthagen 52.3289688 9.2053496
polizei_suedhe Polizei Südhessen Polizei Hessen Darmstadt 49.872775 8.651177
polizeiswn Polizei Schwaben Nord Polizei Bayern Augsburg 48.3668041 10.8986971
polizeisws Polizei Schwaben S/W Polizei Bayern Kempten (Allgäu) 47.7267063 10.3168835
polizei_sz Polizei SZ / PE / WF Polizei Niedersachsen Salzgitter 52.1503721 10.3593147
polizei_thuer Polizei Thüringen Polizei Thüringen Erfurt 50.9777974 11.0287364
polizeitrier Polizei Trier Polizei Rheinland-Pfalz Trier 49.7596208 6.6441878
polizeiufr Polizei Unterfranken Polizei Bayern Würzburg 49.79245 9.932966
polizeiul Polizei Ulm Polizei Baden-Württemberg Ulm 48.3974003 9.9934336
polizei_ver_ohz Polizei Verden/Osterholz Polizei Niedersachsen Verden 52.922341 9.228153
polizeivg Polizei Vorpommern-Greifswald Polizei Mecklenburg-Vorpommern Anklam 53.8560526 13.688091
polizei_wh Polizei Westhessen Polizei Hessen Wiesbaden 50.0820384 8.2416556
polizei_whv_fri Polizei Wilhelmshaven/Friesland Polizei Niedersachsen Wilhelmshaven 53.5278793 8.106301
polizeiwittlich Polizei Wittlich Polizei Rheinland-Pfalz Wittlich 49.9850353 6.88844
polizei_wl Polizei LK Harburg Polizei Niedersachsen Harburg 53.3172237 9.9084936
polizei_wob Polizei Wolfsburg Polizei Niedersachsen Wolfsburg 52.4205588 10.7861682
polizei_zpd_ni Polizei ZPD NI Polizei Niedersachsen Hannover 52.3744779 9.7385532
pp_rheinpfalz Polizei Rheinpfalz Polizei Rheinland-Pfalz Ludwigshafen am Rhein 49.4704113 8.4381568
pp_stuttgart Polizei Stuttgart Polizei Baden-Württemberg Stuttgart 48.7784485 9.1800132
sh_polizei Polizei SH Polizei Schleswig-Holstein Kiel 54.3227085 10.135555
1 Polizei Account Name Typ Bundesland Stadt LAT LONG
2 bpol_11 Bundespolizei Spezialkräfte Bundespolizei - - -
3 bpol_bepo Bundesbereitschaftspolizei Bundesbereitschaftspolizei - - - -
4 bpol_air_fra Bundespolizei Flughafen Frankfurt am Main Bundespolizei Hessen Frankfurt am Main 50.1109221 8.6821267
5 bpol_b Bundespolizei Berlin Bundespolizei Berlin Berlin 52.520007 13.404954
6 bpol_b_einsatz Bundespolizei Berlin Einsatz Bundespolizei Berlin Berlin 52.520007 13.404954
7 bpol_bw Bundespolizei Baden-Württemberg Bundespolizei Baden-Württemberg Böblingen 48.681331 9.008830
8 bpol_by Bundespolizei Bayern Bundespolizei Bayern München 48.135125 11.581981
9 bpol_koblenz Bundespolizei Koblenz Bundespolizei Rheinland-Pfalz Koblenz 50.356943 7.588996
10 bpol_kueste Bundespolizei Küste Bundespolizei Schleswig-Holstein Bad Bramstedt 53.919582 9.882173
11 bpol_nord Bundespolizei Nord Bundespolizei Niedersachsen Hannover 52.3744779 9.7385532
12 bpol_nrw Bundespolizei NRW Bundespolizei Nordrhein-Westfalen Sankt Augustin 50.769478 7.187579
13 bpol_pir Bundespolizei Mitteldeutschland Bundespolizei Sachsen-Anhalt Magdeburg 52.120533 11.627624
14 bremenpolizei Polizei Bremen Polizei Bremen Bremen 53.079296 8.801694
15 lkabawue Landeskriminalamt BW Landeskriminalamt Baden-Württemberg Stuttgart 48.775846 9.182932
16 lka_bayern Bayerisches Landeskriminalamt Landeskriminalamt Bayern München 48.135125 11.581981
17 lka_hessen Hessisches Landeskriminalamt Landeskriminalamt Hessen Wiesbaden 50.0820384 8.2416556
18 lka_rlp Landeskriminalamt Rheinland-Pfalz Landeskriminalamt Rheinland-Pfalz Mainz 49.992862 8.247253
19 pol_grafschaft Polizei Grf Bentheim Polizei Niedersachsen Nordhorn 52.429580 7.068571
20 polizeiaalen Polizei Aalen Polizei Baden-Württemberg Aalen 48.836689 10.097116
21 polizei_aur_wtm Polizei Aurich / WTM Polizei Niedersachsen Aurich 53.470839 7.484831
22 polizei_badn Polizei Bad Nenndorf Polizei Niedersachsen Bad Nenndorf 52.336191 9.374258
23 polizeibayern Polizei Bayern Polizei Bayern München 48.135125 11.581981
24 polizeibb Polizei Brandenburg Polizei Brandenburg Potsdam 52.390569 13.064473
25 polizeibb_e PolizeiBrandenburg_E Polizei Brandenburg Potsdam 52.390569 13.064473
26 polizei_bbg Polizei Bückeburg Polizei Niedersachsen Bückeburg 52.259276 9.052123
27 polizeiberlin Polizei Berlin Polizei Berlin Berlin 52.520007 13.404954
28 polizeiberlin_e Polizei Berlin Einsatz Polizei Berlin Berlin 52.520007 13.404954
29 polizeibhv Polizei Bremerhaven Polizei Bremen Bremerhaven 53.539584 8.580942
30 polizei_bs Polizei Braunschweig Polizei Niedersachsen Braunschweig 52.268874 10.526770
31 polizei_ce Polizei Celle Polizei Niedersachsen Celle 52.617596 10.062852
32 polizei_clp_vec Polizei Cloppenburg/Vechta Polizei Niedersachsen Cloppenburg 52.844198 8.053016
33 polizei_cux Polizei Cuxhaven Polizei Niedersachsen Cuxhaven 53.859336 8.687906
34 polizei_del Polizei Delmenhorst/Oldenburg-Land/Wesermarsch Polizei Niedersachsen Delmenhorst 53.052189 8.635593
35 polizei_dero Polizei Dessau-Roßlau Polizei Sachsen-Anhalt Dessau-Roßlau 51.842828 12.230393
36 polizei_dh Polizei Diepholz Polizei Niedersachsen Diepholz 52.605646 8.370788
37 polizei_el Polizei Emsland Polizei Niedersachsen Lingen 52.540308 7.329286
38 polizei_ffm Polizei Frankfurt Polizei Hessen Frankfurt am Main 50.110922 8.682127
39 polizeifr Polizei Freiburg Polizei Baden-Württemberg Freiburg 47.999008 7.842104
40 polizei_ft Polizei Frankenthal Polizei Rheinland-Pfalz Frankenthal 49.533333 8.350000
41 polizei_ger Polizei Germersheim Polizei Rheinland-Pfalz Germersheim 49.214024 8.366815
42 polizei_gf Polizei Gifhorn Polizei Niedersachsen Gifhorn 52.480909 10.550783
43 polizei_goe Polizei Göttingen Polizei Niedersachsen Göttingen 51.541280 9.915804
44 polizei_gs Polizei Goslar Polizei Niedersachsen Goslar 51.905953 10.428996
45 polizei_h Polizei Hannover Polizei Niedersachsen Hannover 52.3744779 9.7385532
46 polizei_hal Polizei Halle (Saale) Polizei Sachsen-Anhalt Halle (Saale) 51.4825041 11.9705452
47 polizeihamburg Polizei Hamburg Polizei Hamburg Hamburg 53.550341 10.000654
48 polizei_hi Polizei Hildesheim Polizei Niedersachsen Hildesheim 52.1521636 9.9513046
49 polizei_hk Polizei Heidekreis Polizei Niedersachsen Soltau 52.9859666 9.8433909
50 polizei_hm Polizei Hameln Polizei Niedersachsen Hameln-Pyrmont 52.0895789 9.3875409
51 polizeihn Polizei Heilbronn Polizei Baden-Württemberg Heilbronn 49.142291 9.218655
52 polizei_hol Polizei Holzminden Polizei Niedersachsen Holzminden 51.828835 9.4466591
53 polizei_hst Polizei Stralsund Polizei Mecklenburg-Vorpommern Stralsund 54.3096314 13.0820846
54 polizei_ka Polizei Karlsruhe Polizei Baden-Württemberg Karlsruhe 49.0068705 8.4034195
55 polizei_kl Polizei Kaiserslautern Polizei Rheinland-Pfalz Kaiserslautern 49.4432174 7.7689951
56 polizei_ko Polizei Koblenz Polizei Rheinland-Pfalz Koblenz 50.3533278 7.5943951
57 polizeikonstanz Polizei Konstanz Polizei Baden-Württemberg Konstanz 47.659216 9.1750718
58 polizeilb Polizei Ludwigsburg Polizei Baden-Württemberg Ludwigsburg 48.8953937 9.1895147
59 polizei_ler_emd Polizei Leer / Emden Polizei Niedersachsen Leer 53.2327625 7.4577265
60 polizei_lg Polizei Lüneburg Polizei Niedersachsen Lüneburg 53.248706 10.407855
61 polizeimainz Polizei Mainz Polizei Rheinland-Pfalz Mainz 50.0012314 8.2762513
62 polizeimannheim Polizei Mannheim Polizei Baden-Württemberg Mannheim 49.4892913 8.4673098
63 polizei_md Polizei Magdeburg Polizei Sachsen-Anhalt Magdeburg 52.1315889 11.6399609
64 polizeimfr Polizei Mittelfranken Polizei Bayern Nürnberg 49.453872 11.077298
65 polizei_mh Polizei Mittelhessen Polizei Hessen Gießen 50.5862066 8.6742306
66 polizei_mse Polizei Mecklenburgische Seenplatte Polizei Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
67 polizeimuenchen Polizei München Polizei Bayern München 48.135125 11.581981
68 polizeinb Polizei Niederbayern Polizei Bayern Straubing 48.8819801 12.569716
69 polizei_nbg Polizei Nienburg Polizei Niedersachsen Nienburg (Weser) 52.6487602 9.2578105
70 polizeineustadt Polizei Neustadt Polizei Rheinland-Pfalz Neustadt an der Weinstraße 49.3539802 8.1350021
71 polizei_nh Polizei Nordhessen Polizei Hessen Kassel 51.3154546 9.4924096
72 polizeini_lka LKA Niedersachsen Landeskriminalamt Niedersachsen Hannover 52.3744779 9.7385532
73 polizei_nom Polizei Northeim Polizei Niedersachsen Northeim 51.705401 9.9972782
74 polizei_nrw_ac Polizei NRW AC Polizei Nordrhein-Westfalen Aachen 50.776351 6.083862
75 polizei_nrw_bi Polizei NRW BI Polizei Nordrhein-Westfalen Bielefeld 52.0191005 8.531007
76 polizei_nrw_bn Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 7.10066
77 polizei_nrw_bo Polizei NRW BO Polizei Nordrhein-Westfalen Bochum 51.4818111 7.2196635
78 polizei_nrw_bor Polizei NRW BOR Polizei Nordrhein-Westfalen Borken 51.8443183 6.8582247
79 polizei_nrw_coe Polizei NRW COE Polizei Nordrhein-Westfalen Coesfeld 51.9458943 7.1691108
80 polizei_nrw_d Polizei NRW D Polizei Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
81 polizei_nrw_dn Polizei NRW DN Polizei Nordrhein-Westfalen Düren 50.8031684 6.4820806
82 polizei_nrw_do Polizei NRW DO Polizei Nordrhein-Westfalen Dortmund 51.5142273 7.4652789
83 polizei_nrw_du Polizei NRW DU Polizei Nordrhein-Westfalen Duisburg 51.434999 6.759562
84 polizei_nrw_e Polizei NRW E Polizei Nordrhein-Westfalen Essen 51.4582235 7.0158171
85 polizei_nrw_en Polizei NRW EN Polizei Nordrhein-Westfalen Ennepe-Ruhr-Kreis 51.3481444 7.3351844
86 polizei_nrw_eu Polizei NRW EU Polizei Nordrhein-Westfalen Euskirchen 50.6612623 6.7871219
87 polizei_nrw_ge Polizei NRW GE Polizei Nordrhein-Westfalen Gelsenkirchen 51.5110321 7.0960124
88 polizei_nrw_gm Polizei NRW GM Polizei Nordrhein-Westfalen Gummersbach 51.0277658 7.5630545
89 polizei_nrw_gt Polizei NRW GT Polizei Nordrhein-Westfalen Gütersloh 51.9063997 8.3782078
90 polizei_nrw_ha Polizei NRW HA Polizei Nordrhein-Westfalen Hagen 51.3582945 7.473296
91 polizei_nrw_ham Polizei NRW HAM Polizei Nordrhein-Westfalen Hamm 51.6804093 7.815197
92 polizei_nrw_hf Polizei NRW HF Polizei Nordrhein-Westfalen Herford 52.1152245 8.6711118
93 polizei_nrw_hs Polizei NRW HS Polizei Nordrhein-Westfalen Heinsberg 51.0654268 6.0984461
94 polizei_nrw_hsk Polizei NRW HSK Polizei Nordrhein-Westfalen Hochsauerlandkreis 51.3208247 8.2684925
95 polizei_nrw_hx Polizei NRW HX Polizei Nordrhein-Westfalen Höxter 51.7747369 9.3816877
96 polizei_nrw_k Polizei NRW K Polizei Nordrhein-Westfalen Köln 50.938361 6.959974
97 polizei_nrw_kle Polizei NRW KLE Polizei Nordrhein-Westfalen Kleve 51.7854839 6.1313674
98 polizei_nrw_kr Polizei NRW KR Polizei Nordrhein-Westfalen Krefeld 51.3331205 6.5623343
99 polizei_nrw_lip Polizei NRW LIP Polizei Nordrhein-Westfalen Detmold 51.936284 8.8791526
100 polizei_nrw_lka Polizei NRW LKA Landeskriminalamt Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
101 polizei_nrw_me polizei_nrw_me Polizei Nordrhein-Westfalen Mettmann 51.2527778 6.9777778
102 polizei_nrw_mg Polizei NRW MG Polizei Nordrhein-Westfalen Mönchengladbach 51.1946983 6.4353641
103 polizei_nrw_mi Polizei NRW MI Polizei Nordrhein-Westfalen Minden 52.2881045 8.9168852
104 polizei_nrw_mk Polizei NRW MK Polizei Nordrhein-Westfalen Märkischer Kreis 51.2734857 7.7274266
105 polizei_nrw_ms Polizei NRW MS Polizei Nordrhein-Westfalen Münster 51.9625101 7.6251879
106 polizei_nrw_ob Polizei NRW OB Polizei Nordrhein-Westfalen Oberhausen 51.4696137 6.8514435
107 polizei_nrw_oe Polizei NRW OE Polizei Nordrhein-Westfalen Olpe 51.0297603 7.8424193
108 polizei_nrw_pb Polizei NRW PB Polizei Nordrhein-Westfalen Paderborn 51.7189596 8.7648698
109 polizei_nrw_rbk Polizei NRW RBK Polizei Nordrhein-Westfalen Rheinisch-Bergischer-Kreis 51.0139774 7.1715584
110 polizei_nrw_re Polizei NRW RE Polizei Nordrhein-Westfalen Recklinghausen 51.6143815 7.1978546
111 polizei_nrw_rek Polizei NRW REK Polizei Nordrhein-Westfalen Rhein-Erft-Kreis 50.90334 6.763334
112 polizei_nrw_rkn Polizei NRW RKN Polizei Nordrhein-Westfalen Rhein-Kreis Neuss 51.1758799 6.6600606
113 polizei_nrw_si Polizei NRW SI Polizei Nordrhein-Westfalen Siegen-Wittgenstein 50.97444 8.23972
114 polizei_nrw_so Polizei NRW SO Polizei Nordrhein-Westfalen Soest 51.5725501 8.1061259
115 polizei_nrw_st Polizei NRW ST Polizei Nordrhein-Westfalen Steinfurt 52.1294289 7.3903454
116 polizei_nrw_su Polizei NRW SU Polizei Nordrhein-Westfalen Rhein-Sieg-Kreis 50.7527986 7.3813038
117 polizei_nrw_un Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 7.689014
118 polizei_nrw_vie Polizei NRW VIE Polizei Nordrhein-Westfalen Viersen 51.2562118 6.3905476
119 polizei_nrw_w Polizei NRW W Polizei Nordrhein-Westfalen Wuppertal 51.264018 7.1780374
120 polizei_nrw_waf Polizei NRW WAF Polizei Nordrhein-Westfalen Warendorf 51.9532449 7.9912335
121 polizei_nrw_wes Polizei NRW WES Polizei Nordrhein-Westfalen Wesel 51.6576909 6.617087
122 polizeiobn Polizei Oberbayern N Polizei Bayern Ingolstadt 48.7630165 11.4250395
123 polizeiobs PolizeiOberbayernSüd Polizei Bayern Rosenheim 47.8539273 12.127262
124 polizeiofr Polizei Oberfranken Polizei Bayern Oberfranken 50.0553084 11.5455233
125 polizeiog Polizei Offenburg Polizei Baden-Württemberg Offenburg 48.4716556 7.944378
126 polizei_oh Polizei Osthessen Polizei Hessen Fulda 50.5521486 9.676511
127 polizei_oha Polizei Osterode Polizei Niedersachsen Osterode am Harz 51.72784 10.2508204
128 polizei_ol Polizei Oldenburg-Stadt/Ammerland Polizei Niedersachsen Oldenburg 53.1389753 8.2146017
129 polizeiopf Polizei Oberpfalz Polizei Bayern Regensburg 49.0195333 12.0974869
130 polizei_os Polizei Osnabrück Polizei Niedersachsen Osnabrück 52.266837 8.049741
131 polizei_pf Polizei Pforzheim Polizei Baden-Württemberg Pforzheim 48.8908846 8.7029532
132 polizei_pp_nb Polizeipräsidium NB Polizeipräsidium Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
133 polizei_pp_ros Polizeipräsidium Rostock Polizeipräsidium Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
134 polizei_ps Polizei Pirmasens Polizei Rheinland-Pfalz Pirmasens 49.1996961 7.6087847
135 polizei_rostock Polizei Rostock Polizei Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
136 polizei_row Polizei Rotenburg Polizei Niedersachsen Rotenburg (Wümme) 53.2520924 9.3151133
137 polizeirt Polizei Reutlingen Polizei Baden-Württemberg Reutlingen 48.4919508 9.2114144
138 polizeirv Polizei Ravensburg Polizei Baden-Württemberg Ravensburg 47.7811014 9.612468
139 polizeisaarland Polizei Saarland Polizei Saarland Saarbrücken 49.234362 6.996379
140 polizeisachsen Polizei Sachsen Polizei Sachsen Dresden 51.0493286 13.7381437
141 polizei_sdl Polizei Stendal Polizei Sachsen-Anhalt Stendal 52.6050782 11.8594279
142 polizei_sn Polizei Schwerin Polizei Mecklenburg-Vorpommern Schwerin 53.6288297 11.4148038
143 polizei_soh Polizei Südosthessen Polizei Hessen Offenbach am Main 50.1055002 8.7610698
144 polizei_std Polizei Stade Polizei Niedersachsen Stade 53.599794 9.475438
145 polizei_sth Polizei Stadthagen Polizei Niedersachsen Stadthagen 52.3289688 9.2053496
146 polizei_suedhe Polizei Südhessen Polizei Hessen Darmstadt 49.872775 8.651177
147 polizeiswn Polizei Schwaben Nord Polizei Bayern Augsburg 48.3668041 10.8986971
148 polizeisws Polizei Schwaben S/W Polizei Bayern Kempten (Allgäu) 47.7267063 10.3168835
149 polizei_sz Polizei SZ / PE / WF Polizei Niedersachsen Salzgitter 52.1503721 10.3593147
150 polizei_thuer Polizei Thüringen Polizei Thüringen Erfurt 50.9777974 11.0287364
151 polizeitrier Polizei Trier Polizei Rheinland-Pfalz Trier 49.7596208 6.6441878
152 polizeiufr Polizei Unterfranken Polizei Bayern Würzburg 49.79245 9.932966
153 polizeiul Polizei Ulm Polizei Baden-Württemberg Ulm 48.3974003 9.9934336
154 polizei_ver_ohz Polizei Verden/Osterholz Polizei Niedersachsen Verden 52.922341 9.228153
155 polizeivg Polizei Vorpommern-Greifswald Polizei Mecklenburg-Vorpommern Anklam 53.8560526 13.688091
156 polizei_wh Polizei Westhessen Polizei Hessen Wiesbaden 50.0820384 8.2416556
157 polizei_whv_fri Polizei Wilhelmshaven/Friesland Polizei Niedersachsen Wilhelmshaven 53.5278793 8.106301
158 polizeiwittlich Polizei Wittlich Polizei Rheinland-Pfalz Wittlich 49.9850353 6.88844
159 polizei_wl Polizei LK Harburg Polizei Niedersachsen Harburg 53.3172237 9.9084936
160 polizei_wob Polizei Wolfsburg Polizei Niedersachsen Wolfsburg 52.4205588 10.7861682
161 polizei_zpd_ni Polizei ZPD NI Polizei Niedersachsen Hannover 52.3744779 9.7385532
162 pp_rheinpfalz Polizei Rheinpfalz Polizei Rheinland-Pfalz Ludwigshafen am Rhein 49.4704113 8.4381568
163 pp_stuttgart Polizei Stuttgart Polizei Baden-Württemberg Stuttgart 48.7784485 9.1800132
164 sh_polizei Polizei SH Polizei Schleswig-Holstein Kiel 54.3227085 10.135555

View file

@ -0,0 +1,39 @@
import pandas as pd
import spacy
from string import punctuation
from tqdm import tqdm
# Enable tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()

# Tweet table, loaded once at import time.
# The path uses a forward slash so that it resolves on Windows *and* on POSIX
# systems; with a backslash the whole string is treated as a single file name
# on Linux/macOS and the file is never found.
tw_tweets = pd.read_csv('data/copbird_table_tweet_ext_state.csv')

# German spaCy pipeline; clean_tweet() relies on this module-level object.
nlp = spacy.load('de_core_news_lg')
def clean_tweet(txt):
    """Lemmatise a text and drop punctuation and stop words.

    The text is processed with the module-level spaCy pipeline ``nlp``. Tokens
    that are stop words or consist only of punctuation are discarded; for every
    remaining token its lemma is kept.

    Args:
        txt: Raw text to clean.

    Returns:
        str: The lemmas of the kept tokens, joined by single spaces.
    """
    def _is_punctuation(token):
        # The previous check `token.text in punctuation` was a *substring*
        # test against string.punctuation, so tokens such as "..." or "!!"
        # and non-ASCII marks such as „ “ – were not recognised. Combine
        # spaCy's own flag with a per-character test; this still drops
        # everything the substring test dropped (including ASCII symbols
        # like "<=>" that spaCy does not classify as punctuation).
        return token.is_punct or all(char in punctuation for char in token.text)

    return ' '.join(
        token.lemma_
        for token in nlp(txt)
        if not token.is_stop and not _is_punctuation(token)
    )
def get_topics_by_str_lst(topic, df, col_name, case=True):
    """Select the rows whose text column mentions at least one topic term.

    The terms are joined with ``|`` and handed to ``Series.str.contains``, so
    each term is interpreted as a regular-expression fragment and matches
    anywhere inside the text (word stems therefore work).

    Args:
        topic: Iterable of search terms (strings).
        df: DataFrame to filter.
        col_name: Name of the string column in ``df`` that is searched.
        case: If True (default, the previous behaviour) matching is
            case-sensitive. Pass False to also hit capitalised words, e.g.
            German nouns, with lowercase terms.

    Returns:
        pandas.DataFrame: The rows of ``df`` with at least one match.
    """
    pattern = '|'.join(topic)
    # na=False: missing cells count as "no match". Without it the mask
    # contains NA values and the boolean indexing below raises.
    mask = df[col_name].str.contains(pattern, case=case, na=False)
    return df[mask]
if __name__ == '__main__':
    # Search terms (mostly word stems) per topic. Only topic_3 is used below.
    topic_1 = ['demonstr', 'kundgeb']
    # NOTE(review): 'graas' looks like a typo for 'gras' - confirm before changing.
    topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']
    topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']
    topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']

    # Forward slash instead of a backslash: the relative path then resolves on
    # Windows as well as on Linux/macOS.
    # na_filter=False makes pandas read empty cells as '' rather than NaN.
    df_pm = pd.read_csv('data/2020-12_2021-05_presseportal.csv', na_filter=False)
    df_pm_col = 'content'

    # Print every press release matching a topic_3 term as a markdown table.
    print(get_topics_by_str_lst(topic=topic_3, df=df_pm, col_name=df_pm_col).to_markdown())

View file

@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "7158ac22",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pyLDAvis.sklearn\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import pandas as pd\n",
"import spacy\n",
"from multiprocess import Pool"
]
},
{
"cell_type": "markdown",
"id": "69f33a46",
"metadata": {},
"source": [
"Funktionen zur Vorverarbeitung"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1c66c06c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def filterSentencesByMinWordCount(text, minWordCount):\n",
" sentenceList = []\n",
" doc = nlp(text)\n",
" for sent in doc.sents:\n",
" wordList = []\n",
" sent.text.rstrip()\n",
" for word in sent:\n",
" wordList.append(word)\n",
" if len(wordList) >= minWordCount:\n",
" sentenceList.append(sent.text.rstrip())\n",
" return sentenceList"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3b9d084d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def extractSentences(document):\n",
" logging.debug('Extracting Sentences')\n",
" text = extractBodyContent(document)\n",
" sentenceList = filterSentencesByMinWordCount(text, 4)\n",
" return sentenceList"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7d85891e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def tokenizeSentence(doc):\n",
" logging.debug('Tokenizing')\n",
" tokenList = []\n",
" for token in doc:\n",
" childrenList = []\n",
" for child in token.children:\n",
" childToken = ScToken(child.text,\n",
" child.lemma_,\n",
" child.pos_, str(spacy.explain(child.pos_)),\n",
" child.tag_, str(spacy.explain(child.tag_)),\n",
" child.dep_, str(spacy.explain(child.dep_)),\n",
" child.shape_, child.is_alpha, child.is_stop)\n",
" childrenList.append(childToken)\n",
"\n",
" scToken = ScToken(token.text,\n",
" token.lemma_,\n",
" token.pos_, str(spacy.explain(token.pos_)),\n",
" token.tag_, str(spacy.explain(token.tag_)),\n",
" token.dep_, str(spacy.explain(token.dep_)),\n",
" token.shape_, token.is_alpha, token.is_stop,\n",
" childrenList)\n",
" tokenList.append(scToken)\n",
" return tokenList"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7564c883",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def chunkSentence(doc):\n",
" logging.debug('Chunking')\n",
" chunkList = []\n",
" for chunk in doc.noun_chunks:\n",
" scChunk = ScChunk(chunk.text, chunk.root.text,\n",
" chunk.root.dep_, chunk.root.head.text)\n",
" chunkList.append(scChunk)\n",
" return chunkList"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "5db74302",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def findEntitiesInSentence(doc):\n",
" logging.debug('Extracting Named Entities')\n",
" entityList = []\n",
" for ent in doc.ents:\n",
" entity = ScEntity(ent.text, ent.start_char, ent.end_char,\n",
" ent.label_, str(spacy.explain(ent.label_)))\n",
" entityList.append(entity)\n",
" return entityList"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b6753a90",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def fillSentences(document):\n",
" logging.info(\n",
" 'Building Sentences (Tokenizing, Chunking, Named Entity Recognition)')\n",
" sentenceList = []\n",
" sentences = extractSentences(document)\n",
" for i, sentence in enumerate(sentences):\n",
" doc = nlp(sentence)\n",
" id = i\n",
" tokens = tokenizeSentence(doc)\n",
" chunks = chunkSentence(doc)\n",
" entities = findEntitiesInSentence(doc)\n",
" scSentence = ScSentence(id, sentence, tokens, chunks, entities)\n",
" sentenceList.append(scSentence)\n",
"\n",
" return sentenceList"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9af7a5c0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def fillSentencesAsOneString(sentences):\n",
" sentencesAsOneString = str()\n",
" for sentence in sentences:\n",
" sentencesAsOneString += sentence.text\n",
" return sentencesAsOneString"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8f952c82",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def removeStopwords(text):\n",
" doc = nlp(text)\n",
" tokens = tokenizeSentence(doc)\n",
" chunksNoStopwords = [\n",
" t.text for t in tokens if (not t.isStopword)]\n",
" return \" \".join(chunksNoStopwords)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "28910141",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"ename": "NameError",
"evalue": "name 'ScToken' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-14-7209b5cec518>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mnumberOfStopwords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mScToken\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mcount\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misStopword\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mcount\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'ScToken' is not defined"
]
}
],
"source": [
"def numberOfStopwords(tokens: [ScToken]):\n",
" count = 0\n",
" for t in tokens:\n",
" if t.isStopword:\n",
" count += 1\n",
" return count"
]
},
{
"cell_type": "markdown",
"id": "ce7cc9c1",
"metadata": {},
"source": [
"## Analyse"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1adb09b7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
" tweet_csv = '../data/copbird_table_tweet.csv'"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c0936ecb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"NUM_TOPICS=10\n",
"NUM_FEATURES=1000\n",
"NUM_TOP_WORDS=25"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "dbf0281f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"def get_tweets(path, limit=None):\n",
" df_csv = pd.read_csv(path, nrows=limit, parse_dates=['created_at'],\n",
" encoding='utf-8-sig')\n",
"\n",
" df_csv.drop(columns=['created_at', 'like_count', 'retweet_count', 'reply_count', 'quote_count'], inplace=True)\n",
"\n",
" nlp = spacy.load(\"de_core_news_lg\")\n",
" nlp.Defaults.stop_words |= {\"&amp\", \"amp\"}\n",
" nlp.add_pipe('emoji', first=True)\n",
" return list(\n",
" nlp.pipe(df_csv['tweet_text'], disable=[\"tok2vec\", \"tagger\", \"parser\", \"attribute_ruler\"], n_process=-1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c39f658",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "copbird-env",
"language": "python",
"name": "copbird-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,490 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cce66876",
"metadata": {},
"source": [
"# Interface Presseportal"
]
},
{
"cell_type": "markdown",
"id": "f12d7022",
"metadata": {},
"source": [
"Das Presseportal bietet eine Plattform, bei der mittels GET-Requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...) in bestimmten Zeiträumen und in gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API."
]
},
{
"cell_type": "markdown",
"id": "b07aef9f",
"metadata": {},
"source": [
"Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`"
]
},
{
"cell_type": "markdown",
"id": "258338d0",
"metadata": {},
"source": [
"Da eine große Menge an Pressemitteilungen angefragt wird und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b07fac3c",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import calendar\n",
"import time\n",
"import os\n",
"import csv\n",
"\n",
"from tqdm.notebook import tqdm\n",
"from datetime import datetime\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"id": "0dfce15a",
"metadata": {},
"source": [
"Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6c0b30a8",
"metadata": {},
"outputs": [],
"source": [
"class Pressemitteilung:\n",
" def __init__(self, article_id, timestamp, location, text, bundesland):\n",
" self.article_id = article_id\n",
" self.timestamp = timestamp\n",
" self.location = location\n",
" self.text = text\n",
" self.bundesland=bundesland\n",
" \n",
" def __str__(self):\n",
" return f\"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}\"\n",
" \n",
" def to_row(self):\n",
" return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]"
]
},
{
"cell_type": "markdown",
"id": "63cceebe",
"metadata": {},
"source": [
"**Konstanten und Pfade**"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8bcc877f",
"metadata": {},
"outputs": [],
"source": [
"REQUEST_HEADERS = {\n",
" \"User-Agent\": (\n",
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 \"\n",
" \"(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36\"\n",
" )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c637ac38",
"metadata": {},
"outputs": [],
"source": [
"DATA_FOLDER = os.path.join(\"..\", \"data\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f094dee0",
"metadata": {},
"outputs": [],
"source": [
"BUNDESLAENDER = [\n",
" \"baden-wuerttemberg\",\n",
" \"bayern\",\n",
" \"berlin-brandenburg\",\n",
" \"bremen\",\n",
" \"hamburg\",\n",
" \"hessen\",\n",
" \"mecklenburg-vorpommern\",\n",
" \"niedersachsen\",\n",
" \"nordrhein-westfalen\",\n",
" \"rheinland-pfalz\",\n",
" \"saarland\",\n",
" \"sachsen\",\n",
" \"sachsen-anhalt\",\n",
" \"schleswig-holstein\",\n",
" \"thueringen\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "84632391",
"metadata": {},
"outputs": [],
"source": [
"def requests_get(request):\n",
" return requests.get(request, headers=REQUEST_HEADERS)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1af0bdbd",
"metadata": {},
"outputs": [],
"source": [
"def extract_response(response, bundesland=None):\n",
" \"\"\"Extrahiere aus der Response einer Request alle Pressemitteilungen\n",
" \n",
" Args:\n",
" response (:obj:`Response`)\n",
" bundesland (:obj:`str`): Kann mit angegeben, falls es in der Suche relevant war. Default = None\n",
" \n",
" Returns:\n",
" list of :obj:`Pressemitteilung`\n",
" \"\"\"\n",
" \n",
" mitteilungen = []\n",
" \n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" for article in soup.find_all('article'):\n",
" data_url = article['data-url']\n",
" article_id = '-'.join(article['data-url'].split('/')[-2:])\n",
" meta = article.find('div')\n",
" \n",
" timestamp_str = meta.find(class_=\"date\")\n",
" \n",
" if timestamp_str is not None:\n",
" timestamp_str = timestamp_str.text\n",
" timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y %H:%M')\n",
" else:\n",
" timestamp = None\n",
" \n",
" location_str = meta.find(class_=\"news-topic\")\n",
" location_str = location_str.text if location_str is not None else None\n",
" \n",
" p_texts = article.findAll('p')\n",
" if len(p_texts) > 1:\n",
" text = p_texts[1].text\n",
" else:\n",
" text = ''\n",
" \n",
" mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))\n",
" \n",
" return mitteilungen"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c62c06c9",
"metadata": {},
"outputs": [],
"source": [
"def create_get_request(*, site=1, location=None, start_date=None, end_date=None):\n",
" \"\"\"Simulation einer API: Erzeuge aus Parametern eine URL\n",
" \n",
" Args:\n",
" site (int, default=1): Aktuelle Seite, auf der man sich befinden soll. Ist in der URL in 30er Schritten angegeben\n",
" location (:obj:`str`, default=None): Bundesland bzw. Stadt\n",
" start_date (:obj:`str`, default=None)\n",
" end_date (:obj:`str`, default=None)\n",
" Returns:\n",
" str: URL\n",
" \"\"\"\n",
" url = f\"https://www.presseportal.de/blaulicht/d/polizei\"\n",
" \n",
" if location is not None:\n",
" url += f\"/l/{location}\"\n",
" \n",
" if site > 1:\n",
" url += f\"/{site*30}\"\n",
" \n",
" if start_date is not None or end_date is not None:\n",
" url += \"?\"\n",
" \n",
" if start_date is not None:\n",
" url += f\"startDate={start_date}\"\n",
" \n",
" if end_date is not None:\n",
" url += \"&\"\n",
" \n",
" if end_date is not None:\n",
" url += f\"endDate={end_date}\"\n",
" \n",
" return url"
]
},
{
"cell_type": "markdown",
"id": "1c67c9bc",
"metadata": {},
"source": [
"## Beispiel: Hamburg "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aff924d6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = create_get_request(location=\"hamburg\", site=3, start_date=\"2021-01-13\", end_date=\"2021-03-20\")\n",
"url"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6e2b9091",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n",
"[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n",
"[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n",
"[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n",
"[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n"
]
}
],
"source": [
"for mitteilung in extract_response(requests_get(url))[:5]:\n",
" print(mitteilung)"
]
},
{
"cell_type": "markdown",
"id": "e50af557",
"metadata": {},
"source": [
"## Effizientes Einlesen"
]
},
{
"cell_type": "markdown",
"id": "b4a9580a",
"metadata": {},
"source": [
"Um die Dateien sinnhaft zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1Sek / Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "da927e30",
"metadata": {},
"outputs": [],
"source": [
"def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):\n",
" \"\"\"Suche alle Meldungen für ein Bundesland zu einem konkreten Tag\"\"\"\n",
"\n",
" meldungen = []\n",
" site = 1\n",
" \n",
" start_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
" end_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
" request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)\n",
" \n",
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
" meldungen.extend(new_meldungen)\n",
" \n",
" pbar = tqdm(desc=bundesland)\n",
" while len(new_meldungen) != 0:\n",
" time.sleep(1)\n",
" site += 1\n",
" \n",
" request = create_get_request(\n",
" site=site, location=bundesland, start_date=start_date, end_date=end_date,\n",
" )\n",
" \n",
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
" meldungen.extend(new_meldungen)\n",
" pbar.update(1)\n",
" pbar.close()\n",
" \n",
" return meldungen"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "85508758",
"metadata": {},
"outputs": [],
"source": [
"def get_meldungen_for_date(year, month, day):\n",
" \"\"\"Extrahiere alle Meldungen für einen Tag\n",
" \n",
" Args:\n",
" year (int): Jahr\n",
" month (int): Monat\n",
" day (int): Tag\n",
" \"\"\"\n",
"\n",
" meldungen_dict = {}\n",
" \n",
" for bundesland in BUNDESLAENDER:\n",
" meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)\n",
" meldungen_dict[bundesland] = meldungen\n",
" \n",
" return meldungen_dict"
]
},
{
"cell_type": "markdown",
"id": "f938d8a9",
"metadata": {},
"source": [
"## Speichern der Daten in CSV-Dateien"
]
},
{
"cell_type": "markdown",
"id": "67374d3b",
"metadata": {},
"source": [
"Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. "
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "276e700d",
"metadata": {},
"outputs": [],
"source": [
"def store_meldungen_in_csv(year, month, day):\n",
" \"\"\"Speichere alle Meldungen für ein Datum in einer CSV. Im Namen der CSV steht das Datum.\"\"\"\n",
"\n",
" filename = f\"{year}-{month}-{day}_presseportal.csv\"\n",
" path = os.path.join(DATA_FOLDER, filename)\n",
" meldungen_per_bundesland = get_meldungen_for_date(year, month, day)\n",
" \n",
" with open(path, 'w', newline='', encoding='UTF8') as f:\n",
" writer = csv.writer(f)\n",
" writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])\n",
" \n",
" for bundesland, meldungen in meldungen_per_bundesland.items():\n",
" for meldung in meldungen:\n",
" writer.writerow(meldung.to_row())\n",
" \n",
" print(f\"File '{filename}' created\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c5d0bdbd",
"metadata": {},
"outputs": [],
"source": [
"def store_month(year, month):\n",
" month_end_day = calendar.monthrange(year, month)[1]\n",
" \n",
" for i in range(0, month_end_day):\n",
" store_meldungen_in_csv(year, month, i+1)"
]
},
{
"cell_type": "markdown",
"id": "d9f3e24b",
"metadata": {},
"source": [
"## Auswertung: Wie viele Einträge pro Bundesland?"
]
},
{
"cell_type": "markdown",
"id": "9f600d3c",
"metadata": {},
"source": [
"Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "b7c85078",
"metadata": {},
"outputs": [],
"source": [
"counter = {}\n",
"\n",
"for filename in os.listdir('../data/'):\n",
" if filename.endswith(\"_presseportal.csv\"):\n",
" path = '../data/' + filename\n",
" \n",
" with open(path, 'r', encoding='UTF8') as f_in:\n",
" reader = csv.reader(f_in)\n",
" next(reader)\n",
" for row in reader:\n",
" bundesland = row[3]\n",
" if bundesland not in counter:\n",
" counter[bundesland] = 1\n",
" else:\n",
" counter[bundesland] += 1\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "copbird-env",
"language": "python",
"name": "copbird-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,490 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cce66876",
"metadata": {},
"source": [
"# Interface Presseportal"
]
},
{
"cell_type": "markdown",
"id": "f12d7022",
"metadata": {},
"source": [
"Das Presseportal bietet eine Plattform, bei der mittels GET-Requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...) in bestimmten Zeiträumen und in gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API."
]
},
{
"cell_type": "markdown",
"id": "b07aef9f",
"metadata": {},
"source": [
"Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`"
]
},
{
"cell_type": "markdown",
"id": "258338d0",
"metadata": {},
"source": [
"Da eine große Menge an Pressemitteilungen angefragt wird und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b07fac3c",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import calendar\n",
"import time\n",
"import os\n",
"import csv\n",
"\n",
"from tqdm.notebook import tqdm\n",
"from datetime import datetime\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"id": "0dfce15a",
"metadata": {},
"source": [
"Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6c0b30a8",
"metadata": {},
"outputs": [],
"source": [
"class Pressemitteilung:\n",
" def __init__(self, article_id, timestamp, location, text, bundesland):\n",
" self.article_id = article_id\n",
" self.timestamp = timestamp\n",
" self.location = location\n",
" self.text = text\n",
" self.bundesland=bundesland\n",
" \n",
" def __str__(self):\n",
" return f\"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}\"\n",
" \n",
" def to_row(self):\n",
" return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]"
]
},
{
"cell_type": "markdown",
"id": "63cceebe",
"metadata": {},
"source": [
"**Konstanten und Pfade**"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8bcc877f",
"metadata": {},
"outputs": [],
"source": [
"REQUEST_HEADERS = {\n",
" \"User-Agent\": (\n",
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 \"\n",
" \"(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36\"\n",
" )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c637ac38",
"metadata": {},
"outputs": [],
"source": [
"DATA_FOLDER = os.path.join(\"..\", \"data\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f094dee0",
"metadata": {},
"outputs": [],
"source": [
"BUNDESLAENDER = [\n",
" \"baden-wuerttemberg\",\n",
" \"bayern\",\n",
" \"berlin-brandenburg\",\n",
" \"bremen\",\n",
" \"hamburg\",\n",
" \"hessen\",\n",
" \"mecklenburg-vorpommern\",\n",
" \"niedersachsen\",\n",
" \"nordrhein-westfalen\",\n",
" \"rheinland-pfalz\",\n",
" \"saarland\",\n",
" \"sachsen\",\n",
" \"sachsen-anhalt\",\n",
" \"schleswig-holstein\",\n",
" \"thueringen\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "84632391",
"metadata": {},
"outputs": [],
"source": [
"def requests_get(request):\n",
" return requests.get(request, headers=REQUEST_HEADERS)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1af0bdbd",
"metadata": {},
"outputs": [],
"source": [
"def extract_response(response, bundesland=None):\n",
" \"\"\"Extrahiere aus der Response einer Request alle Pressemitteilungen\n",
" \n",
" Args:\n",
" response (:obj:`Response`)\n",
" bundesland (:obj:`str`): Kann mit angegeben, falls es in der Suche relevant war. Default = None\n",
" \n",
" Returns:\n",
" list of :obj:`Pressemitteilung`\n",
" \"\"\"\n",
" \n",
" mitteilungen = []\n",
" \n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" for article in soup.find_all('article'):\n",
" data_url = article['data-url']\n",
" article_id = '-'.join(article['data-url'].split('/')[-2:])\n",
" meta = article.find('div')\n",
" \n",
" timestamp_str = meta.find(class_=\"date\")\n",
" \n",
" if timestamp_str is not None:\n",
" timestamp_str = timestamp_str.text\n",
" timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y %H:%M')\n",
" else:\n",
" timestamp = None\n",
" \n",
" location_str = meta.find(class_=\"news-topic\")\n",
" location_str = location_str.text if location_str is not None else None\n",
" \n",
" p_texts = article.findAll('p')\n",
" if len(p_texts) > 1:\n",
" text = p_texts[1].text\n",
" else:\n",
" text = ''\n",
" \n",
" mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))\n",
" \n",
" return mitteilungen"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c62c06c9",
"metadata": {},
"outputs": [],
"source": [
"def create_get_request(*, site=1, location=None, start_date=None, end_date=None):\n",
" \"\"\"Simulation einer API: Erzeuge aus Parametern eine URL\n",
" \n",
" Args:\n",
" site (int, default=1): Aktuelle Seite, auf der man sich befinden soll. Ist in der URL in 30er Schritten angegeben\n",
" location (:obj:`str`, default=None): Bundesland bzw. Stadt\n",
" start_date (:obj:`str`, default=None)\n",
" end_date (:obj:`str`, default=None)\n",
" Returns:\n",
" str: URL\n",
" \"\"\"\n",
" url = f\"https://www.presseportal.de/blaulicht/d/polizei\"\n",
" \n",
" if location is not None:\n",
" url += f\"/l/{location}\"\n",
" \n",
" if site > 1:\n",
" url += f\"/{site*30}\"\n",
" \n",
" if start_date is not None or end_date is not None:\n",
" url += \"?\"\n",
" \n",
" if start_date is not None:\n",
" url += f\"startDate={start_date}\"\n",
" \n",
" if end_date is not None:\n",
" url += \"&\"\n",
" \n",
" if end_date is not None:\n",
" url += f\"endDate={end_date}\"\n",
" \n",
" return url"
]
},
{
"cell_type": "markdown",
"id": "1c67c9bc",
"metadata": {},
"source": [
"## Beispiel: Hamburg "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aff924d6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = create_get_request(location=\"hamburg\", site=3, start_date=\"2021-01-13\", end_date=\"2021-03-20\")\n",
"url"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6e2b9091",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n",
"[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n",
"[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n",
"[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n",
"[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n"
]
}
],
"source": [
"for mitteilung in extract_response(requests_get(url))[:5]:\n",
" print(mitteilung)"
]
},
{
"cell_type": "markdown",
"id": "e50af557",
"metadata": {},
"source": [
"## Effizientes Einlesen"
]
},
{
"cell_type": "markdown",
"id": "b4a9580a",
"metadata": {},
"source": [
"Um die Daten sinnvoll zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1 Sek. / Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "da927e30",
"metadata": {},
"outputs": [],
"source": [
"def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):\n",
" \"\"\"Suche alle Meldungen für ein Bundesland zu einem konkreten Tag\"\"\"\n",
"\n",
" meldungen = []\n",
" site = 1\n",
" \n",
" start_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
" end_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
" request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)\n",
" \n",
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
" meldungen.extend(new_meldungen)\n",
" \n",
" pbar = tqdm(desc=bundesland)\n",
" while len(new_meldungen) != 0:\n",
" time.sleep(1)\n",
" site += 1\n",
" \n",
" request = create_get_request(\n",
" site=site, location=bundesland, start_date=start_date, end_date=end_date,\n",
" )\n",
" \n",
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
" meldungen.extend(new_meldungen)\n",
" pbar.update(1)\n",
" pbar.close()\n",
" \n",
" return meldungen"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "85508758",
"metadata": {},
"outputs": [],
"source": [
"def get_meldungen_for_date(year, month, day):\n",
" \"\"\"Extrahiere alle Meldungen für einen Tag\n",
" \n",
" Args:\n",
" year (int): Jahr\n",
" month (int): Monat\n",
" day (int): Tag\n",
" \"\"\"\n",
"\n",
" meldungen_dict = {}\n",
" \n",
" for bundesland in BUNDESLAENDER:\n",
" meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)\n",
" meldungen_dict[bundesland] = meldungen\n",
" \n",
" return meldungen_dict"
]
},
{
"cell_type": "markdown",
"id": "f938d8a9",
"metadata": {},
"source": [
"## Speichern der Daten in CSV-Dateien"
]
},
{
"cell_type": "markdown",
"id": "67374d3b",
"metadata": {},
"source": [
"Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. "
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "276e700d",
"metadata": {},
"outputs": [],
"source": [
"def store_meldungen_in_csv(year, month, day):\n",
" \"\"\"Speichere alle Meldungen für ein Datum in einer CSV. Im Namen der CSV steht das Datum.\"\"\"\n",
"\n",
" filename = f\"{year}-{month}-{day}_presseportal.csv\"\n",
" path = os.path.join(DATA_FOLDER, filename)\n",
" meldungen_per_bundesland = get_meldungen_for_date(year, month, day)\n",
" \n",
" with open(path, 'w', newline='', encoding='UTF8') as f:\n",
" writer = csv.writer(f)\n",
" writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])\n",
" \n",
" for bundesland, meldungen in meldungen_per_bundesland.items():\n",
" for meldung in meldungen:\n",
" writer.writerow(meldung.to_row())\n",
" \n",
" print(f\"File '{filename}' created\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c5d0bdbd",
"metadata": {},
"outputs": [],
"source": [
"def store_month(year, month):\n",
" month_end_day = calendar.monthrange(year, month)[1]\n",
" \n",
" for i in range(0, month_end_day):\n",
" store_meldungen_in_csv(year, month, i+1)"
]
},
{
"cell_type": "markdown",
"id": "d9f3e24b",
"metadata": {},
"source": [
"## Auswertung: Wie viele Einträge pro Bundesland?"
]
},
{
"cell_type": "markdown",
"id": "9f600d3c",
"metadata": {},
"source": [
"Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "b7c85078",
"metadata": {},
"outputs": [],
"source": [
"counter = {}\n",
"\n",
"for filename in os.listdir('../data/'):\n",
" if filename.endswith(\"_presseportal.csv\"):\n",
" path = '../data/' + filename\n",
" \n",
" with open(path, 'r', encoding='UTF8') as f_in:\n",
" reader = csv.reader(f_in)\n",
" next(reader)\n",
" for row in reader:\n",
" bundesland = row[3]\n",
" if bundesland not in counter:\n",
" counter[bundesland] = 1\n",
" else:\n",
" counter[bundesland] += 1\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python-scientific kernel",
"language": "python",
"name": "python-scientific"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,40 @@
"""
Aim: Building a connection between all tweets (tweet-id) and the state (Bundesland; Stadt) of the corresponding
police station (user_id; name; handle)
"""
import pandas as pd
from match_blaulich_tw_accounts import extend_blaulicht_data
# Input tables, loaded once at import time. Forward slashes are used so the
# relative paths resolve on POSIX systems as well as on Windows (a raw
# backslash is only a path separator on Windows).
tw_tweets = pd.read_csv('data/copbird_table_tweet.csv')
tw_user_data = pd.read_csv('data/copbird_table_user.csv')
# The account geolocation table is tab-separated.
tw_pol_geo_data = pd.read_csv('geolocations/polizei_accounts_geo.csv', delimiter='\t')
def get_tweets_by_user_id():
    """Group tweet ids by the account that posted them.

    Takes the ``user_id`` and ``id`` columns of the module-level ``tw_tweets``
    frame (cast to ``str``), renames ``id`` to ``tweet_id`` and groups the
    result on ``user_id``.

    Returns:
        The pandas groupby object keyed on ``user_id``.
    """
    id_pairs = pd.DataFrame(tw_tweets, columns=['user_id', 'id'], dtype=str)
    id_pairs = id_pairs.rename(columns={"id": "tweet_id"})
    return id_pairs.groupby('user_id')
def add_state_to_user_df():
    """Attach city and federal state to every Twitter account.

    Left-joins the module-level user table (``id`` renamed to ``user_id``)
    with the ``name``/``stadt``/``bundesland`` columns of the geolocation
    table, matching on the account display name. Accounts without a match in
    the geolocation table keep their row and get missing values.

    Returns:
        pandas.DataFrame: the user table extended by ``stadt`` and ``bundesland``.
    """
    users = tw_user_data.rename(columns={"id": "user_id"})
    geo = tw_pol_geo_data.rename(columns={"Name": "name", "Bundesland": "bundesland", "Stadt": "stadt"})
    geo_columns = geo[['name', 'stadt', 'bundesland']]
    return users.merge(geo_columns, on='name', how='left')
def add_state_to_tweets_df():
    """Attach account name, handle, city and federal state to every tweet.

    Left-joins the module-level ``tw_tweets`` frame with the extended user
    table from ``add_state_to_user_df()`` on ``user_id``.

    Returns:
        pandas.DataFrame: columns ``tweet_id``, ``tweet_text``, ``created_at``,
        ``user_id``, ``user_name``, ``handle``, ``stadt``, ``bundesland``.
    """
    user_columns = ['user_id', 'stadt', 'bundesland', 'name', 'handle']
    output_columns = ['id', 'tweet_text', 'created_at', 'user_id', 'name', 'handle', 'stadt', 'bundesland']

    users_with_state = add_state_to_user_df()[user_columns]
    merged = pd.merge(tw_tweets, users_with_state, on='user_id', how='left')
    return merged[output_columns].rename(columns={'id': 'tweet_id', 'name': 'user_name'})
def save_to_csv(df: pd.DataFrame, file_name: str) -> None:
    """Write a data frame to ``<file_name>.csv`` without the index column.

    Args:
        df: Frame to export.
        file_name: Target path without the ``.csv`` extension; the extension
            is appended here.
    """
    df.to_csv(path_or_buf=f'{file_name}.csv', index=False)


if __name__ == '__main__':
    # Export the press releases returned by extend_blaulicht_data().
    save_to_csv(extend_blaulicht_data(), '2020-12_2021-05_presseportal')

View file

@ -0,0 +1,44 @@
import pandas as pd
from os import listdir
from os.path import join, isdir
# Extended Twitter account table, loaded once at import time; its 'name'
# column is exposed as 'user_name'.
df_tw_user = pd.read_csv('copbird_table_user_ext.csv').rename(columns={'name': 'user_name'})
# Root folder that concat_blaulicht_dfs() scans for sub-folders of press-release CSV files.
dir_blaulicht = 'data/presseportal'
def concat_blaulicht_dfs():
    """Load every press-release file below ``dir_blaulicht`` into one frame.

    Each sub-directory of ``dir_blaulicht`` is scanned and every file inside
    it is parsed with ``pandas.read_csv``. Entries directly inside
    ``dir_blaulicht`` that are not directories are ignored.

    Returns:
        pandas.DataFrame: all rows of all files, stacked in sorted path order
        with their original per-file index; an empty frame if no file exists.
    """
    frames = []
    # Sorted traversal makes the row order reproducible across platforms
    # (os.listdir returns entries in arbitrary order).
    for entry in sorted(listdir(dir_blaulicht)):
        subdir = join(dir_blaulicht, entry)
        if not isdir(subdir):
            continue
        for file_name in sorted(listdir(subdir)):
            frames.append(pd.read_csv(join(subdir, file_name)))

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not frames:
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)
def extend_blaulicht_data():
    """Attach the matching Twitter account id to every press release.

    Returns:
        pandas.DataFrame: the concatenated press releases with an added
        ``tw_user_id`` column. Rows whose ``location`` matches no account
        city get the empty string.
    """
    df_blaulicht = concat_blaulicht_dfs()
    # Hand the loaded frame over so the CSV files are only read once.
    mapping = map_bl_tw_citys(df_blaulicht)
    df_blaulicht['tw_user_id'] = df_blaulicht['location'].apply(lambda x: find_location(x, mapping))
    return df_blaulicht


def find_location(txt, mp):
    """Look up the Twitter user id mapped to a press-release location.

    Args:
        txt: Location value of a press release.
        mp: Mapping as returned by ``map_bl_tw_citys``:
            location string -> ``[city, user_id]``.

    Returns:
        The user id of the mapped account, or ``""`` if ``txt`` is not a key
        of ``mp``.
    """
    entry = mp.get(txt, "")
    if entry == "":
        return ""
    return entry[1]


def map_bl_tw_citys(df_blaulicht=None):
    """Map press-release locations to Twitter accounts by city name.

    A location is assigned to an account when the account's city occurs in
    the location as a whole word (case-insensitive). If several account
    cities match the same location, the last one in ``df_tw_user`` order wins.

    Args:
        df_blaulicht: Press-release frame with a ``location`` column. When
            omitted, the frame is loaded via ``concat_blaulicht_dfs()``.

    Returns:
        dict: location string -> ``[city, user_id]``.
    """
    import re

    if df_blaulicht is None:
        df_blaulicht = concat_blaulicht_dfs()

    # Compile one whole-word pattern per account city up front instead of
    # rebuilding it for every (location, city) pair.
    matchers = []
    for tw_loc, tw_id in df_tw_user[['stadt', 'user_id']].itertuples(index=False, name=None):
        # Missing cities would otherwise be stringified to "nan" and matched
        # against locations; single-character values are skipped as well.
        if pd.isna(tw_loc) or len(str(tw_loc)) <= 1:
            continue
        pattern = re.compile(r'\b' + re.escape(str(tw_loc).lower()) + r'\b')
        matchers.append((pattern, tw_loc, tw_id))

    bl_tw_locations = {}
    # Missing locations are dropped for the same reason as missing cities.
    for location in df_blaulicht['location'].dropna().unique():
        bl_loc = str(location)
        haystack = bl_loc.lower()
        for pattern, tw_loc, tw_id in matchers:
            if pattern.search(haystack):
                bl_tw_locations[bl_loc] = [tw_loc, tw_id]
    return bl_tw_locations


if __name__ == '__main__':
    extend_blaulicht_data()

View file

@ -0,0 +1,40 @@
"""
Aim: Building a connection between all tweets (tweet-id) and the state (Bundesland; Stadt) of the corresponding
police station (user_id; name; handle)
"""
import pandas as pd
from match_blaulich_tw_accounts import extend_blaulicht_data
# Input tables, loaded once at import time. Forward slashes are used so the
# relative paths resolve on POSIX systems as well as on Windows (a raw
# backslash is only a path separator on Windows).
tw_tweets = pd.read_csv('data/copbird_table_tweet.csv')
tw_user_data = pd.read_csv('data/copbird_table_user.csv')
# The account geolocation table is tab-separated.
tw_pol_geo_data = pd.read_csv('geolocations/polizei_accounts_geo.csv', delimiter='\t')
def get_tweets_by_user_id():
    """Group tweet ids by the account that posted them.

    Takes the ``user_id`` and ``id`` columns of the module-level ``tw_tweets``
    frame (cast to ``str``), renames ``id`` to ``tweet_id`` and groups the
    result on ``user_id``.

    Returns:
        The pandas groupby object keyed on ``user_id``.
    """
    id_pairs = pd.DataFrame(tw_tweets, columns=['user_id', 'id'], dtype=str)
    id_pairs = id_pairs.rename(columns={"id": "tweet_id"})
    return id_pairs.groupby('user_id')
def add_state_to_user_df():
    """Attach city and federal state to every Twitter account.

    Left-joins the module-level user table (``id`` renamed to ``user_id``)
    with the ``name``/``stadt``/``bundesland`` columns of the geolocation
    table, matching on the account display name. Accounts without a match in
    the geolocation table keep their row and get missing values.

    Returns:
        pandas.DataFrame: the user table extended by ``stadt`` and ``bundesland``.
    """
    users = tw_user_data.rename(columns={"id": "user_id"})
    geo = tw_pol_geo_data.rename(columns={"Name": "name", "Bundesland": "bundesland", "Stadt": "stadt"})
    geo_columns = geo[['name', 'stadt', 'bundesland']]
    return users.merge(geo_columns, on='name', how='left')
def add_state_to_tweets_df():
    """Attach account name, handle, city and federal state to every tweet.

    Left-joins the module-level ``tw_tweets`` frame with the extended user
    table from ``add_state_to_user_df()`` on ``user_id``.

    Returns:
        pandas.DataFrame: columns ``tweet_id``, ``tweet_text``, ``created_at``,
        ``user_id``, ``user_name``, ``handle``, ``stadt``, ``bundesland``.
    """
    user_columns = ['user_id', 'stadt', 'bundesland', 'name', 'handle']
    output_columns = ['id', 'tweet_text', 'created_at', 'user_id', 'name', 'handle', 'stadt', 'bundesland']

    users_with_state = add_state_to_user_df()[user_columns]
    merged = pd.merge(tw_tweets, users_with_state, on='user_id', how='left')
    return merged[output_columns].rename(columns={'id': 'tweet_id', 'name': 'user_name'})
def save_to_csv(df: pd.DataFrame, file_name: str) -> None:
    """Write a data frame to ``<file_name>.csv`` without the index column.

    Args:
        df: Frame to export.
        file_name: Target path without the ``.csv`` extension; the extension
            is appended here.
    """
    df.to_csv(path_or_buf=f'{file_name}.csv', index=False)


if __name__ == '__main__':
    # Export the press releases returned by extend_blaulicht_data().
    save_to_csv(extend_blaulicht_data(), '2020-12_2021-05_presseportal')

View file

@ -0,0 +1,44 @@
import pandas as pd
from os import listdir
from os.path import join, isdir
# Extended Twitter account table, loaded once at import time; its 'name'
# column is exposed as 'user_name'.
df_tw_user = pd.read_csv('copbird_table_user_ext.csv').rename(columns={'name': 'user_name'})
# Root folder that concat_blaulicht_dfs() scans for sub-folders of press-release CSV files.
dir_blaulicht = 'data/presseportal'
def concat_blaulicht_dfs():
    """Load every press-release file below ``dir_blaulicht`` into one frame.

    Each sub-directory of ``dir_blaulicht`` is scanned and every file inside
    it is parsed with ``pandas.read_csv``. Entries directly inside
    ``dir_blaulicht`` that are not directories are ignored.

    Returns:
        pandas.DataFrame: all rows of all files, stacked in sorted path order
        with their original per-file index; an empty frame if no file exists.
    """
    frames = []
    # Sorted traversal makes the row order reproducible across platforms
    # (os.listdir returns entries in arbitrary order).
    for entry in sorted(listdir(dir_blaulicht)):
        subdir = join(dir_blaulicht, entry)
        if not isdir(subdir):
            continue
        for file_name in sorted(listdir(subdir)):
            frames.append(pd.read_csv(join(subdir, file_name)))

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not frames:
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)
def extend_blaulicht_data():
    """Attach the matching Twitter account id to every press release.

    Returns:
        pandas.DataFrame: the concatenated press releases with an added
        ``tw_user_id`` column. Rows whose ``location`` matches no account
        city get the empty string.
    """
    df_blaulicht = concat_blaulicht_dfs()
    # Hand the loaded frame over so the CSV files are only read once.
    mapping = map_bl_tw_citys(df_blaulicht)
    df_blaulicht['tw_user_id'] = df_blaulicht['location'].apply(lambda x: find_location(x, mapping))
    return df_blaulicht


def find_location(txt, mp):
    """Look up the Twitter user id mapped to a press-release location.

    Args:
        txt: Location value of a press release.
        mp: Mapping as returned by ``map_bl_tw_citys``:
            location string -> ``[city, user_id]``.

    Returns:
        The user id of the mapped account, or ``""`` if ``txt`` is not a key
        of ``mp``.
    """
    entry = mp.get(txt, "")
    if entry == "":
        return ""
    return entry[1]


def map_bl_tw_citys(df_blaulicht=None):
    """Map press-release locations to Twitter accounts by city name.

    A location is assigned to an account when the account's city occurs in
    the location as a whole word (case-insensitive). If several account
    cities match the same location, the last one in ``df_tw_user`` order wins.

    Args:
        df_blaulicht: Press-release frame with a ``location`` column. When
            omitted, the frame is loaded via ``concat_blaulicht_dfs()``.

    Returns:
        dict: location string -> ``[city, user_id]``.
    """
    import re

    if df_blaulicht is None:
        df_blaulicht = concat_blaulicht_dfs()

    # Compile one whole-word pattern per account city up front instead of
    # rebuilding it for every (location, city) pair.
    matchers = []
    for tw_loc, tw_id in df_tw_user[['stadt', 'user_id']].itertuples(index=False, name=None):
        # Missing cities would otherwise be stringified to "nan" and matched
        # against locations; single-character values are skipped as well.
        if pd.isna(tw_loc) or len(str(tw_loc)) <= 1:
            continue
        pattern = re.compile(r'\b' + re.escape(str(tw_loc).lower()) + r'\b')
        matchers.append((pattern, tw_loc, tw_id))

    bl_tw_locations = {}
    # Missing locations are dropped for the same reason as missing cities.
    for location in df_blaulicht['location'].dropna().unique():
        bl_loc = str(location)
        haystack = bl_loc.lower()
        for pattern, tw_loc, tw_id in matchers:
            if pattern.search(haystack):
                bl_tw_locations[bl_loc] = [tw_loc, tw_id]
    return bl_tw_locations


if __name__ == '__main__':
    extend_blaulicht_data()

View file

@ -0,0 +1,512 @@
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_text <- pm$content
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
content_ber <- rep(NA, nrow(pm))
content_ber <- rep(NA, nrow(pm))
content_ber[which(!is.na(pm$content))] <- pm_text
content_ber[which(!is.na(pm$content))] <- pm_text
pm <- cbind(pm, content_ber)
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
content_ber_satzzeichen <- rep(NA, nrow(pm))
content_ber_satzzeichen <- rep(NA, nrow(pm))
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
pm <- cbind(pm, content_ber_satzzeichen)
head(pm)
pm_text <- pm_demo$content
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
content_ber <- rep(NA, nrow(pm_demo))
content_ber[which(!is.na(pm_demo$content))] <- pm_text
pm_demo <- cbind(pm_demo, content_ber)
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
head(pm_demo)
readAndflattenSentiWS <- function(filename) {
words = readLines(filename, encoding="UTF-8")
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
words <- unlist(strsplit(words, ","))
words <- tolower(words)
return(words)
}
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/positive-words.txt"))
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/negative-words.txt"))
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
#require(plyr)
require(stringr)
scores = laply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
#require(plyr)
require(stringr)
scores = lapply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
ggplot(score_pm_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
#require(plyr)
require(stringr)
scores = lapply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
library(plyr)
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
require(plyr)
require(stringr)
scores = lapply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
require(plyr)
require(stringr)
scores = lapply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
require(plyr)
require(stringr)
scores = laply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# I don't just want a TRUE/FALSE! How can I do this?
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
ggplot(score_pm_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
ggplot(score_tw_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
theme_minimal()
View(score_tw_demo)
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringi)
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_text <- pm$content
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
content_ber <- rep(NA, nrow(pm))
content_ber[which(!is.na(pm$content))] <- pm_text
pm <- cbind(pm, content_ber)
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
content_ber_satzzeichen <- rep(NA, nrow(pm))
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
pm <- cbind(pm, content_ber_satzzeichen)
head(pm)
# csvpath <- <your path>
# write_csv(pm, str_c(csvpath, "/pressemeldungen.csv"))
pm_text <- pm_demo$content
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
content_ber <- rep(NA, nrow(pm_demo))
content_ber[which(!is.na(pm_demo$content))] <- pm_text
pm_demo <- cbind(pm_demo, content_ber)
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
head(pm_demo)
readAndflattenSentiWS <- function(filename) {
words = readLines(filename, encoding="UTF-8")
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
words <- unlist(strsplit(words, ","))
words <- tolower(words)
return(words)
}
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/positive-words.txt"))
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/negative-words.txt"))
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
require(plyr)
require(stringr)
scores = laply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# I don't just want a TRUE/FALSE! How can I do this?
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
ggplot(score_pm_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
ggplot(score_tw_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
theme_minimal()
View(score_tw_demo)
Ciew(score_pm_demo)
View(score_pm_demo)
score_pm_demo$text[3]
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringi)
# Read in data
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)
summary(pm)
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
usersX <- read_csv("data/copbird_table_user_ext.csv")
# tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Oldenburg-Stadt/Ammerl"] <- "Oldenburg"
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Mecklenburgische Seenp"] <- "Neubrandenburg"
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Wilhelmshaven/Frieslan"] <- "Wilhelmshaven"
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Baden-Württember"] <- "Stuttgart"
# tweetXstate$stadt[tweetXstate$user_name == "Landeskriminalamt Rheinland-Pf"] <- "Mainz"
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Mitteldeutschlan"] <- "Pirna"
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Delmenhorst/Oldenburg-"] <- "Delmenhorst"
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Flughafen Frankf"] <- "Frankfurt"
# blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
# users <- read_csv("data/copbird_table_user.csv")
# str(users)
# users$name <- as.factor(users$name)
# users$handle <- as.factor(users$handle)
pm_orte <- pm %>% group_by(bundesland) %>% count(location)
head(pm_orte)
head(pm_orte %>% arrange(desc(n)), n = 20)
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringi)
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
head(usersX)
head(tweetXstate[, 5:8])
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
head(blaulicht[, -c(2, 5)])
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)
land_pm <- pm %>% group_by(bundesland) %>% count()
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
ggplot(land_pm_tw) +
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
facet_wrap(~Plattform) +
coord_flip() +
guides(fill = FALSE) +
labs(title = "Anzahl der Pressemeldungen und Tweets",
subtitle = "Im Zeitraum April bis Mai 2021") +
theme_minimal()
ggplot(land_pm_tw) +
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
coord_flip() +
labs(title = "Anzahl der Pressemeldungen und Tweets",
subtitle = "Im Zeitraum April bis Mai 2021") +
theme_minimal()
readAndflattenSentiWS <- function(filename) {
words = readLines(filename, encoding="UTF-8")
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
words <- unlist(strsplit(words, ","))
words <- tolower(words)
return(words)
}
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/positive-words.txt"))
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
readAndflattenSentiWS("data/negative-words.txt"))
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
require(plyr)
require(stringr)
scores = laply(sentences, function(sentence, pos.words, neg.words)
{
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# I don't just want a TRUE/FALSE! How can I do this?
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
},
pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
ggplot(score_pm_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
ggplot(score_tw_demo) +
geom_bar(aes(x = score), fill = "blue") +
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
theme_minimal()
# Score the drug-topic press releases and tweets, so that the
# "Topic: Drogen" plots are built from the drug subsets rather than from the
# demonstration subsets.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
ggplot(score_pm_drogen) +
geom_bar(aes(x = score), fill = "darkgreen") +
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
ggplot(score_tw_drogen) +
geom_bar(aes(x = score), fill = "darkgreen") +
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
theme_minimal()
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
ggplot(score_pm_rass) +
geom_bar(aes(x = score), fill = "purple") +
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
theme_minimal()
ggplot(score_tw_rass) +
geom_bar(aes(x = score), fill = "purple") +
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
theme_minimal()

View file

@ -0,0 +1,216 @@
---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---
```{r setup, include=FALSE}
# Chunk default for the whole document: show the R source of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
```
# Daten einlesen
```{r, message = FALSE}
library(tidyverse)
library(stringi)
# Paths of the daily press-release exports: data/<year>-<month>-<day>_presseportal.csv.
# Each month is paired with the last day for which a file is listed
# (December 2020 up to day 26, May 2021 up to day 21).
pm_csv <- unlist(
  Map(
    function(monat, letzter_tag) {
      str_c("data/", monat, "-", seq_len(letzter_tag), "_presseportal.csv")
    },
    c("2020-12", "2021-1", "2021-2", "2021-3", "2021-4", "2021-5"),
    c(26, 31, 28, 31, 30, 21)
  ),
  use.names = FALSE
)

# Read every daily file and stack them into one table of press releases.
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets: keep rows created on or after 2021-04-01 and only the first four columns.
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]

# Extended user / tweet tables and the combined press-release export.
usersX      <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht   <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Topic-filtered subsets (pm_* = press releases, tw_* = tweets).
pm_demo   <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo   <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
pm_rass   <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass   <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```
# Scrapen der Pressemeldungen (seit Dezember 2020)
# Zuordnung von Orten der Pressemeldungen und Tweets
```{r}
# Show the first rows of usersX.
head(usersX)
# Show the first rows of columns 5 to 8 of tweetXstate.
head(tweetXstate[, 5:8])
# Store the Twitter user id as text
# (presumably so long numeric ids are displayed in full -- confirm).
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
# Show the first rows of blaulicht, leaving out columns 2 and 5.
head(blaulicht[, -c(2, 5)])
```
# Anzahl Pressemeldungen vs. Tweets
```{r}
# --- Tweets per federal state -------------------------------------------------
# Attach the state (column 4 of usersX) to every tweet via the user id (column 1).
# NOTE(review): full_join() also keeps accounts that have no tweet in `tweets`;
# count() then counts each of them as one row. Confirm this is intended.
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
# "-" marks accounts without a state; treat it as missing.
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)

# --- Press releases per federal state ----------------------------------------
land_pm <- pm %>% group_by(bundesland) %>% count()
# Harmonise the spelling with the Twitter table: merge "berlin-brandenburg" into
# "berlin", title-case the names and restore the umlaut written as "ue".
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
# Names that do not occur among the Twitter levels become NA here.
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))

# --- Combine both sources -----------------------------------------------------
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
# Drop rows without a state. Logical indexing replaces `-which(is.na(...))`:
# if no row is NA, which() returns integer(0) and `df[-integer(0), ]` selects
# zero rows, i.e. the whole table would be thrown away.
land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ]
# States without any press release get a count of 0.
land_pm_tw$Pressemeldung[is.na(land_pm_tw$Pressemeldung)] <- 0
# Long format: one row per state and platform.
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)

# Absolute counts, one facet per platform.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  # "none" replaces the deprecated `guides(fill = FALSE)`; the facets already label the platform.
  guides(fill = "none") +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

# Relative share of both platforms per state.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```
# Topic modelling
```{r, message=FALSE}
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
#
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
#
# anzahl.themen <- 10
# anzahl.woerter <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
#
# write_csv(topmod, "data/topicmodel.csv")
```
### Auswahl der Keywords
`topic_1 = ['demonstr', 'kundgeb']`
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
# Sentiment Analyse
```{r}
# Read a sentiment word list and return all of its word forms as one lower-case vector.
#   filename : path of the file, read line by line as UTF-8
# On each line the first "|<UPPERCASE TAG><tab><number>" part (plus an optional
# following tab) is replaced by a comma; the line is then split at commas, so the
# leading word and every comma-separated form after it become separate entries.
readAndflattenSentiWS <- function(filename) {
  raw_lines <- readLines(filename, encoding="UTF-8")
  comma_separated <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines)
  word_forms <- unlist(strsplit(comma_separated, ","))
  tolower(word_forms)
}
# Positive / negative dictionaries. Each file is read twice and both results are
# concatenated: scan() splits it on whitespace (text after ';' is ignored), and
# readAndflattenSentiWS() returns the lower-cased individual word forms.
# NOTE(review): the scan() tokens look redundant next to the flattened list -- confirm.
pos.words <- c(
  scan(file = "SentiWS/positive-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
  readAndflattenSentiWS(filename = "SentiWS/positive-words.txt")
)
neg.words <- c(
  scan(file = "SentiWS/negative-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
  readAndflattenSentiWS(filename = "SentiWS/negative-words.txt")
)
# Dictionary-based sentiment score for each text.
#   sentences : character vector of texts
#   pos.words : character vector of positive terms
#   neg.words : character vector of negative terms
#   .progress : progress-bar name handed to plyr::laply() (default 'none')
# Returns a data frame with one row per text:
#   score = number of words found in pos.words minus number found in neg.words
#   text  = the unmodified input text
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
  # plyr and stringr are called through `::` instead of require(): attaching plyr
  # after dplyr masks dplyr verbs such as count() for the rest of the session, and
  # require() only warns when a package is missing.
  score_one <- function(sentence, pos.words, neg.words) {
    # strip punctuation, control characters and digits, then lower-case
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)
    # split at whitespace; str_split() returns a list, so flatten it
    words <- unlist(stringr::str_split(sentence, '\\s+'))
    # match() gives the dictionary position or NA; only hit / no hit is needed
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    # sum() counts TRUE as 1 and FALSE as 0
    sum(pos.matches) - sum(neg.matches)
  }
  scores <- plyr::laply(sentences, score_one, pos.words, neg.words, .progress = .progress)
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
# Bar chart of the per-document sentiment scores for one topic and one source.
#   scores.df : data frame as returned by score.sentiment(); its `score` column is plotted
#   topic     : topic name, appended to "Topic: " in the plot title
#   quelle    : source label, appended to "Sentiment-Analyse der " in the subtitle
#   fill      : fill colour of the bars
# Returns the ggplot object (auto-printed when called at top level of the chunk).
plot_sentiment_scores <- function(scores.df, topic, quelle, fill) {
  ggplot(scores.df) +
    geom_bar(aes(x = score), fill = fill) +
    labs(title = paste0("Topic: ", topic),
         subtitle = paste0("Sentiment-Analyse der ", quelle)) +
    theme_minimal()
}

# Topic: demonstrations
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_demo, "Demonstrationen", "Pressemeldungen", "blue")
plot_sentiment_scores(score_tw_demo, "Demonstrationen", "Tweets", "blue")

# Topic: drugs
# Fixed copy-paste error: these scores were previously computed from pm_demo / tw_demo,
# so the "Drogen" plots silently repeated the demonstration data.
# NOTE(review): assumes tw_drogen has a `tweet_text` column like tw_demo / tw_rass -- confirm.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_drogen, "Drogen", "Pressemeldungen", "darkgreen")
plot_sentiment_scores(score_tw_drogen, "Drogen", "Tweets", "darkgreen")

# Topic: racism
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_rass, "Rassismus", "Pressemeldungen", "purple")
plot_sentiment_scores(score_tw_rass, "Rassismus", "Tweets", "purple")
```
```{r}
# Print the R version and the attached/loaded packages used to render this report.
sessionInfo()
```

View file

@ -0,0 +1,216 @@
---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---
```{r setup, include=FALSE}
# Chunk default for the whole document: show the R source of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
```
# Daten einlesen
```{r, message = FALSE}
library(tidyverse)
library(stringi)
# Paths of the daily press-release exports: data/<year>-<month>-<day>_presseportal.csv.
# Each month is paired with the last day for which a file is listed
# (December 2020 up to day 26, May 2021 up to day 21).
pm_csv <- unlist(
  Map(
    function(monat, letzter_tag) {
      str_c("data/", monat, "-", seq_len(letzter_tag), "_presseportal.csv")
    },
    c("2020-12", "2021-1", "2021-2", "2021-3", "2021-4", "2021-5"),
    c(26, 31, 28, 31, 30, 21)
  ),
  use.names = FALSE
)

# Read every daily file and stack them into one table of press releases.
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets: keep rows created on or after 2021-04-01 and only the first four columns.
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]

# Extended user / tweet tables and the combined press-release export.
usersX      <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht   <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Topic-filtered subsets (pm_* = press releases, tw_* = tweets).
pm_demo   <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo   <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
pm_rass   <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass   <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```
# Scrapen der Pressemeldungen (seit Dezember 2020)
# Zuordnung von Orten der Pressemeldungen und Tweets
```{r}
# Show the first rows of usersX.
head(usersX)
# Show the first rows of columns 5 to 8 of tweetXstate.
head(tweetXstate[, 5:8])
# Store the Twitter user id as text
# (presumably so long numeric ids are displayed in full -- confirm).
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
# Show the first rows of blaulicht, leaving out columns 2 and 5.
head(blaulicht[, -c(2, 5)])
```
# Anzahl Pressemeldungen vs. Tweets
```{r}
# --- Tweets per federal state -------------------------------------------------
# Attach the state (column 4 of usersX) to every tweet via the user id (column 1).
# NOTE(review): full_join() also keeps accounts that have no tweet in `tweets`;
# count() then counts each of them as one row. Confirm this is intended.
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
# "-" marks accounts without a state; treat it as missing.
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)

# --- Press releases per federal state ----------------------------------------
land_pm <- pm %>% group_by(bundesland) %>% count()
# Harmonise the spelling with the Twitter table: merge "berlin-brandenburg" into
# "berlin", title-case the names and restore the umlaut written as "ue".
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
# Names that do not occur among the Twitter levels become NA here.
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))

# --- Combine both sources -----------------------------------------------------
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
# Drop rows without a state. Logical indexing replaces `-which(is.na(...))`:
# if no row is NA, which() returns integer(0) and `df[-integer(0), ]` selects
# zero rows, i.e. the whole table would be thrown away.
land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ]
# States without any press release get a count of 0.
land_pm_tw$Pressemeldung[is.na(land_pm_tw$Pressemeldung)] <- 0
# Long format: one row per state and platform.
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)

# Absolute counts, one facet per platform.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  # "none" replaces the deprecated `guides(fill = FALSE)`; the facets already label the platform.
  guides(fill = "none") +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

# Relative share of both platforms per state.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```
# Topic modelling
```{r, message=FALSE}
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
#
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
#
# anzahl.themen <- 10
# anzahl.woerter <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
#
# write_csv(topmod, "data/topicmodel.csv")
```
### Auswahl der Keywords
`topic_1 = ['demonstr', 'kundgeb']`
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
# Sentiment Analyse
```{r}
# Read a sentiment word list and return all of its word forms as one lower-case vector.
#   filename : path of the file, read line by line as UTF-8
# On each line the first "|<UPPERCASE TAG><tab><number>" part (plus an optional
# following tab) is replaced by a comma; the line is then split at commas, so the
# leading word and every comma-separated form after it become separate entries.
readAndflattenSentiWS <- function(filename) {
  raw_lines <- readLines(filename, encoding="UTF-8")
  comma_separated <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines)
  word_forms <- unlist(strsplit(comma_separated, ","))
  tolower(word_forms)
}
# Positive / negative dictionaries. Each file is read twice and both results are
# concatenated: scan() splits it on whitespace (text after ';' is ignored), and
# readAndflattenSentiWS() returns the lower-cased individual word forms.
# NOTE(review): the scan() tokens look redundant next to the flattened list -- confirm.
pos.words <- c(
  scan(file = "SentiWS/positive-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
  readAndflattenSentiWS(filename = "SentiWS/positive-words.txt")
)
neg.words <- c(
  scan(file = "SentiWS/negative-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
  readAndflattenSentiWS(filename = "SentiWS/negative-words.txt")
)
# Dictionary-based sentiment score for each text.
#   sentences : character vector of texts
#   pos.words : character vector of positive terms
#   neg.words : character vector of negative terms
#   .progress : progress-bar name handed to plyr::laply() (default 'none')
# Returns a data frame with one row per text:
#   score = number of words found in pos.words minus number found in neg.words
#   text  = the unmodified input text
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
  # plyr and stringr are called through `::` instead of require(): attaching plyr
  # after dplyr masks dplyr verbs such as count() for the rest of the session, and
  # require() only warns when a package is missing.
  score_one <- function(sentence, pos.words, neg.words) {
    # strip punctuation, control characters and digits, then lower-case
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)
    # split at whitespace; str_split() returns a list, so flatten it
    words <- unlist(stringr::str_split(sentence, '\\s+'))
    # match() gives the dictionary position or NA; only hit / no hit is needed
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    # sum() counts TRUE as 1 and FALSE as 0
    sum(pos.matches) - sum(neg.matches)
  }
  scores <- plyr::laply(sentences, score_one, pos.words, neg.words, .progress = .progress)
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
# Bar chart of the per-document sentiment scores for one topic and one source.
#   scores.df : data frame as returned by score.sentiment(); its `score` column is plotted
#   topic     : topic name, appended to "Topic: " in the plot title
#   quelle    : source label, appended to "Sentiment-Analyse der " in the subtitle
#   fill      : fill colour of the bars
# Returns the ggplot object (auto-printed when called at top level of the chunk).
plot_sentiment_scores <- function(scores.df, topic, quelle, fill) {
  ggplot(scores.df) +
    geom_bar(aes(x = score), fill = fill) +
    labs(title = paste0("Topic: ", topic),
         subtitle = paste0("Sentiment-Analyse der ", quelle)) +
    theme_minimal()
}

# Topic: demonstrations
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_demo, "Demonstrationen", "Pressemeldungen", "blue")
plot_sentiment_scores(score_tw_demo, "Demonstrationen", "Tweets", "blue")

# Topic: drugs
# Fixed copy-paste error: these scores were previously computed from pm_demo / tw_demo,
# so the "Drogen" plots silently repeated the demonstration data.
# NOTE(review): assumes tw_drogen has a `tweet_text` column like tw_demo / tw_rass -- confirm.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_drogen, "Drogen", "Pressemeldungen", "darkgreen")
plot_sentiment_scores(score_tw_drogen, "Drogen", "Tweets", "darkgreen")

# Topic: racism
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
plot_sentiment_scores(score_pm_rass, "Rassismus", "Pressemeldungen", "purple")
plot_sentiment_scores(score_tw_rass, "Rassismus", "Tweets", "purple")
```
```{r}
# Print the R version and the attached/loaded packages used to render this report.
sessionInfo()
```

View file

@ -0,0 +1,13 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX

View file

@ -0,0 +1,131 @@
appdirs==1.4.4
argon2-cffi==20.1.0
async-generator==1.10
attrs==21.2.0
backcall==0.2.0
beautifulsoup4==4.9.3
bleach==3.3.0
blis==0.7.4
branca==0.4.2
bs4==0.0.1
catalogue==2.0.4
certifi==2020.12.5
cffi==1.14.5
chardet==4.0.0
click==7.1.2
cssselect==1.1.0
cycler==0.10.0
cymem==2.0.5
decorator==4.4.2
defusedxml==0.7.1
dill==0.3.3
docker==4.4.4
emoji==0.6.0
entrypoints==0.3
fake-useragent==0.1.11
filelock==3.0.12
folium==0.12.1
funcy==1.16
future==0.18.2
germansentiment==1.0.5
huggingface-hub==0.0.8
idna==2.10
ipykernel==5.5.5
ipython==7.23.1
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2
joblib==1.0.1
jsonpickle==2.0.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.12
jupyter-console==6.4.0
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
kiwisolver==1.3.1
loguru==0.5.3
lxml==4.6.3
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
mistune==0.8.4
multiprocess==0.70.11.1
murmurhash==1.0.5
nbclient==0.5.3
nbconvert==6.0.7
nbformat==5.1.3
nest-asyncio==1.5.1
networkx==2.5.1
nitter-scraper==0.5.0
notebook==6.4.0
numexpr==2.7.3
numpy==1.20.3
packaging==20.9
pandas==1.2.4
pandocfilters==1.4.3
parse==1.19.0
parso==0.8.2
pathy==0.5.2
pendulum==2.1.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.2.0
preshed==3.0.5
prometheus-client==0.10.1
prompt-toolkit==3.0.18
ptyprocess==0.7.0
pycparser==2.20
pydantic==1.7.4
pyee==8.1.0
Pygments==2.9.0
pyLDAvis==3.3.1
pyparsing==2.4.7
pyppeteer==0.2.5
pyquery==1.4.3
pyrsistent==0.17.3
python-dateutil==2.8.1
pytz==2021.1
pytzdata==2020.1
pyvis==0.1.9
pyzmq==22.0.3
qtconsole==5.1.0
QtPy==1.9.0
regex==2021.4.4
requests==2.25.1
requests-html==0.10.0
sacremoses==0.0.45
scikit-learn==0.24.2
scipy==1.6.3
seaborn==0.11.1
Send2Trash==1.5.0
six==1.16.0
sklearn==0.0
smart-open
soupsieve==2.2.1
spacy==3.0.6
spacy-legacy==3.0.5
spacymoji==3.0.1
srsly==2.4.1
terminado==0.10.0
testpath==0.5.0
thinc==8.0.3
threadpoolctl==2.1.0
tokenizers==0.10.2
torch==1.8.1
tornado==6.1
tqdm==4.60.0
traitlets==5.0.5
transformers==4.6.0
typer==0.3.2
typing-extensions==3.10.0.0
urllib3==1.26.4
w3lib==1.22.0
wasabi==0.8.2
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.0.0
websockets==8.1
widgetsnbextension==3.5.1