commit 8d3c8b3974: init

77 changed files with 682928 additions and 0 deletions
ergebnisse_hackathon_repo/team-16/Presse-vs.-Twitter.pdf (new binary file, not shown)
ergebnisse_hackathon_repo/team-16/README.md (new file, 20 lines)
@@ -0,0 +1,20 @@
# CopBird Group 16

## Tools

* [Python](https://www.python.org/) version >= 3.8
* [Wekan](https://wekan.github.io/) as the working platform, with one Wekan board per group plus shared overall boards
* [Matrix](https://matrix.org/) for communication

Jupyter Notebook uses kernels. To use an environment as a kernel, see this guide: https://queirozf.com/entries/jupyter-kernels-how-to-add-change-remove
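In practice the guide boils down to a single `ipykernel` call; a minimal sketch, run inside the activated environment (the kernel name `copbird-env` matches the kernelspec used by the notebooks in this repository):

```python
# Register the active environment as a Jupyter kernel.
# Equivalent to running "python -m ipykernel install ..." in a shell.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "ipykernel", "install", "--user",
     "--name", "copbird-env", "--display-name", "copbird-env"],
    check=True,
)
```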

## Data

The tweets cannot be made public, but the press releases and the sentiment word list can be found under data/.

## Results

The most important results are in the [Presse-vs.-Twitter](Presse-vs.-Twitter.pdf) PDF.
The notebooks contain additional details on the data extraction and analysis.
ergebnisse_hackathon_repo/team-16/data/.gitkeep (new empty file)
ergebnisse_hackathon_repo/team-16/data/SentiWS.zip (new binary file, not shown)
@@ -0,0 +1,164 @@
Polizei Account Name Typ Bundesland Stadt LAT LONG
bpol_11 Bundespolizei Spezialkräfte Bundespolizei - - - -
bpol_bepo Bundesbereitschaftspolizei Bundesbereitschaftspolizei - - - -
bpol_air_fra Bundespolizei Flughafen Frankfurt am Main Bundespolizei Hessen Frankfurt am Main 50.1109221 8.6821267
bpol_b Bundespolizei Berlin Bundespolizei Berlin Berlin 52.520007 13.404954
bpol_b_einsatz Bundespolizei Berlin Einsatz Bundespolizei Berlin Berlin 52.520007 13.404954
bpol_bw Bundespolizei Baden-Württemberg Bundespolizei Baden-Württemberg Böblingen 48.681331 9.008830
bpol_by Bundespolizei Bayern Bundespolizei Bayern München 48.135125 11.581981
bpol_koblenz Bundespolizei Koblenz Bundespolizei Rheinland-Pfalz Koblenz 50.356943 7.588996
bpol_kueste Bundespolizei Küste Bundespolizei Schleswig-Holstein Bad Bramstedt 53.919582 9.882173
bpol_nord Bundespolizei Nord Bundespolizei Niedersachsen Hannover 52.3744779 9.7385532
bpol_nrw Bundespolizei NRW Bundespolizei Nordrhein-Westfalen Sankt Augustin 50.769478 7.187579
bpol_pir Bundespolizei Mitteldeutschland Bundespolizei Sachsen-Anhalt Magdeburg 52.120533 11.627624
bremenpolizei Polizei Bremen Polizei Bremen Bremen 53.079296 8.801694
lkabawue Landeskriminalamt BW Landeskriminalamt Baden-Württemberg Stuttgart 48.775846 9.182932
lka_bayern Bayerisches Landeskriminalamt Landeskriminalamt Bayern München 48.135125 11.581981
lka_hessen Hessisches Landeskriminalamt Landeskriminalamt Hessen Wiesbaden 50.0820384 8.2416556
lka_rlp Landeskriminalamt Rheinland-Pfalz Landeskriminalamt Rheinland-Pfalz Mainz 49.992862 8.247253
pol_grafschaft Polizei Grf Bentheim Polizei Niedersachsen Nordhorn 52.429580 7.068571
polizeiaalen Polizei Aalen Polizei Baden-Württemberg Aalen 48.836689 10.097116
polizei_aur_wtm Polizei Aurich / WTM Polizei Niedersachsen Aurich 53.470839 7.484831
polizei_badn Polizei Bad Nenndorf Polizei Niedersachsen Bad Nenndorf 52.336191 9.374258
polizeibayern Polizei Bayern Polizei Bayern München 48.135125 11.581981
polizeibb Polizei Brandenburg Polizei Brandenburg Potsdam 52.390569 13.064473
polizeibb_e PolizeiBrandenburg_E Polizei Brandenburg Potsdam 52.390569 13.064473
polizei_bbg Polizei Bückeburg Polizei Niedersachsen Bückeburg 52.259276 9.052123
polizeiberlin Polizei Berlin Polizei Berlin Berlin 52.520007 13.404954
polizeiberlin_e Polizei Berlin Einsatz Polizei Berlin Berlin 52.520007 13.404954
polizeibhv Polizei Bremerhaven Polizei Bremen Bremerhaven 53.539584 8.580942
polizei_bs Polizei Braunschweig Polizei Niedersachsen Braunschweig 52.268874 10.526770
polizei_ce Polizei Celle Polizei Niedersachsen Celle 52.617596 10.062852
polizei_clp_vec Polizei Cloppenburg/Vechta Polizei Niedersachsen Cloppenburg 52.844198 8.053016
polizei_cux Polizei Cuxhaven Polizei Niedersachsen Cuxhaven 53.859336 8.687906
polizei_del Polizei Delmenhorst/Oldenburg-Land/Wesermarsch Polizei Niedersachsen Delmenhorst 53.052189 8.635593
polizei_dero Polizei Dessau-Roßlau Polizei Sachsen-Anhalt Dessau-Roßlau 51.842828 12.230393
polizei_dh Polizei Diepholz Polizei Niedersachsen Diepholz 52.605646 8.370788
polizei_el Polizei Emsland Polizei Niedersachsen Lingen 52.540308 7.329286
polizei_ffm Polizei Frankfurt Polizei Hessen Frankfurt am Main 50.110922 8.682127
polizeifr Polizei Freiburg Polizei Baden-Württemberg Freiburg 47.999008 7.842104
polizei_ft Polizei Frankenthal Polizei Rheinland-Pfalz Frankenthal 49.533333 8.350000
polizei_ger Polizei Germersheim Polizei Rheinland-Pfalz Germersheim 49.214024 8.366815
polizei_gf Polizei Gifhorn Polizei Niedersachsen Gifhorn 52.480909 10.550783
polizei_goe Polizei Göttingen Polizei Niedersachsen Göttingen 51.541280 9.915804
polizei_gs Polizei Goslar Polizei Niedersachsen Goslar 51.905953 10.428996
polizei_h Polizei Hannover Polizei Niedersachsen Hannover 52.3744779 9.7385532
polizei_hal Polizei Halle (Saale) Polizei Sachsen-Anhalt Halle (Saale) 51.4825041 11.9705452
polizeihamburg Polizei Hamburg Polizei Hamburg Hamburg 53.550341 10.000654
polizei_hi Polizei Hildesheim Polizei Niedersachsen Hildesheim 52.1521636 9.9513046
polizei_hk Polizei Heidekreis Polizei Niedersachsen Soltau 52.9859666 9.8433909
polizei_hm Polizei Hameln Polizei Niedersachsen Hameln-Pyrmont 52.0895789 9.3875409
polizeihn Polizei Heilbronn Polizei Baden-Württemberg Heilbronn 49.142291 9.218655
polizei_hol Polizei Holzminden Polizei Niedersachsen Holzminden 51.828835 9.4466591
polizei_hst Polizei Stralsund Polizei Mecklenburg-Vorpommern Stralsund 54.3096314 13.0820846
polizei_ka Polizei Karlsruhe Polizei Baden-Württemberg Karlsruhe 49.0068705 8.4034195
polizei_kl Polizei Kaiserslautern Polizei Rheinland-Pfalz Kaiserslautern 49.4432174 7.7689951
polizei_ko Polizei Koblenz Polizei Rheinland-Pfalz Koblenz 50.3533278 7.5943951
polizeikonstanz Polizei Konstanz Polizei Baden-Württemberg Konstanz 47.659216 9.1750718
polizeilb Polizei Ludwigsburg Polizei Baden-Württemberg Ludwigsburg 48.8953937 9.1895147
polizei_ler_emd Polizei Leer / Emden Polizei Niedersachsen Leer 53.2327625 7.4577265
polizei_lg Polizei Lüneburg Polizei Niedersachsen Lüneburg 53.248706 10.407855
polizeimainz Polizei Mainz Polizei Rheinland-Pfalz Mainz 50.0012314 8.2762513
polizeimannheim Polizei Mannheim Polizei Baden-Württemberg Mannheim 49.4892913 8.4673098
polizei_md Polizei Magdeburg Polizei Sachsen-Anhalt Magdeburg 52.1315889 11.6399609
polizeimfr Polizei Mittelfranken Polizei Bayern Nürnberg 49.453872 11.077298
polizei_mh Polizei Mittelhessen Polizei Hessen Gießen 50.5862066 8.6742306
polizei_mse Polizei Mecklenburgische Seenplatte Polizei Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
polizeimuenchen Polizei München Polizei Bayern München 48.135125 11.581981
polizeinb Polizei Niederbayern Polizei Bayern Straubing 48.8819801 12.569716
polizei_nbg Polizei Nienburg Polizei Niedersachsen Nienburg (Weser) 52.6487602 9.2578105
polizeineustadt Polizei Neustadt Polizei Rheinland-Pfalz Neustadt an der Weinstraße 49.3539802 8.1350021
polizei_nh Polizei Nordhessen Polizei Hessen Kassel 51.3154546 9.4924096
polizeini_lka LKA Niedersachsen Landeskriminalamt Niedersachsen Hannover 52.3744779 9.7385532
polizei_nom Polizei Northeim Polizei Niedersachsen Northeim 51.705401 9.9972782
polizei_nrw_ac Polizei NRW AC Polizei Nordrhein-Westfalen Aachen 50.776351 6.083862
polizei_nrw_bi Polizei NRW BI Polizei Nordrhein-Westfalen Bielefeld 52.0191005 8.531007
polizei_nrw_bn Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 7.10066
polizei_nrw_bo Polizei NRW BO Polizei Nordrhein-Westfalen Bochum 51.4818111 7.2196635
polizei_nrw_bor Polizei NRW BOR Polizei Nordrhein-Westfalen Borken 51.8443183 6.8582247
polizei_nrw_coe Polizei NRW COE Polizei Nordrhein-Westfalen Coesfeld 51.9458943 7.1691108
polizei_nrw_d Polizei NRW D Polizei Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
polizei_nrw_dn Polizei NRW DN Polizei Nordrhein-Westfalen Düren 50.8031684 6.4820806
polizei_nrw_do Polizei NRW DO Polizei Nordrhein-Westfalen Dortmund 51.5142273 7.4652789
polizei_nrw_du Polizei NRW DU Polizei Nordrhein-Westfalen Duisburg 51.434999 6.759562
polizei_nrw_e Polizei NRW E Polizei Nordrhein-Westfalen Essen 51.4582235 7.0158171
polizei_nrw_en Polizei NRW EN Polizei Nordrhein-Westfalen Ennepe-Ruhr-Kreis 51.3481444 7.3351844
polizei_nrw_eu Polizei NRW EU Polizei Nordrhein-Westfalen Euskirchen 50.6612623 6.7871219
polizei_nrw_ge Polizei NRW GE Polizei Nordrhein-Westfalen Gelsenkirchen 51.5110321 7.0960124
polizei_nrw_gm Polizei NRW GM Polizei Nordrhein-Westfalen Gummersbach 51.0277658 7.5630545
polizei_nrw_gt Polizei NRW GT Polizei Nordrhein-Westfalen Gütersloh 51.9063997 8.3782078
polizei_nrw_ha Polizei NRW HA Polizei Nordrhein-Westfalen Hagen 51.3582945 7.473296
polizei_nrw_ham Polizei NRW HAM Polizei Nordrhein-Westfalen Hamm 51.6804093 7.815197
polizei_nrw_hf Polizei NRW HF Polizei Nordrhein-Westfalen Herford 52.1152245 8.6711118
polizei_nrw_hs Polizei NRW HS Polizei Nordrhein-Westfalen Heinsberg 51.0654268 6.0984461
polizei_nrw_hsk Polizei NRW HSK Polizei Nordrhein-Westfalen Hochsauerlandkreis 51.3208247 8.2684925
polizei_nrw_hx Polizei NRW HX Polizei Nordrhein-Westfalen Höxter 51.7747369 9.3816877
polizei_nrw_k Polizei NRW K Polizei Nordrhein-Westfalen Köln 50.938361 6.959974
polizei_nrw_kle Polizei NRW KLE Polizei Nordrhein-Westfalen Kleve 51.7854839 6.1313674
polizei_nrw_kr Polizei NRW KR Polizei Nordrhein-Westfalen Krefeld 51.3331205 6.5623343
polizei_nrw_lip Polizei NRW LIP Polizei Nordrhein-Westfalen Detmold 51.936284 8.8791526
polizei_nrw_lka Polizei NRW LKA Landeskriminalamt Nordrhein-Westfalen Düsseldorf 51.2254018 6.7763137
polizei_nrw_me polizei_nrw_me Polizei Nordrhein-Westfalen Mettmann 51.2527778 6.9777778
polizei_nrw_mg Polizei NRW MG Polizei Nordrhein-Westfalen Mönchengladbach 51.1946983 6.4353641
polizei_nrw_mi Polizei NRW MI Polizei Nordrhein-Westfalen Minden 52.2881045 8.9168852
polizei_nrw_mk Polizei NRW MK Polizei Nordrhein-Westfalen Märkischer Kreis 51.2734857 7.7274266
polizei_nrw_ms Polizei NRW MS Polizei Nordrhein-Westfalen Münster 51.9625101 7.6251879
polizei_nrw_ob Polizei NRW OB Polizei Nordrhein-Westfalen Oberhausen 51.4696137 6.8514435
polizei_nrw_oe Polizei NRW OE Polizei Nordrhein-Westfalen Olpe 51.0297603 7.8424193
polizei_nrw_pb Polizei NRW PB Polizei Nordrhein-Westfalen Paderborn 51.7189596 8.7648698
polizei_nrw_rbk Polizei NRW RBK Polizei Nordrhein-Westfalen Rheinisch-Bergischer-Kreis 51.0139774 7.1715584
polizei_nrw_re Polizei NRW RE Polizei Nordrhein-Westfalen Recklinghausen 51.6143815 7.1978546
polizei_nrw_rek Polizei NRW REK Polizei Nordrhein-Westfalen Rhein-Erft-Kreis 50.90334 6.763334
polizei_nrw_rkn Polizei NRW RKN Polizei Nordrhein-Westfalen Rhein-Kreis Neuss 51.1758799 6.6600606
polizei_nrw_si Polizei NRW SI Polizei Nordrhein-Westfalen Siegen-Wittgenstein 50.97444 8.23972
polizei_nrw_so Polizei NRW SO Polizei Nordrhein-Westfalen Soest 51.5725501 8.1061259
polizei_nrw_st Polizei NRW ST Polizei Nordrhein-Westfalen Steinfurt 52.1294289 7.3903454
polizei_nrw_su Polizei NRW SU Polizei Nordrhein-Westfalen Rhein-Sieg-Kreis 50.7527986 7.3813038
polizei_nrw_un Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 7.689014
polizei_nrw_vie Polizei NRW VIE Polizei Nordrhein-Westfalen Viersen 51.2562118 6.3905476
polizei_nrw_w Polizei NRW W Polizei Nordrhein-Westfalen Wuppertal 51.264018 7.1780374
polizei_nrw_waf Polizei NRW WAF Polizei Nordrhein-Westfalen Warendorf 51.9532449 7.9912335
polizei_nrw_wes Polizei NRW WES Polizei Nordrhein-Westfalen Wesel 51.6576909 6.617087
polizeiobn Polizei Oberbayern N Polizei Bayern Ingolstadt 48.7630165 11.4250395
polizeiobs PolizeiOberbayernSüd Polizei Bayern Rosenheim 47.8539273 12.127262
polizeiofr Polizei Oberfranken Polizei Bayern Oberfranken 50.0553084 11.5455233
polizeiog Polizei Offenburg Polizei Baden-Württemberg Offenburg 48.4716556 7.944378
polizei_oh Polizei Osthessen Polizei Hessen Fulda 50.5521486 9.676511
polizei_oha Polizei Osterode Polizei Niedersachsen Osterode am Harz 51.72784 10.2508204
polizei_ol Polizei Oldenburg-Stadt/Ammerland Polizei Niedersachsen Oldenburg 53.1389753 8.2146017
polizeiopf Polizei Oberpfalz Polizei Bayern Regensburg 49.0195333 12.0974869
polizei_os Polizei Osnabrück Polizei Niedersachsen Osnabrück 52.266837 8.049741
polizei_pf Polizei Pforzheim Polizei Baden-Württemberg Pforzheim 48.8908846 8.7029532
polizei_pp_nb Polizeipräsidium NB Polizeipräsidium Mecklenburg-Vorpommern Neubrandenburg 53.5574458 13.2602781
polizei_pp_ros Polizeipräsidium Rostock Polizeipräsidium Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
polizei_ps Polizei Pirmasens Polizei Rheinland-Pfalz Pirmasens 49.1996961 7.6087847
polizei_rostock Polizei Rostock Polizei Mecklenburg-Vorpommern Rostock 54.0924445 12.1286127
polizei_row Polizei Rotenburg Polizei Niedersachsen Rotenburg (Wümme) 53.2520924 9.3151133
polizeirt Polizei Reutlingen Polizei Baden-Württemberg Reutlingen 48.4919508 9.2114144
polizeirv Polizei Ravensburg Polizei Baden-Württemberg Ravensburg 47.7811014 9.612468
polizeisaarland Polizei Saarland Polizei Saarland Saarbrücken 49.234362 6.996379
polizeisachsen Polizei Sachsen Polizei Sachsen Dresden 51.0493286 13.7381437
polizei_sdl Polizei Stendal Polizei Sachsen-Anhalt Stendal 52.6050782 11.8594279
polizei_sn Polizei Schwerin Polizei Mecklenburg-Vorpommern Schwerin 53.6288297 11.4148038
polizei_soh Polizei Südosthessen Polizei Hessen Offenbach am Main 50.1055002 8.7610698
polizei_std Polizei Stade Polizei Niedersachsen Stade 53.599794 9.475438
polizei_sth Polizei Stadthagen Polizei Niedersachsen Stadthagen 52.3289688 9.2053496
polizei_suedhe Polizei Südhessen Polizei Hessen Darmstadt 49.872775 8.651177
polizeiswn Polizei Schwaben Nord Polizei Bayern Augsburg 48.3668041 10.8986971
polizeisws Polizei Schwaben S/W Polizei Bayern Kempten (Allgäu) 47.7267063 10.3168835
polizei_sz Polizei SZ / PE / WF Polizei Niedersachsen Salzgitter 52.1503721 10.3593147
polizei_thuer Polizei Thüringen Polizei Thüringen Erfurt 50.9777974 11.0287364
polizeitrier Polizei Trier Polizei Rheinland-Pfalz Trier 49.7596208 6.6441878
polizeiufr Polizei Unterfranken Polizei Bayern Würzburg 49.79245 9.932966
polizeiul Polizei Ulm Polizei Baden-Württemberg Ulm 48.3974003 9.9934336
polizei_ver_ohz Polizei Verden/Osterholz Polizei Niedersachsen Verden 52.922341 9.228153
polizeivg Polizei Vorpommern-Greifswald Polizei Mecklenburg-Vorpommern Anklam 53.8560526 13.688091
polizei_wh Polizei Westhessen Polizei Hessen Wiesbaden 50.0820384 8.2416556
polizei_whv_fri Polizei Wilhelmshaven/Friesland Polizei Niedersachsen Wilhelmshaven 53.5278793 8.106301
polizeiwittlich Polizei Wittlich Polizei Rheinland-Pfalz Wittlich 49.9850353 6.88844
polizei_wl Polizei LK Harburg Polizei Niedersachsen Harburg 53.3172237 9.9084936
polizei_wob Polizei Wolfsburg Polizei Niedersachsen Wolfsburg 52.4205588 10.7861682
polizei_zpd_ni Polizei ZPD NI Polizei Niedersachsen Hannover 52.3744779 9.7385532
pp_rheinpfalz Polizei Rheinpfalz Polizei Rheinland-Pfalz Ludwigshafen am Rhein 49.4704113 8.4381568
pp_stuttgart Polizei Stuttgart Polizei Baden-Württemberg Stuttgart 48.7784485 9.1800132
sh_polizei Polizei SH Polizei Schleswig-Holstein Kiel 54.3227085 10.135555
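The account list is easiest to work with as a DataFrame; a minimal sketch, assuming the file was committed as a tab-separated table (its exact filename is not visible in this diff, so `data/polizei_accounts.csv` below is a hypothetical placeholder):

```python
import pandas as pd

# Hypothetical path: the real filename of the 164-line account table
# is suppressed in the diff view above.
accounts = pd.read_csv("data/polizei_accounts.csv", sep="\t")

# Accounts per federal state, e.g. as input for map visualisations
print(accounts["Bundesland"].value_counts())
```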
ergebnisse_hackathon_repo/team-16/keyword_search.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import pandas as pd
import spacy
from string import punctuation
from tqdm import tqdm

tqdm.pandas()


tw_tweets = pd.read_csv(r'data\copbird_table_tweet_ext_state.csv')

nlp = spacy.load('de_core_news_lg')


def clean_tweet(txt):
    """Lemmatize a tweet, dropping punctuation and stop words."""
    doc = nlp(txt)
    token_list = []
    for token in doc:
        if (token.text not in punctuation) and (token.is_stop is False):
            token_list.append(token.lemma_)
    return ' '.join(token_list)


def get_topics_by_str_lst(topic, df, col_name):
    """Return all rows whose text column matches any of the topic substrings."""
    df_topiced = df[df[col_name].str.contains('|'.join(topic))]
    return df_topiced


if __name__ == '__main__':
    topic_1 = ['demonstr', 'kundgeb']
    topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']
    topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']
    topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']

    df_pm = pd.read_csv(r'data\2020-12_2021-05_presseportal.csv', na_filter=False)
    df_pm_col = 'content'

    print(get_topics_by_str_lst(topic=topic_3, df=df_pm, col_name=df_pm_col).to_markdown())
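A quick way to sanity-check the matcher on a toy frame (hypothetical data; note that `str.contains` is case-sensitive unless `case=False` is passed, so capitalized forms like "Demonstration" would otherwise be missed):

```python
import pandas as pd

# Hypothetical two-row corpus for illustration
toy = pd.DataFrame({'content': [
    'Demonstration in der Innenstadt aufgelöst',
    'Fahrraddiebstahl am Bahnhof gemeldet',
]})

# Match any stem from topic_1; case=False also catches "Demonstration"
hits = toy[toy['content'].str.contains('|'.join(['demonstr', 'kundgeb']), case=False)]
print(hits)
```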
@@ -0,0 +1,394 @@
```python
import logging  # needed by the logging.debug()/logging.info() calls below

import matplotlib.pyplot as plt
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import spacy
from multiprocess import Pool
```

Preprocessing functions:

```python
def filterSentencesByMinWordCount(text, minWordCount):
    # Keep only sentences with at least minWordCount tokens
    sentenceList = []
    doc = nlp(text)
    for sent in doc.sents:
        wordList = []
        for word in sent:
            wordList.append(word)
        if len(wordList) >= minWordCount:
            sentenceList.append(sent.text.rstrip())
    return sentenceList
```

```python
def extractSentences(document):
    logging.debug('Extracting Sentences')
    # extractBodyContent is not defined in the cells shown here
    text = extractBodyContent(document)
    sentenceList = filterSentencesByMinWordCount(text, 4)
    return sentenceList
```

```python
def tokenizeSentence(doc):
    logging.debug('Tokenizing')
    tokenList = []
    for token in doc:
        childrenList = []
        for child in token.children:
            childToken = ScToken(child.text,
                                 child.lemma_,
                                 child.pos_, str(spacy.explain(child.pos_)),
                                 child.tag_, str(spacy.explain(child.tag_)),
                                 child.dep_, str(spacy.explain(child.dep_)),
                                 child.shape_, child.is_alpha, child.is_stop)
            childrenList.append(childToken)

        scToken = ScToken(token.text,
                          token.lemma_,
                          token.pos_, str(spacy.explain(token.pos_)),
                          token.tag_, str(spacy.explain(token.tag_)),
                          token.dep_, str(spacy.explain(token.dep_)),
                          token.shape_, token.is_alpha, token.is_stop,
                          childrenList)
        tokenList.append(scToken)
    return tokenList
```

```python
def chunkSentence(doc):
    logging.debug('Chunking')
    chunkList = []
    for chunk in doc.noun_chunks:
        scChunk = ScChunk(chunk.text, chunk.root.text,
                          chunk.root.dep_, chunk.root.head.text)
        chunkList.append(scChunk)
    return chunkList
```

```python
def findEntitiesInSentence(doc):
    logging.debug('Extracting Named Entities')
    entityList = []
    for ent in doc.ents:
        entity = ScEntity(ent.text, ent.start_char, ent.end_char,
                          ent.label_, str(spacy.explain(ent.label_)))
        entityList.append(entity)
    return entityList
```

```python
def fillSentences(document):
    logging.info(
        'Building Sentences (Tokenizing, Chunking, Named Entity Recognition)')
    sentenceList = []
    sentences = extractSentences(document)
    for i, sentence in enumerate(sentences):
        doc = nlp(sentence)
        id = i
        tokens = tokenizeSentence(doc)
        chunks = chunkSentence(doc)
        entities = findEntitiesInSentence(doc)
        scSentence = ScSentence(id, sentence, tokens, chunks, entities)
        sentenceList.append(scSentence)

    return sentenceList
```

```python
def fillSentencesAsOneString(sentences):
    sentencesAsOneString = str()
    for sentence in sentences:
        sentencesAsOneString += sentence.text
    return sentencesAsOneString
```

```python
def removeStopwords(text):
    doc = nlp(text)
    tokens = tokenizeSentence(doc)
    chunksNoStopwords = [
        t.text for t in tokens if (not t.isStopword)]
    return " ".join(chunksNoStopwords)
```

```python
def numberOfStopwords(tokens: [ScToken]):
    count = 0
    for t in tokens:
        if t.isStopword:
            count += 1
    return count
```

Output: `NameError: name 'ScToken' is not defined` (the `ScToken`, `ScChunk`, `ScEntity` and `ScSentence` helper classes are not defined in any cell of this notebook).

## Analysis

```python
tweet_csv = '../data/copbird_table_tweet.csv'
```

```python
NUM_TOPICS = 10
NUM_FEATURES = 1000
NUM_TOP_WORDS = 25
```

```python
def get_tweets(path, limit=None):
    df_csv = pd.read_csv(path, nrows=limit, parse_dates=['created_at'],
                         encoding='utf-8-sig')

    df_csv.drop(columns=['created_at', 'like_count', 'retweet_count',
                         'reply_count', 'quote_count'], inplace=True)

    nlp = spacy.load("de_core_news_lg")
    nlp.Defaults.stop_words |= {"&", "amp"}
    nlp.add_pipe('emoji', first=True)  # 'emoji' component from the spacymoji package
    return list(
        nlp.pipe(df_csv['tweet_text'],
                 disable=["tok2vec", "tagger", "parser", "attribute_ruler"],
                 n_process=-1))
```
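The imports and the `NUM_TOPICS` / `NUM_FEATURES` / `NUM_TOP_WORDS` constants point at an sklearn LDA topic model, but the fitting step itself is not part of this diff. A minimal sketch of how these pieces would typically be combined (the `cleaned_tweets` list is a hypothetical stand-in for the preprocessed tweet texts):

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 10
NUM_FEATURES = 1000
NUM_TOP_WORDS = 25

# Hypothetical preprocessed corpus; in the notebook this would come from get_tweets()
cleaned_tweets = [
    "polizei einsatz demonstration innenstadt",
    "fahrrad diebstahl bahnhof zeuge",
]

vectorizer = CountVectorizer(max_features=NUM_FEATURES)
dtm = vectorizer.fit_transform(cleaned_tweets)

lda = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=0)
lda.fit(dtm)

# Top words per topic (use get_feature_names() on scikit-learn < 1.0)
words = vectorizer.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = topic.argsort()[-NUM_TOP_WORDS:][::-1]
    print(k, [words[i] for i in top])
```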
(File diff suppressed because it is too large.)

@@ -0,0 +1,490 @@
# Interface Presseportal

Presseportal provides a platform from which the press releases of various institutions (police, fire brigade, ...) can be retrieved via GET requests, for given time ranges and regions. There is also an API for this.

Example URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`

Since a large number of press releases is requested and each request takes quite a while, the querying has to be optimized:

```python
import requests
import calendar
import time
import os
import csv

from tqdm.notebook import tqdm
from datetime import datetime
from bs4 import BeautifulSoup
```

To store press releases in a sensible way, they are represented as a class:

```python
class Pressemitteilung:
    def __init__(self, article_id, timestamp, location, text, bundesland):
        self.article_id = article_id
        self.timestamp = timestamp
        self.location = location
        self.text = text
        self.bundesland = bundesland

    def __str__(self):
        return f"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}"

    def to_row(self):
        return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]
```

**Constants and paths**

```python
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    )
}
```

```python
DATA_FOLDER = os.path.join("..", "data")
```

```python
BUNDESLAENDER = [
    "baden-wuerttemberg",
    "bayern",
    "berlin-brandenburg",
    "bremen",
    "hamburg",
    "hessen",
    "mecklenburg-vorpommern",
    "niedersachsen",
    "nordrhein-westfalen",
    "rheinland-pfalz",
    "saarland",
    "sachsen",
    "sachsen-anhalt",
    "schleswig-holstein",
    "thueringen",
]
```

```python
def requests_get(request):
    return requests.get(request, headers=REQUEST_HEADERS)
```

```python
def extract_response(response, bundesland=None):
    """Extract all press releases from the response of a request.

    Args:
        response (:obj:`Response`)
        bundesland (:obj:`str`): Can optionally be passed if it was relevant to the search. Default = None

    Returns:
        list of :obj:`Pressemitteilung`
    """

    mitteilungen = []

    soup = BeautifulSoup(response.content, 'html.parser')
    for article in soup.find_all('article'):
        data_url = article['data-url']
        article_id = '-'.join(data_url.split('/')[-2:])
        meta = article.find('div')

        timestamp_str = meta.find(class_="date")

        if timestamp_str is not None:
            timestamp_str = timestamp_str.text
            timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y – %H:%M')
        else:
            timestamp = None

        location_str = meta.find(class_="news-topic")
        location_str = location_str.text if location_str is not None else None

        p_texts = article.findAll('p')
        if len(p_texts) > 1:
            text = p_texts[1].text
        else:
            text = ''

        mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))

    return mitteilungen
```

```python
def create_get_request(*, site=1, location=None, start_date=None, end_date=None):
    """Simulation of an API: build a URL from parameters.

    Args:
        site (int, default=1): Current results page; encoded in the URL in steps of 30
        location (:obj:`str`, default=None): Federal state or city
        start_date (:obj:`str`, default=None)
        end_date (:obj:`str`, default=None)
    Returns:
        str: URL
    """
    url = "https://www.presseportal.de/blaulicht/d/polizei"

    if location is not None:
        url += f"/l/{location}"

    if site > 1:
        url += f"/{site*30}"

    if start_date is not None or end_date is not None:
        url += "?"

    if start_date is not None:
        url += f"startDate={start_date}"

        if end_date is not None:
            url += "&"

    if end_date is not None:
        url += f"endDate={end_date}"

    return url
```

## Example: Hamburg

```python
url = create_get_request(location="hamburg", site=3, start_date="2021-01-13", end_date="2021-03-20")
url
```

Output: `'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'`

```python
for mitteilung in extract_response(requests_get(url))[:5]:
    print(mitteilung)
```

Output:

```
[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15
[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,
[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25
[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34
[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17
```

## Efficient ingestion

To extract the data sensibly without firing off too many requests at once, the program runs synchronously with pauses (1 s per request). The main function collects all police press releases for a given day and sorts them by federal state or city.

```python
def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):
    """Collect all releases for one federal state on one specific day."""

    meldungen = []
    site = 1

    start_date = datetime(year, month, day).strftime("%Y-%m-%d")
    end_date = datetime(year, month, day).strftime("%Y-%m-%d")
    request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)

    new_meldungen = extract_response(requests_get(request), bundesland=bundesland)
    meldungen.extend(new_meldungen)

    pbar = tqdm(desc=bundesland)
    while len(new_meldungen) != 0:
        time.sleep(1)
        site += 1

        request = create_get_request(
            site=site, location=bundesland, start_date=start_date, end_date=end_date,
        )

        new_meldungen = extract_response(requests_get(request), bundesland=bundesland)
        meldungen.extend(new_meldungen)
        pbar.update(1)
    pbar.close()

    return meldungen
```

```python
def get_meldungen_for_date(year, month, day):
    """Extract all releases for one day.

    Args:
        year (int): Year
        month (int): Month
        day (int): Day
    """

    meldungen_dict = {}

    for bundesland in BUNDESLAENDER:
        meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)
        meldungen_dict[bundesland] = meldungen

    return meldungen_dict
```

## Storing the data in CSV files

For sensible storage, all data for one day is written to exactly one CSV file. These files can afterwards be bundled (manually) into a ZIP per month.

```python
def store_meldungen_in_csv(year, month, day):
    """Store all releases for one date in a CSV file named after the date."""

    filename = f"{year}-{month}-{day}_presseportal.csv"
    path = os.path.join(DATA_FOLDER, filename)
    meldungen_per_bundesland = get_meldungen_for_date(year, month, day)

    with open(path, 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])

        for bundesland, meldungen in meldungen_per_bundesland.items():
            for meldung in meldungen:
                writer.writerow(meldung.to_row())

    print(f"File '{filename}' created")
```

```python
def store_month(year, month):
    month_end_day = calendar.monthrange(year, month)[1]

    for i in range(0, month_end_day):
        store_meldungen_in_csv(year, month, i + 1)
```

## Evaluation: how many entries per federal state?

For further visualisation, and to check that the algorithm works correctly, the press releases of all federal states are counted here:

```python
counter = {}

for filename in os.listdir('../data/'):
    if filename.endswith("_presseportal.csv"):
        path = '../data/' + filename

        with open(path, 'r', encoding='UTF8') as f_in:
            reader = csv.reader(f_in)
            next(reader)  # skip the header row
            for row in reader:
                bundesland = row[3]
                if bundesland not in counter:
                    counter[bundesland] = 1
                else:
                    counter[bundesland] += 1
```
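A whole month is then fetched with a single call; a usage sketch (the specific month is an assumption, chosen from the 2020-12 to 2021-05 range of the committed press-release data):

```python
# Download and store one CSV per day for May 2021
store_month(2021, 5)
```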
(File diff suppressed because it is too large.)
(File diff suppressed because one or more lines are too long.)

ergebnisse_hackathon_repo/team-16/notebooks/keywords-tweets.ipynb (new file, 1058 lines)
(File diff suppressed because it is too large.)
|
|
@ -0,0 +1,490 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cce66876",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Interface Presseportal"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f12d7022",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Das Presseportal bietet eine Platform, bei der mittels GET-requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...), in bestimmten Zeiträumen in gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b07aef9f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "258338d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Da eine große Menge an Tweets angefragt werden und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b07fac3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import calendar\n",
|
||||
"import time\n",
|
||||
"import os\n",
|
||||
"import csv\n",
|
||||
"\n",
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"from datetime import datetime\n",
|
||||
"from bs4 import BeautifulSoup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0dfce15a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "6c0b30a8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Pressemitteilung:\n",
|
||||
" def __init__(self, article_id, timestamp, location, text, bundesland):\n",
|
||||
" self.article_id = article_id\n",
|
||||
" self.timestamp = timestamp\n",
|
||||
" self.location = location\n",
|
||||
" self.text = text\n",
|
||||
" self.bundesland=bundesland\n",
|
||||
" \n",
|
||||
" def __str__(self):\n",
|
||||
" return f\"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}\"\n",
|
||||
" \n",
|
||||
" def to_row(self):\n",
|
||||
" return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "63cceebe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Konstanten und Pfade**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8bcc877f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"REQUEST_HEADERS = {\n",
|
||||
" \"User-Agent\": (\n",
|
||||
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 \"\n",
|
||||
" \"(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36\"\n",
|
||||
" )\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c637ac38",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATA_FOLDER = os.path.join(\"..\", \"data\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f094dee0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BUNDESLAENDER = [\n",
|
||||
" \"baden-wuerttemberg\",\n",
|
||||
" \"bayern\",\n",
|
||||
" \"berlin-brandenburg\",\n",
|
||||
" \"bremen\",\n",
|
||||
" \"hamburg\",\n",
|
||||
" \"hessen\",\n",
|
||||
" \"mecklenburg-vorpommern\",\n",
|
||||
" \"niedersachsen\",\n",
|
||||
" \"nordrhein-westfalen\",\n",
|
||||
" \"rheinland-pfalz\",\n",
|
||||
" \"saarland\",\n",
|
||||
" \"sachsen\",\n",
|
||||
" \"sachsen-anhalt\",\n",
|
||||
" \"schleswig-holstein\",\n",
|
||||
" \"thueringen\",\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "84632391",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def requests_get(request):\n",
|
||||
" return requests.get(request, headers=REQUEST_HEADERS)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "1af0bdbd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_response(response, bundesland=None):\n",
|
||||
" \"\"\"Extrahiere aus der Response einer Request alle Pressemitteilungen\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" response (:obj:`Response`)\n",
|
||||
" bundesland (:obj:`str`): Kann mit angegeben, falls es in der Suche relevant war. Default = None\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" list of :obj:`Pressemitteilung`\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" mitteilungen = []\n",
|
||||
" \n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" for article in soup.find_all('article'):\n",
|
||||
" data_url = article['data-url']\n",
|
||||
" article_id = '-'.join(article['data-url'].split('/')[-2:])\n",
|
||||
" meta = article.find('div')\n",
|
||||
" \n",
|
||||
" timestamp_str = meta.find(class_=\"date\")\n",
|
||||
" \n",
|
||||
" if timestamp_str is not None:\n",
|
||||
" timestamp_str = timestamp_str.text\n",
|
||||
" timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y – %H:%M')\n",
|
||||
" else:\n",
|
||||
" timestamp = None\n",
|
||||
" \n",
|
||||
" location_str = meta.find(class_=\"news-topic\")\n",
|
||||
" location_str = location_str.text if location_str is not None else None\n",
|
||||
" \n",
|
||||
" p_texts = article.findAll('p')\n",
|
||||
" if len(p_texts) > 1:\n",
|
||||
" text = p_texts[1].text\n",
|
||||
" else:\n",
|
||||
" text = ''\n",
|
||||
" \n",
|
||||
" mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))\n",
|
||||
" \n",
|
||||
" return mitteilungen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "c62c06c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_get_request(*, site=1, location=None, start_date=None, end_date=None):\n",
|
||||
" \"\"\"Simulation einer API: Erzeuge aus Parametern eine URL\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" site (int, default=1): Aktuelle Seite, auf der man sich befinden soll. Ist in der URL in 30er Schritten angegeben\n",
|
||||
" location (:obj:`str`, default=None): Bundesland bzw. Stadt\n",
|
||||
" start_date (:obj:`str`, default=None)\n",
|
||||
" end_date (:obj:`str`, default=None)\n",
|
||||
" Returns:\n",
|
||||
" str: URL\n",
|
||||
" \"\"\"\n",
|
||||
" url = f\"https://www.presseportal.de/blaulicht/d/polizei\"\n",
|
||||
" \n",
|
||||
" if location is not None:\n",
|
||||
" url += f\"/l/{location}\"\n",
|
||||
" \n",
|
||||
" if site > 1:\n",
|
||||
" url += f\"/{site*30}\"\n",
|
||||
" \n",
|
||||
" if start_date is not None or end_date is not None:\n",
|
||||
" url += \"?\"\n",
|
||||
" \n",
|
||||
" if start_date is not None:\n",
|
||||
" url += f\"startDate={start_date}\"\n",
|
||||
" \n",
|
||||
" if end_date is not None:\n",
|
||||
" url += \"&\"\n",
|
||||
" \n",
|
||||
" if end_date is not None:\n",
|
||||
" url += f\"endDate={end_date}\"\n",
|
||||
" \n",
|
||||
" return url"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c67c9bc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Beispiel: Hamburg "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "aff924d6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"url = create_get_request(location=\"hamburg\", site=3, start_date=\"2021-01-13\", end_date=\"2021-03-20\")\n",
|
||||
"url"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "6e2b9091",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n",
|
||||
"[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n",
|
||||
"[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n",
|
||||
"[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n",
|
||||
"[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for mitteilung in extract_response(requests_get(url))[:5]:\n",
|
||||
" print(mitteilung)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e50af557",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Effizientes Einlesen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b4a9580a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Um die Dateien sinnhaft zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1Sek / Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "da927e30",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):\n",
|
||||
" \"\"\"Suche alle Meldungen für ein Bundesland zu einem konkreten Tag\"\"\"\n",
|
||||
"\n",
|
||||
" meldungen = []\n",
|
||||
" site = 1\n",
|
||||
" \n",
|
||||
" start_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
|
||||
" end_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n",
|
||||
" request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)\n",
|
||||
" \n",
|
||||
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
|
||||
" meldungen.extend(new_meldungen)\n",
|
||||
" \n",
|
||||
" pbar = tqdm(desc=bundesland)\n",
|
||||
" while len(new_meldungen) != 0:\n",
|
||||
" time.sleep(1)\n",
|
||||
" site += 1\n",
|
||||
" \n",
|
||||
" request = create_get_request(\n",
|
||||
" site=site, location=bundesland, start_date=start_date, end_date=end_date,\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n",
|
||||
" meldungen.extend(new_meldungen)\n",
|
||||
" pbar.update(1)\n",
|
||||
" pbar.close()\n",
|
||||
" \n",
|
||||
" return meldungen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "85508758",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_meldungen_for_date(year, month, day):\n",
|
||||
" \"\"\"Extrahiere alle Meldungen für einen Tag\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" year (int): Jahr\n",
|
||||
" month (int): Monat\n",
|
||||
" day (int): Tag\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" meldungen_dict = {}\n",
|
||||
" \n",
|
||||
" for bundesland in BUNDESLAENDER:\n",
|
||||
" meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)\n",
|
||||
" meldungen_dict[bundesland] = meldungen\n",
|
||||
" \n",
|
||||
" return meldungen_dict"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f938d8a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Speichern der Daten in CSV-Dateien"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "67374d3b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "276e700d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def store_meldungen_in_csv(year, month, day):\n",
|
||||
" \"\"\"Speichere alle Meldungen für ein Datum in einer CSV. Im Namen der CSV steht das Datum.\"\"\"\n",
|
||||
"\n",
|
||||
" filename = f\"{year}-{month}-{day}_presseportal.csv\"\n",
|
||||
" path = os.path.join(DATA_FOLDER, filename)\n",
|
||||
" meldungen_per_bundesland = get_meldungen_for_date(year, month, day)\n",
|
||||
" \n",
|
||||
" with open(path, 'w', newline='', encoding='UTF8') as f:\n",
|
||||
" writer = csv.writer(f)\n",
|
||||
" writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])\n",
|
||||
" \n",
|
||||
" for bundesland, meldungen in meldungen_per_bundesland.items():\n",
|
||||
" for meldung in meldungen:\n",
|
||||
" writer.writerow(meldung.to_row())\n",
|
||||
" \n",
|
||||
" print(f\"File '{filename}' created\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "c5d0bdbd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def store_month(year, month):\n",
|
||||
" month_end_day = calendar.monthrange(year, month)[1]\n",
|
||||
" \n",
|
||||
" for i in range(0, month_end_day):\n",
|
||||
" store_meldungen_in_csv(year, month, i+1)"
|
||||
]
|
||||
},
|
||||
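{
"cell_type": "markdown",
"id": "zipmonth1",
"metadata": {},
"source": [
"A minimal sketch of the manual monthly ZIP step mentioned above. It assumes the daily file naming used by `store_meldungen_in_csv`; the function `zip_month` and the archive name are illustrative additions, not part of the original pipeline:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "zipmonth2",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"\n",
"def zip_month(year, month):\n",
"    \"\"\"Bundle all daily CSVs of one month into a single ZIP archive (sketch).\"\"\"\n",
"    archive_path = os.path.join(DATA_FOLDER, f\"{year}-{month}_presseportal.zip\")\n",
"    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:\n",
"        for day in range(1, calendar.monthrange(year, month)[1] + 1):\n",
"            filename = f\"{year}-{month}-{day}_presseportal.csv\"\n",
"            path = os.path.join(DATA_FOLDER, filename)\n",
"            if os.path.exists(path):  # skip days that were not scraped\n",
"                zf.write(path, arcname=filename)\n",
"\n",
"# e.g. store_month(2021, 4) followed by zip_month(2021, 4)"
]
},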
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d9f3e24b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Auswertung: Wie viele Einträge pro Bundesland?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f600d3c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "b7c85078",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"counter = {}\n",
|
||||
"\n",
|
||||
"for filename in os.listdir('../data/'):\n",
|
||||
" if filename.endswith(\"_presseportal.csv\"):\n",
|
||||
" path = '../data/' + filename\n",
|
||||
" \n",
|
||||
" with open(path, 'r', encoding='UTF8') as f_in:\n",
|
||||
" reader = csv.reader(f_in)\n",
|
||||
" next(reader)\n",
|
||||
" for row in reader:\n",
|
||||
" bundesland = row[3]\n",
|
||||
" if bundesland not in counter:\n",
|
||||
" counter[bundesland] = 1\n",
|
||||
" else:\n",
|
||||
" counter[bundesland] += 1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "python-scientific kernel",
|
||||
"language": "python",
|
||||
"name": "python-scientific"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
1787
ergebnisse_hackathon_repo/team-16/notebooks/simons-notebook.ipynb
Normal file
1787
ergebnisse_hackathon_repo/team-16/notebooks/simons-notebook.ipynb
Normal file
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
Aim: Building a connection between all tweets (tweet-id) and the state (Bundesland; Stadt) of the corresponding
|
||||
police station (user_id; name; handle)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from match_blaulich_tw_accounts import extend_blaulicht_data
|
||||
|
||||
tw_tweets = pd.read_csv(r'data\copbird_table_tweet.csv')
|
||||
tw_user_data = pd.read_csv(r'data\copbird_table_user.csv')
|
||||
tw_pol_geo_data = pd.read_csv(r'geolocations\polizei_accounts_geo.csv', delimiter='\t')
|
||||
|
||||
|
||||
def get_tweets_by_user_id():
|
||||
tweet_ids_user_ids = pd.DataFrame(tw_tweets, columns=['user_id', 'id'], dtype=str).rename(
|
||||
columns={"id": "tweet_id"})
|
||||
grouped_tweets = tweet_ids_user_ids.groupby('user_id')
|
||||
return grouped_tweets
|
||||
|
||||
|
||||
def add_state_to_user_df():
|
||||
tw_user_df = tw_user_data.rename(columns={"id": "user_id"})
|
||||
tw_pol_geo_df = tw_pol_geo_data.rename(columns={"Name": "name", "Bundesland": "bundesland", "Stadt": "stadt"})
|
||||
|
||||
return pd.merge(tw_user_df, tw_pol_geo_df[['name', 'stadt', 'bundesland']], on='name', how='left')
|
||||
|
||||
|
||||
def add_state_to_tweets_df():
|
||||
tw_tweets_ext = pd.merge(tw_tweets, add_state_to_user_df()[['user_id', 'stadt', 'bundesland', 'name', 'handle'
|
||||
]], on='user_id', how='left')
|
||||
return tw_tweets_ext[['id', 'tweet_text', 'created_at', 'user_id', 'name', 'handle', 'stadt', 'bundesland'
|
||||
]].rename(columns={'id': 'tweet_id', 'name': 'user_name'})
|
||||
|
||||
|
||||
def save_to_csv(df: pd, file_name: str):
|
||||
df.to_csv(path_or_buf=f'{file_name}.csv', index=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
save_to_csv(extend_blaulicht_data(), '2020-12_2021-05_presseportal')
|
||||
|
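A plausible invocation of the script above (hypothetical, not part of the original file): it would materialize the tweet/state join that the R analysis below reads as copbird_table_tweet_ext_state.csv.

# hypothetical usage; the output name is taken from the R scripts below
save_to_csv(add_state_to_tweets_df(), 'copbird_table_tweet_ext_state')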
|
@ -0,0 +1,44 @@
|
|||
import re
|
||||
|
||||
import pandas as pd
|
||||
from os import listdir
|
||||
from os.path import join, isdir
|
||||
|
||||
df_tw_user = pd.read_csv('copbird_table_user_ext.csv').rename(columns={'name': 'user_name'})
|
||||
dir_blaulicht = 'data/presseportal'
|
||||
|
||||
def concat_blaulicht_dfs():
|
||||
    # stack all daily CSVs found in the month subfolders of data/presseportal
|
||||
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
|
||||
    frames = []
|
||||
    for subdir in listdir(dir_blaulicht):
|
||||
        subdir = join(dir_blaulicht, subdir)
|
||||
        if isdir(subdir):
|
||||
            for f in listdir(subdir):
|
||||
                frames.append(pd.read_csv(join(subdir, f)))
|
||||
    return pd.concat(frames)
|
||||
|
||||
def extend_blaulicht_data():
|
||||
    df_blaulicht = concat_blaulicht_dfs()
|
||||
    mapping = map_bl_tw_citys()
|
||||
    df_blaulicht['tw_user_id'] = df_blaulicht['location'].apply(lambda x: find_location(x, mapping))
|
||||
    return df_blaulicht
|
||||
|
||||
def find_location(txt, mp):
|
||||
    mapped_blaulicht = mp.get(txt, "")
|
||||
    return mapped_blaulicht[1] if mapped_blaulicht != "" else ""
|
||||
|
||||
def map_bl_tw_citys():
|
||||
    df_blaulicht = concat_blaulicht_dfs()
|
||||
    df_blaulicht.sort_index(inplace=True)
|
||||
    tw_locations = list(df_tw_user[['stadt', 'user_id']].itertuples(index=False, name=None))
|
||||
    tw_locations = [(loc, uid) for loc, uid in tw_locations if len(str(loc)) > 1]
|
||||
    bl_locations = list(set([str(city) for city in df_blaulicht['location'].values]))
|
||||
    bl_tw_locations = {}
|
||||
    for bl_loc in bl_locations:
|
||||
        for tw_loc, tw_id in tw_locations:
|
||||
            # match the Twitter account's city as a whole word inside the press-release location
|
||||
            if re.search(r'\b' + re.escape(str(tw_loc).lower()) + r'\b', str(bl_loc).lower()):
|
||||
                bl_tw_locations[bl_loc] = [tw_loc, tw_id]
|
||||
    return bl_tw_locations
|
||||
|
||||
if __name__ == '__main__':
|
||||
    extend_blaulicht_data()
|
||||
512
ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
Normal file
512
ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
Normal file
|
|
@ -0,0 +1,512 @@
|
|||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_text <- pm$content
|
||||
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber_satzzeichen)
|
||||
head(pm)
|
||||
pm_text <- pm_demo$content
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm_demo))
|
||||
content_ber[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
|
||||
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
|
||||
head(pm_demo)
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
|
||||
{
|
||||
#require(plyr)
|
||||
require(stringr)
|
||||
scores = laply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
|
||||
{
|
||||
#require(plyr)
|
||||
require(stringr)
|
||||
scores = lapply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
|
||||
{
|
||||
#require(plyr)
|
||||
require(stringr)
|
||||
scores = lapply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
library(plyr)
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
|
||||
{
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = lapply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
|
||||
{
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = lapply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = laply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
View(score_tw_demo)
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_text <- pm$content
|
||||
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber_satzzeichen)
|
||||
head(pm)
|
||||
# csvpath <- <your path>
|
||||
# write_csv(pm, str_c(csvpath, "/pressemeldungen.csv"))
|
||||
pm_text <- pm_demo$content
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm_demo))
|
||||
content_ber[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
|
||||
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
|
||||
head(pm_demo)
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = laply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
View(score_tw_demo)
|
||||
View(score_pm_demo)
|
||||
score_pm_demo$text[3]
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
# Read in data
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
summary(pm)
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
# tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Oldenburg-Stadt/Ammerl"] <- "Oldenburg"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Mecklenburgische Seenp"] <- "Neubrandenburg"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Wilhelmshaven/Frieslan"] <- "Wilhelmshaven"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Baden-Württember"] <- "Stuttgart"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Landeskriminalamt Rheinland-Pf"] <- "Mainz"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Mitteldeutschlan"] <- "Pirna"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Delmenhorst/Oldenburg-"] <- "Delmenhorst"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Flughafen Frankf"] <- "Frankfurt"
|
||||
# blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
# users <- read_csv("data/copbird_table_user.csv")
|
||||
# str(users)
|
||||
# users$name <- as.factor(users$name)
|
||||
# users$handle <- as.factor(users$handle)
|
||||
pm_orte <- pm %>% group_by(bundesland) %>% count(location)
|
||||
head(pm_orte)
|
||||
head(pm_orte %>% arrange(desc(n)), n = 20)
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
|
||||
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
|
||||
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
|
||||
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
|
||||
head(usersX)
|
||||
head(tweetXstate[, 5:8])
|
||||
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
|
||||
head(blaulicht[, -c(2, 5)])
|
||||
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
|
||||
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
|
||||
land_tw <- land_tw %>% group_by(bundesland) %>% count()
|
||||
land_tw$bundesland <- as.factor(land_tw$bundesland)
|
||||
land_pm <- pm %>% group_by(bundesland) %>% count()
|
||||
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
|
||||
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
|
||||
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
|
||||
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
|
||||
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
|
||||
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
|
||||
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
|
||||
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
|
||||
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
facet_wrap(~Plattform) +
|
||||
coord_flip() +
|
||||
guides(fill = FALSE) +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
coord_flip() +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = laply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
|
||||
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
|
||||
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
---
|
||||
title: "Team 16"
|
||||
author: "Christian, Simon und Cuca"
|
||||
date: "23 5 2021"
|
||||
output: pdf_document
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
# Read in the data
|
||||
```{r, message = FALSE}
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
|
||||
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
|
||||
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
|
||||
|
||||
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
|
||||
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
|
||||
```
|
||||
|
||||
|
||||
# Scraping the press releases (since December 2020)
|
||||
The scraping itself happens in the Python notebooks under `notebooks/`; the daily CSVs it produces are read in above.
|
||||
|
||||
# Matching the locations of press releases and tweets
|
||||
```{r}
|
||||
head(usersX)
|
||||
head(tweetXstate[, 5:8])
|
||||
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
|
||||
head(blaulicht[, -c(2, 5)])
|
||||
```
|
||||
|
||||
# Number of press releases vs. tweets
|
||||
```{r}
|
||||
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
|
||||
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
|
||||
land_tw <- land_tw %>% group_by(bundesland) %>% count()
|
||||
land_tw$bundesland <- as.factor(land_tw$bundesland)
|
||||
|
||||
land_pm <- pm %>% group_by(bundesland) %>% count()
|
||||
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
|
||||
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
|
||||
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
|
||||
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
|
||||
|
||||
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
|
||||
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
|
||||
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
|
||||
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
|
||||
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
facet_wrap(~Plattform) +
|
||||
coord_flip() +
|
||||
guides(fill = "none") +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
coord_flip() +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
# Topic modelling
|
||||
```{r, message=FALSE}
|
||||
# library(quanteda)
|
||||
# library(tidyverse)
|
||||
# library(topicmodels)
|
||||
# library(ldatuning)
|
||||
# library(stm)
|
||||
# library(wordcloud)
|
||||
#
|
||||
# pm <- pm[!is.na(pm$content), ]
|
||||
# tok <- tokens(pm$content_ber_satzzeichen)
|
||||
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
|
||||
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
|
||||
# # mydfm.trim
|
||||
#
|
||||
# anzahl.themen <- 10
|
||||
# anzahl.woerter <- 10
|
||||
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
|
||||
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
|
||||
# lda.modell
|
||||
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
|
||||
# topmod
|
||||
#
|
||||
# write_csv(topmod, "data/topicmodel.csv")
|
||||
```
|
||||
|
||||
### Keyword selection
|
||||
`topic_1 = ['demonstr', 'kundgeb']`
|
||||
|
||||
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
|
||||
|
||||
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
|
||||
|
||||
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
|
||||
|
||||
|
||||
|
||||
# Sentiment analysis
|
||||
```{r}
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
|
||||
pos.words <- c(scan("SentiWS/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("SentiWS/positive-words.txt"))
|
||||
neg.words <- c(scan("SentiWS/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("SentiWS/negative-words.txt"))
|
||||
|
||||
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') {
|
||||
require(plyr)
|
||||
require(stringr)
|
||||
scores = laply(sentences, function(sentence, pos.words, neg.words)
|
||||
{
|
||||
# clean up sentences with R's regex-driven global substitute, gsub():
|
||||
sentence = gsub('[[:punct:]]', '', sentence)
|
||||
sentence = gsub('[[:cntrl:]]', '', sentence)
|
||||
sentence = gsub('\\d+', '', sentence)
|
||||
# and convert to lower case:
|
||||
sentence = tolower(sentence)
|
||||
# split into words. str_split is in the stringr package
|
||||
word.list = str_split(sentence, '\\s+')
|
||||
# sometimes a list() is one level of hierarchy too much
|
||||
words = unlist(word.list)
|
||||
# compare our words to the dictionaries of positive & negative terms
|
||||
pos.matches = match(words, pos.words)
|
||||
neg.matches = match(words, neg.words)
|
||||
# match() returns the position of the matched term or NA
|
||||
# we just want a TRUE/FALSE:
|
||||
pos.matches = !is.na(pos.matches)
|
||||
neg.matches = !is.na(neg.matches)
|
||||
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
|
||||
score = sum(pos.matches) - sum(neg.matches)
|
||||
return(score)
|
||||
},
|
||||
pos.words, neg.words, .progress=.progress )
|
||||
scores.df = data.frame(score=scores, text=sentences)
|
||||
return(scores.df)
|
||||
}
|
||||
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
||||
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
|
||||
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
||||
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
|
||||
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
```{r}
|
||||
sessionInfo()
|
||||
```
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
---
|
||||
title: "Team 16"
|
||||
author: "Christian, Simon und Cuca"
|
||||
date: "23 5 2021"
|
||||
output: pdf_document
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
# Daten einlesen
|
||||
```{r, message = FALSE}
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
|
||||
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
|
||||
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
|
||||
|
||||
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
|
||||
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
|
||||
```
|
||||
|
||||
|
||||
# Scrapen der Pressemeldungen (seit Dezember 2020)
|
||||
|
||||
# Zuordnung von Orten der Pressemeldungen und Tweets
|
||||
```{r}
|
||||
head(usersX)
|
||||
head(tweetXstate[, 5:8])
|
||||
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
|
||||
head(blaulicht[, -c(2, 5)])
|
||||
```
|
||||
|
||||
# Anzahl Pressemeldungen vs. Tweets
|
||||
```{r}
|
||||
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
|
||||
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
|
||||
land_tw <- land_tw %>% group_by(bundesland) %>% count()
|
||||
land_tw$bundesland <- as.factor(land_tw$bundesland)
|
||||
|
||||
land_pm <- pm %>% group_by(bundesland) %>% count()
|
||||
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
|
||||
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
|
||||
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
|
||||
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
|
||||
|
||||
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
|
||||
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
|
||||
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
|
||||
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
|
||||
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
facet_wrap(~Plattform) +
|
||||
coord_flip() +
|
||||
guides(fill = FALSE) +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
coord_flip() +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
# Topic modelling
|
||||
```{r, message=FALSE}
|
||||
# library(quanteda)
|
||||
# library(tidyverse)
|
||||
# library(topicmodels)
|
||||
# library(ldatuning)
|
||||
# library(stm)
|
||||
# library(wordcloud)
|
||||
#
|
||||
# pm <- pm[!is.na(pm$content), ]
|
||||
# tok <- tokens(pm$content_ber_satzzeichen)
|
||||
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
|
||||
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
|
||||
# # mydfm.trim
|
||||
#
|
||||
# anzahl.themen <- 10
|
||||
# anzahl.woerter <- 10
|
||||
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
|
||||
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
|
||||
# lda.modell
|
||||
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
|
||||
# topmod
|
||||
#
|
||||
# write_csv(topmod, "data/topicmodel.csv")
|
||||
```
|
||||
|
||||
### Auswahl der Keywords
|
||||
`topic_1 = ['demonstr', 'kundgeb']`
|
||||
|
||||
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
|
||||
|
||||
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
|
||||
|
||||
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
|
||||
|
||||
|
||||
|
||||

# Sentiment analysis

```{r}
# read a SentiWS dictionary file and flatten it into a plain vector of
# lower-case word forms (base form plus inflections, weights dropped)
readAndflattenSentiWS <- function(filename) {
  words <- readLines(filename, encoding = "UTF-8")
  # SentiWS lines look like "Wort|POS<TAB>weight<TAB>inflection,inflection,...";
  # replace the POS tag and weight with a comma, then split on commas
  words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
  words <- unlist(strsplit(words, ","))
  tolower(words)
}

pos.words <- c(scan("SentiWS/positive-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/positive-words.txt"))
neg.words <- c(scan("SentiWS/negative-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/negative-words.txt"))

# simple dictionary scorer: number of positive minus number of negative matches per text
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  require(plyr)
  require(stringr)
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # clean up the sentence with R's regex-driven global substitute, gsub()
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)
    # split into words; str_split is in the stringr package
    words <- unlist(str_split(sentence, '\\s+'))
    # match() returns the position of the matched term or NA;
    # reduce that to TRUE/FALSE so sum() can count the hits
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    sum(pos.matches) - sum(neg.matches)
  }, pos.words, neg.words, .progress = .progress)
  data.frame(score = scores, text = sentences)
}

score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)

ggplot(score_pm_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()

ggplot(score_tw_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()

# drug-topic subsets, built like pm_demo/tw_demo
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

ggplot(score_pm_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()

ggplot(score_tw_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()

score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)

ggplot(score_pm_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()

ggplot(score_tw_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
```
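
The six histograms are easier to compare side by side. A small summary, not part of the original notebook, assuming the score data frames produced above:

```{r, eval=FALSE}
library(dplyr)

# mean dictionary score and sample size per topic and platform
bind_rows(
  mutate(score_pm_demo,   topic = "Demonstrationen", platform = "Pressemeldung"),
  mutate(score_tw_demo,   topic = "Demonstrationen", platform = "Twitter"),
  mutate(score_pm_drogen, topic = "Drogen",          platform = "Pressemeldung"),
  mutate(score_tw_drogen, topic = "Drogen",          platform = "Twitter"),
  mutate(score_pm_rass,   topic = "Rassismus",       platform = "Pressemeldung"),
  mutate(score_tw_rass,   topic = "Rassismus",       platform = "Twitter")
) %>%
  group_by(topic, platform) %>%
  summarise(mean_score = mean(score), n = n(), .groups = "drop")
```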

```{r}
sessionInfo()
```

@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
131
ergebnisse_hackathon_repo/team-16/requirements.txt
Normal file
@ -0,0 +1,131 @@
appdirs==1.4.4
argon2-cffi==20.1.0
async-generator==1.10
attrs==21.2.0
backcall==0.2.0
beautifulsoup4==4.9.3
bleach==3.3.0
blis==0.7.4
branca==0.4.2
bs4==0.0.1
catalogue==2.0.4
certifi==2020.12.5
cffi==1.14.5
chardet==4.0.0
click==7.1.2
cssselect==1.1.0
cycler==0.10.0
cymem==2.0.5
decorator==4.4.2
defusedxml==0.7.1
dill==0.3.3
docker==4.4.4
emoji==0.6.0
entrypoints==0.3
fake-useragent==0.1.11
filelock==3.0.12
folium==0.12.1
funcy==1.16
future==0.18.2
germansentiment==1.0.5
huggingface-hub==0.0.8
idna==2.10
ipykernel==5.5.5
ipython==7.23.1
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2
joblib==1.0.1
jsonpickle==2.0.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.12
jupyter-console==6.4.0
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
kiwisolver==1.3.1
loguru==0.5.3
lxml==4.6.3
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
mistune==0.8.4
multiprocess==0.70.11.1
murmurhash==1.0.5
nbclient==0.5.3
nbconvert==6.0.7
nbformat==5.1.3
nest-asyncio==1.5.1
networkx==2.5.1
nitter-scraper==0.5.0
notebook==6.4.0
numexpr==2.7.3
numpy==1.20.3
packaging==20.9
pandas==1.2.4
pandocfilters==1.4.3
parse==1.19.0
parso==0.8.2
pathy==0.5.2
pendulum==2.1.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.2.0
preshed==3.0.5
prometheus-client==0.10.1
prompt-toolkit==3.0.18
ptyprocess==0.7.0
pycparser==2.20
pydantic==1.7.4
pyee==8.1.0
Pygments==2.9.0
pyLDAvis==3.3.1
pyparsing==2.4.7
pyppeteer==0.2.5
pyquery==1.4.3
pyrsistent==0.17.3
python-dateutil==2.8.1
pytz==2021.1
pytzdata==2020.1
pyvis==0.1.9
pyzmq==22.0.3
qtconsole==5.1.0
QtPy==1.9.0
regex==2021.4.4
requests==2.25.1
requests-html==0.10.0
sacremoses==0.0.45
scikit-learn==0.24.2
scipy==1.6.3
seaborn==0.11.1
Send2Trash==1.5.0
six==1.16.0
sklearn==0.0
smart-open
soupsieve==2.2.1
spacy==3.0.6
spacy-legacy==3.0.5
spacymoji==3.0.1
srsly==2.4.1
terminado==0.10.0
testpath==0.5.0
thinc==8.0.3
threadpoolctl==2.1.0
tokenizers==0.10.2
torch==1.8.1
tornado==6.1
tqdm==4.60.0
traitlets==5.0.5
transformers==4.6.0
typer==0.3.2
typing-extensions==3.10.0.0
urllib3==1.26.4
w3lib==1.22.0
wasabi==0.8.2
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.0.0
websockets==8.1
widgetsnbextension==3.5.1