init
This commit is contained in:
		
						commit
						8d3c8b3974
					
				
					 77 changed files with 682928 additions and 0 deletions
				
			
		
							
								
								
									
										
											BIN
										
									
								
								ergebnisse_hackathon_repo/team-16/Presse-vs.-Twitter.pdf
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								ergebnisse_hackathon_repo/team-16/Presse-vs.-Twitter.pdf
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										20
									
								
								ergebnisse_hackathon_repo/team-16/README.md
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								ergebnisse_hackathon_repo/team-16/README.md
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| # CopBird Group 16 | ||||
| 
 | ||||
| ## Tools | ||||
| 
 | ||||
| * [Python](https://www.python.org/) Version >= 3.8 | ||||
| * [Wekan](https://wekan.github.io/) als Arbeitsgrundlage mit einem Gruppen-Wekan-Board sowie Gesamt-Boards | ||||
| * [Matrix](https://matrix.org/) zur Kommunikation | ||||
| 
 | ||||
| Jupyter Notebook verwendet Kernels. Um ein Environment als Kernel zu verwenden, gibt es folgende | ||||
| Anleitung: https://queirozf.com/entries/jupyter-kernels-how-to-add-change-remove | ||||
| 
 | ||||
| ## Daten | ||||
| 
 | ||||
| Die Tweets können nicht öffentlich gemacht werden, jedoch sind die Pressemitteilungen und Sentiment-Wörter unter data/ zu finden. | ||||
| 
 | ||||
| ## Ergebnisse | ||||
| 
 | ||||
| Die wichtigsten Ergebnisse befinden sich im PDF [Presse-vs.-Twitter](Presse-vs.-Twitter.pdf). | ||||
| In den Notebooks finden sich zusätzliche Details zur Datenextraktion und Analyse. | ||||
| 
 | ||||
							
								
								
									
										0
									
								
								ergebnisse_hackathon_repo/team-16/data/.gitkeep
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								ergebnisse_hackathon_repo/team-16/data/.gitkeep
									
										
									
									
									
										Normal file
									
								
							
							
								
								
									
										
											BIN
										
									
								
								ergebnisse_hackathon_repo/team-16/data/SentiWS.zip
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								ergebnisse_hackathon_repo/team-16/data/SentiWS.zip
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							|  | @ -0,0 +1,164 @@ | |||
| Polizei Account	Name	Typ	Bundesland	Stadt	LAT	LONG | ||||
| bpol_11 	Bundespolizei Spezialkräfte	Bundespolizei	-	-	-	 | ||||
| bpol_bepo 	Bundesbereitschaftspolizei	Bundesbereitschaftspolizei	-	-	-	- | ||||
| bpol_air_fra 	Bundespolizei Flughafen Frankfurt am Main	Bundespolizei	Hessen	Frankfurt am Main	50.1109221	8.6821267 | ||||
| bpol_b 	Bundespolizei Berlin	Bundespolizei	Berlin	Berlin	52.520007	13.404954 | ||||
| bpol_b_einsatz 	Bundespolizei Berlin Einsatz	Bundespolizei	Berlin	Berlin	52.520007	13.404954 | ||||
| bpol_bw 	Bundespolizei Baden-Württemberg	Bundespolizei	Baden-Württemberg	Böblingen	48.681331	9.008830 | ||||
| bpol_by 	Bundespolizei Bayern	Bundespolizei	Bayern	München	48.135125	11.581981 | ||||
| bpol_koblenz 	Bundespolizei Koblenz	Bundespolizei	Rheinland-Pfalz	Koblenz	50.356943	7.588996 | ||||
| bpol_kueste 	Bundespolizei Küste	Bundespolizei	Schleswig-Holstein	Bad Bramstedt	53.919582	9.882173 | ||||
| bpol_nord 	Bundespolizei Nord	Bundespolizei	Niedersachsen	Hannover	52.3744779	9.7385532 | ||||
| bpol_nrw 	Bundespolizei NRW	Bundespolizei	Nordrhein-Westfalen	Sankt Augustin	50.769478	7.187579 | ||||
| bpol_pir 	Bundespolizei Mitteldeutschland	Bundespolizei	Sachsen-Anhalt	Magdeburg	52.120533	11.627624 | ||||
| bremenpolizei 	Polizei Bremen	Polizei	Bremen	Bremen	53.079296	8.801694 | ||||
| lkabawue	Landeskriminalamt BW	Landeskriminalamt	Baden-Württemberg	Stuttgart	48.775846	9.182932 | ||||
| lka_bayern	Bayerisches Landeskriminalamt	Landeskriminalamt	Bayern	München	48.135125	11.581981 | ||||
| lka_hessen 	Hessisches Landeskriminalamt	Landeskriminalamt	Hessen	Wiesbaden	50.0820384	8.2416556 | ||||
| lka_rlp 	Landeskriminalamt Rheinland-Pfalz	Landeskriminalamt	Rheinland-Pfalz	Mainz	49.992862	8.247253 | ||||
| pol_grafschaft 	Polizei Grf Bentheim	Polizei	Niedersachsen	Nordhorn	52.429580	7.068571 | ||||
| polizeiaalen 	Polizei Aalen	Polizei	Baden-Württemberg	Aalen	48.836689	10.097116 | ||||
| polizei_aur_wtm 	Polizei Aurich / WTM	Polizei	Niedersachsen	Aurich	53.470839	7.484831 | ||||
| polizei_badn 	Polizei Bad Nenndorf	Polizei	Niedersachsen	Bad Nenndorf	52.336191	9.374258 | ||||
| polizeibayern 	Polizei Bayern	Polizei	Bayern	München	48.135125	11.581981 | ||||
| polizeibb 	Polizei Brandenburg	Polizei	Brandenburg	Potsdam	52.390569	13.064473 | ||||
| polizeibb_e 	PolizeiBrandenburg_E	Polizei	Brandenburg	Potsdam	52.390569	13.064473 | ||||
| polizei_bbg 	Polizei Bückeburg	Polizei	Niedersachsen	Bückeburg	52.259276	9.052123 | ||||
| polizeiberlin 	Polizei Berlin	Polizei	Berlin	Berlin	52.520007	13.404954 | ||||
| polizeiberlin_e 	Polizei Berlin Einsatz	Polizei	Berlin	Berlin	52.520007	13.404954 | ||||
| polizeibhv 	Polizei Bremerhaven	Polizei	Bremen	Bremerhaven	53.539584	8.580942 | ||||
| polizei_bs 	Polizei Braunschweig	Polizei	Niedersachsen	Braunschweig	52.268874	10.526770 | ||||
| polizei_ce 	Polizei Celle	Polizei	Niedersachsen	Celle	52.617596	10.062852 | ||||
| polizei_clp_vec 	Polizei Cloppenburg/Vechta	Polizei	Niedersachsen	Cloppenburg	52.844198	8.053016 | ||||
| polizei_cux 	Polizei Cuxhaven	Polizei	Niedersachsen	Cuxhaven	53.859336	8.687906 | ||||
| polizei_del 	Polizei Delmenhorst/Oldenburg-Land/Wesermarsch	Polizei	Niedersachsen	Delmenhorst	53.052189	8.635593 | ||||
| polizei_dero 	Polizei Dessau-Roßlau	Polizei	Sachsen-Anhalt	Dessau-Roßlau	51.842828	12.230393 | ||||
| polizei_dh 	Polizei Diepholz	Polizei	Niedersachsen	Diepholz	52.605646	8.370788 | ||||
| polizei_el 	Polizei Emsland	Polizei	Niedersachsen	Lingen	52.540308	7.329286 | ||||
| polizei_ffm 	Polizei Frankfurt	Polizei	Hessen	Frankfurt am Main	50.110922	8.682127 | ||||
| polizeifr 	Polizei Freiburg	Polizei	Baden-Württemberg	Freiburg	47.999008	7.842104 | ||||
| polizei_ft 	Polizei Frankenthal	Polizei	Rheinland-Pfalz	Frankenthal 	49.533333	8.350000 | ||||
| polizei_ger 	Polizei Germersheim	Polizei	Rheinland-Pfalz	Germersheim	49.214024	8.366815 | ||||
| polizei_gf 	Polizei Gifhorn	Polizei	Niedersachsen	Gifhorn	52.480909	10.550783 | ||||
| polizei_goe 	Polizei Göttingen	Polizei	Niedersachsen	Göttingen	51.541280	9.915804 | ||||
| polizei_gs 	Polizei Goslar	Polizei	Niedersachsen	Goslar	51.905953	10.428996 | ||||
| polizei_h	Polizei Hannover	Polizei	Niedersachsen	Hannover	52.3744779	9.7385532 | ||||
| polizei_hal	Polizei Halle (Saale)	Polizei	Sachsen-Anhalt	Halle (Saale)	51.4825041	11.9705452 | ||||
| polizeihamburg	Polizei Hamburg	Polizei	Hamburg	Hamburg	53.550341	10.000654 | ||||
| polizei_hi	Polizei Hildesheim	Polizei	Niedersachsen	Hildesheim	52.1521636	9.9513046 | ||||
| polizei_hk	Polizei Heidekreis	Polizei	Niedersachsen	Soltau	52.9859666	9.8433909 | ||||
| polizei_hm	Polizei Hameln	Polizei	Niedersachsen	Hameln-Pyrmont	52.0895789	9.3875409 | ||||
| polizeihn	Polizei Heilbronn	Polizei	Baden-Württemberg	Heilbronn	49.142291	9.218655 | ||||
| polizei_hol	Polizei Holzminden	Polizei	Niedersachsen	Holzminden	51.828835	9.4466591 | ||||
| polizei_hst	Polizei Stralsund	Polizei	Mecklenburg-Vorpommern	Stralsund	54.3096314	13.0820846 | ||||
| polizei_ka	Polizei Karlsruhe	Polizei	Baden-Württemberg	Karlsruhe	49.0068705	8.4034195 | ||||
| polizei_kl	Polizei Kaiserslautern	Polizei	Rheinland-Pfalz	Kaiserslautern	49.4432174	7.7689951 | ||||
| polizei_ko	Polizei Koblenz	Polizei	Rheinland-Pfalz	Koblenz	50.3533278	7.5943951 | ||||
| polizeikonstanz	Polizei Konstanz	Polizei	Baden-Württemberg	Konstanz	47.659216	9.1750718 | ||||
| polizeilb	Polizei Ludwigsburg	Polizei	Baden-Württemberg	Ludwigsburg	48.8953937	9.1895147 | ||||
| polizei_ler_emd	Polizei Leer / Emden	Polizei	Niedersachsen	Leer	53.2327625	7.4577265 | ||||
| polizei_lg	Polizei Lüneburg	Polizei	Niedersachsen	Lüneburg	53.248706	10.407855 | ||||
| polizeimainz	Polizei Mainz	Polizei	Rheinland-Pfalz	Mainz	50.0012314	8.2762513 | ||||
| polizeimannheim	Polizei Mannheim	Polizei	Baden-Württemberg	Mannheim	49.4892913	8.4673098 | ||||
| polizei_md	Polizei Magdeburg	Polizei	Sachsen-Anhalt	Magdeburg	52.1315889	11.6399609 | ||||
| polizeimfr	Polizei Mittelfranken	Polizei	Bayern	Nürnberg	49.453872	11.077298 | ||||
| polizei_mh	Polizei Mittelhessen	Polizei	Hessen	Gießen	50.5862066	8.6742306 | ||||
| polizei_mse	Polizei Mecklenburgische Seenplatte	Polizei	Mecklenburg-Vorpommern	Neubrandenburg	53.5574458	13.2602781 | ||||
| polizeimuenchen	Polizei München	Polizei	Bayern	München	48.135125	11.581981 | ||||
| polizeinb	Polizei Niederbayern	Polizei	Bayern	Straubing	48.8819801	12.569716 | ||||
| polizei_nbg	Polizei Nienburg	Polizei	Niedersachsen	Nienburg (Weser)	52.6487602	9.2578105 | ||||
| polizeineustadt	Polizei Neustadt	Polizei	Rheinland-Pfalz	Neustadt an der Weinstraße	49.3539802	8.1350021 | ||||
| polizei_nh	Polizei Nordhessen	Polizei	Hessen	Kassel	51.3154546	9.4924096 | ||||
| polizeini_lka	LKA Niedersachsen	Landeskriminalamt	Niedersachsen	Hannover	52.3744779	9.7385532 | ||||
| polizei_nom	Polizei Northeim	Polizei	Niedersachsen	Northeim	51.705401	9.9972782 | ||||
| polizei_nrw_ac	Polizei NRW AC	Polizei	Nordrhein-Westfalen	Aachen	50.776351	6.083862 | ||||
| polizei_nrw_bi	Polizei NRW BI	Polizei	Nordrhein-Westfalen	Bielefeld	52.0191005	8.531007 | ||||
| polizei_nrw_bn	Polizei NRW BN	Polizei	Nordrhein-Westfalen	Bonn	50.735851	7.10066 | ||||
| polizei_nrw_bo	Polizei NRW BO	Polizei	Nordrhein-Westfalen	Bochum	51.4818111	7.2196635 | ||||
| polizei_nrw_bor	Polizei NRW BOR	Polizei	Nordrhein-Westfalen	Borken	51.8443183	6.8582247 | ||||
| polizei_nrw_coe	Polizei NRW COE	Polizei	Nordrhein-Westfalen	Coesfeld	51.9458943	7.1691108 | ||||
| polizei_nrw_d	Polizei NRW D	Polizei	Nordrhein-Westfalen	Düsseldorf	51.2254018	6.7763137 | ||||
| polizei_nrw_dn	Polizei NRW DN	Polizei	Nordrhein-Westfalen	Düren	50.8031684	6.4820806 | ||||
| polizei_nrw_do	Polizei NRW DO	Polizei	Nordrhein-Westfalen	Dortmund	51.5142273	7.4652789 | ||||
| polizei_nrw_du	Polizei NRW DU	Polizei	Nordrhein-Westfalen	Duisburg	51.434999	6.759562 | ||||
| polizei_nrw_e	Polizei NRW E	Polizei	Nordrhein-Westfalen	Essen	51.4582235	7.0158171 | ||||
| polizei_nrw_en	Polizei NRW EN	Polizei	Nordrhein-Westfalen	Ennepe-Ruhr-Kreis	51.3481444	7.3351844 | ||||
| polizei_nrw_eu	Polizei NRW EU	Polizei	Nordrhein-Westfalen	Euskirchen	50.6612623	6.7871219 | ||||
| polizei_nrw_ge	Polizei NRW GE	Polizei	Nordrhein-Westfalen	Gelsenkirchen	51.5110321	7.0960124 | ||||
| polizei_nrw_gm	Polizei NRW GM	Polizei	Nordrhein-Westfalen	Gummersbach	51.0277658	7.5630545 | ||||
| polizei_nrw_gt	Polizei NRW GT	Polizei	Nordrhein-Westfalen	Gütersloh	51.9063997	8.3782078 | ||||
| polizei_nrw_ha	Polizei NRW HA	Polizei	Nordrhein-Westfalen	Hagen	51.3582945	7.473296 | ||||
| polizei_nrw_ham	Polizei NRW HAM	Polizei	Nordrhein-Westfalen	Hamm	51.6804093	7.815197 | ||||
| polizei_nrw_hf	Polizei NRW HF	Polizei	Nordrhein-Westfalen	Herford	52.1152245	8.6711118 | ||||
| polizei_nrw_hs	Polizei NRW HS	Polizei	Nordrhein-Westfalen	Heinsberg	51.0654268	6.0984461 | ||||
| polizei_nrw_hsk	Polizei NRW HSK	Polizei	Nordrhein-Westfalen	Hochsauerlandkreis	51.3208247	8.2684925 | ||||
| polizei_nrw_hx	Polizei NRW HX	Polizei	Nordrhein-Westfalen	Höxter	51.7747369	9.3816877 | ||||
| polizei_nrw_k	Polizei NRW K	Polizei	Nordrhein-Westfalen	Köln	50.938361	6.959974 | ||||
| polizei_nrw_kle	Polizei NRW KLE	Polizei	Nordrhein-Westfalen	Kleve	51.7854839	6.1313674 | ||||
| polizei_nrw_kr	Polizei NRW KR	Polizei	Nordrhein-Westfalen	Krefeld	51.3331205	6.5623343 | ||||
| polizei_nrw_lip	Polizei NRW LIP	Polizei	Nordrhein-Westfalen	Detmold	51.936284	8.8791526 | ||||
| polizei_nrw_lka	Polizei NRW LKA	Landeskriminalamt	Nordrhein-Westfalen	Düsseldorf	51.2254018	6.7763137 | ||||
| polizei_nrw_me	polizei_nrw_me	Polizei	Nordrhein-Westfalen	Mettmann	51.2527778	6.9777778 | ||||
| polizei_nrw_mg	Polizei NRW MG	Polizei	Nordrhein-Westfalen	Mönchengladbach	51.1946983	6.4353641 | ||||
| polizei_nrw_mi	Polizei NRW MI	Polizei	Nordrhein-Westfalen	Minden	52.2881045	8.9168852 | ||||
| polizei_nrw_mk	Polizei NRW MK	Polizei	Nordrhein-Westfalen	Märkischer Kreis	51.2734857	7.7274266 | ||||
| polizei_nrw_ms	Polizei NRW MS	Polizei	Nordrhein-Westfalen	Münster	51.9625101	7.6251879 | ||||
| polizei_nrw_ob	Polizei NRW OB	Polizei	Nordrhein-Westfalen	Oberhausen	51.4696137	6.8514435 | ||||
| polizei_nrw_oe	Polizei NRW OE	Polizei	Nordrhein-Westfalen	Olpe	51.0297603	7.8424193 | ||||
| polizei_nrw_pb	Polizei NRW PB	Polizei	Nordrhein-Westfalen	Paderborn	51.7189596	8.7648698 | ||||
| polizei_nrw_rbk	Polizei NRW RBK	Polizei	Nordrhein-Westfalen	Rheinisch-Bergischer-Kreis	51.0139774	7.1715584 | ||||
| polizei_nrw_re	Polizei NRW RE	Polizei	Nordrhein-Westfalen	Recklinghausen	51.6143815	7.1978546 | ||||
| polizei_nrw_rek	Polizei NRW REK	Polizei	Nordrhein-Westfalen	Rhein-Erft-Kreis	50.90334	6.763334 | ||||
| polizei_nrw_rkn	Polizei NRW RKN	Polizei	Nordrhein-Westfalen	Rhein-Kreis Neuss	51.1758799	6.6600606 | ||||
| polizei_nrw_si	Polizei NRW SI	Polizei	Nordrhein-Westfalen	Siegen-Wittgenstein	50.97444	8.23972 | ||||
| polizei_nrw_so	Polizei NRW SO	Polizei	Nordrhein-Westfalen	Soest	51.5725501	8.1061259 | ||||
| polizei_nrw_st	Polizei NRW ST	Polizei	Nordrhein-Westfalen	Steinfurt	52.1294289	7.3903454 | ||||
| polizei_nrw_su	Polizei NRW SU	Polizei	Nordrhein-Westfalen	Rhein-Sieg-Kreis	50.7527986	7.3813038 | ||||
| polizei_nrw_un	Polizei NRW UN	Polizei	Nordrhein-Westfalen	Unna	51.5348835	7.689014 | ||||
| polizei_nrw_vie	Polizei NRW VIE	Polizei	Nordrhein-Westfalen	Viersen	51.2562118	6.3905476 | ||||
| polizei_nrw_w	Polizei NRW W	Polizei	Nordrhein-Westfalen	Wuppertal	51.264018	7.1780374 | ||||
| polizei_nrw_waf	Polizei NRW WAF	Polizei	Nordrhein-Westfalen	Warendorf	51.9532449	7.9912335 | ||||
| polizei_nrw_wes	Polizei NRW WES	Polizei	Nordrhein-Westfalen	Wesel	51.6576909	6.617087 | ||||
| polizeiobn	Polizei Oberbayern N	Polizei	Bayern	Ingolstadt	48.7630165	11.4250395 | ||||
| polizeiobs	PolizeiOberbayernSüd	Polizei	Bayern	Rosenheim	47.8539273	12.127262 | ||||
| polizeiofr	Polizei Oberfranken	Polizei	Bayern	Oberfranken	50.0553084	11.5455233 | ||||
| polizeiog	Polizei Offenburg	Polizei	Baden-Württemberg	Offenburg	48.4716556	7.944378 | ||||
| polizei_oh	Polizei Osthessen	Polizei	Hessen	Fulda	50.5521486	9.676511 | ||||
| polizei_oha	Polizei Osterode	Polizei	Niedersachsen	Osterode am Harz	51.72784	10.2508204 | ||||
| polizei_ol	Polizei Oldenburg-Stadt/Ammerland	Polizei	Niedersachsen	Oldenburg	53.1389753	8.2146017 | ||||
| polizeiopf	Polizei Oberpfalz	Polizei	Bayern	Regensburg	49.0195333	12.0974869 | ||||
| polizei_os	Polizei Osnabrück	Polizei	Niedersachsen	Osnabrück	52.266837	8.049741 | ||||
| polizei_pf	Polizei Pforzheim	Polizei	Baden-Württemberg	Pforzheim	48.8908846	8.7029532 | ||||
| polizei_pp_nb	Polizeipräsidium NB	Polizeipräsidium	Mecklenburg-Vorpommern	Neubrandenburg	53.5574458	13.2602781 | ||||
| polizei_pp_ros	Polizeipräsidium Rostock	Polizeipräsidium	Mecklenburg-Vorpommern	Rostock	54.0924445	12.1286127 | ||||
| polizei_ps	Polizei Pirmasens	Polizei	Rheinland-Pfalz	Pirmasens	49.1996961	7.6087847 | ||||
| polizei_rostock	Polizei Rostock	Polizei	Mecklenburg-Vorpommern	Rostock	54.0924445	12.1286127 | ||||
| polizei_row	Polizei Rotenburg	Polizei	Niedersachsen	Rotenburg (Wümme)	53.2520924	9.3151133 | ||||
| polizeirt	Polizei Reutlingen	Polizei	Baden-Württemberg	Reutlingen	48.4919508	9.2114144 | ||||
| polizeirv	Polizei Ravensburg	Polizei	Baden-Württemberg	Ravensburg	47.7811014	9.612468 | ||||
| polizeisaarland	Polizei Saarland	Polizei	Saarland	Saarbrücken	49.234362	6.996379 | ||||
| polizeisachsen	Polizei Sachsen	Polizei	Sachsen	Dresden	51.0493286	13.7381437 | ||||
| polizei_sdl	Polizei Stendal	Polizei	Sachsen-Anhalt	Stendal	52.6050782	11.8594279 | ||||
| polizei_sn	Polizei Schwerin	Polizei	Mecklenburg-Vorpommern	Schwerin	53.6288297	11.4148038 | ||||
| polizei_soh	Polizei Südosthessen	Polizei	Hessen	Offenbach am Main	50.1055002	8.7610698 | ||||
| polizei_std	Polizei Stade	Polizei	Niedersachsen	Stade	53.599794	9.475438 | ||||
| polizei_sth	Polizei Stadthagen	Polizei	Niedersachsen	Stadthagen	52.3289688	9.2053496 | ||||
| polizei_suedhe	Polizei Südhessen	Polizei	Hessen	Darmstadt	49.872775	8.651177 | ||||
| polizeiswn	Polizei Schwaben Nord	Polizei	Bayern	Augsburg	48.3668041	10.8986971 | ||||
| polizeisws	Polizei Schwaben S/W	Polizei	Bayern	Kempten (Allgäu)	47.7267063	10.3168835 | ||||
| polizei_sz	Polizei SZ / PE / WF	Polizei	Niedersachsen	Salzgitter	52.1503721	10.3593147 | ||||
| polizei_thuer	Polizei Thüringen	Polizei	Thüringen	Erfurt	50.9777974	11.0287364 | ||||
| polizeitrier	Polizei Trier	Polizei	Rheinland-Pfalz	Trier	49.7596208	6.6441878 | ||||
| polizeiufr	Polizei Unterfranken	Polizei	Bayern	Würzburg	49.79245	9.932966 | ||||
| polizeiul	Polizei Ulm	Polizei	Baden-Württemberg	Ulm	48.3974003	9.9934336 | ||||
| polizei_ver_ohz	Polizei Verden/Osterholz	Polizei	Niedersachsen	Verden	52.922341	9.228153 | ||||
| polizeivg	Polizei Vorpommern-Greifswald	Polizei	Mecklenburg-Vorpommern	Anklam	53.8560526	13.688091 | ||||
| polizei_wh	Polizei Westhessen	Polizei	Hessen	Wiesbaden	50.0820384	8.2416556 | ||||
| polizei_whv_fri	Polizei Wilhelmshaven/Friesland	Polizei	Niedersachsen	Wilhelmshaven	53.5278793	8.106301 | ||||
| polizeiwittlich	Polizei Wittlich	Polizei	Rheinland-Pfalz	Wittlich	49.9850353	6.88844 | ||||
| polizei_wl	Polizei LK Harburg	Polizei	Niedersachsen	Harburg	53.3172237	9.9084936 | ||||
| polizei_wob	Polizei Wolfsburg	Polizei	Niedersachsen	Wolfsburg	52.4205588	10.7861682 | ||||
| polizei_zpd_ni	Polizei ZPD NI	Polizei	Niedersachsen	Hannover	52.3744779	9.7385532 | ||||
| pp_rheinpfalz	Polizei Rheinpfalz	Polizei	Rheinland-Pfalz	Ludwigshafen am Rhein	49.4704113	8.4381568 | ||||
| pp_stuttgart	Polizei Stuttgart	Polizei	Baden-Württemberg	Stuttgart	48.7784485	9.1800132 | ||||
| sh_polizei	Polizei SH	Polizei	Schleswig-Holstein	Kiel	54.3227085	10.135555 | ||||
| 
 | 
							
								
								
									
										39
									
								
								ergebnisse_hackathon_repo/team-16/keyword_search.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								ergebnisse_hackathon_repo/team-16/keyword_search.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,39 @@ | |||
| import pandas as pd | ||||
| import spacy | ||||
| from string import punctuation | ||||
| from tqdm import tqdm | ||||
| tqdm.pandas() | ||||
| 
 | ||||
| 
 | ||||
# Tweet table, loaded once at import time.
# Forward slashes are used because a raw backslash path only resolves on
# Windows; '/' is accepted on Windows, Linux and macOS alike.
tw_tweets = pd.read_csv('data/copbird_table_tweet_ext_state.csv')

# German spaCy pipeline shared by clean_tweet().
nlp = spacy.load('de_core_news_lg')
| 
 | ||||
| 
 | ||||
def clean_tweet(txt):
    """Lemmatise a text and drop stop words and punctuation.

    The text is processed with the module-level spaCy pipeline ``nlp``.

    Args:
        txt: Raw text to clean.

    Returns:
        The lemmas of all retained tokens, joined by single spaces.
    """
    doc = nlp(txt)

    # ``token.text in punctuation`` is only a substring test against the ASCII
    # punctuation string, so multi-character or non-ASCII punctuation tokens
    # ("...", "!!", typographic quotes) would be kept. ``token.is_punct``
    # covers those; the original check stays for ASCII symbols that spaCy may
    # not classify as punctuation.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.text in punctuation)
    ]
    return ' '.join(lemmas)
| 
 | ||||
| 
 | ||||
def get_topics_by_str_lst(topic, df, col_name):
    """Return the rows of ``df`` whose ``col_name`` text contains any keyword.

    Keywords are matched as literal, case-insensitive substrings, so a
    lower-case stem such as ``'rassis'`` also finds ``'Rassismus'``.

    Args:
        topic: Iterable of keyword strings (a single string is treated as one
            keyword).
        df: DataFrame to filter.
        col_name: Name of the text column to search.

    Returns:
        The matching rows of ``df``; an empty selection if no usable keyword
        is given.
    """
    import re  # local import: only needed for escaping the keywords

    # A bare string would otherwise be joined character by character.
    if isinstance(topic, str):
        topic = [topic]

    # Escape regex metacharacters so keywords are matched literally.
    keywords = [re.escape(keyword) for keyword in topic if keyword]
    if not keywords:
        # An empty pattern would match every row; no keywords means no hits.
        return df.iloc[0:0]

    # na=False keeps missing values from producing an invalid boolean mask.
    mask = df[col_name].str.contains('|'.join(keywords), case=False, na=False, regex=True)
    return df[mask]
| 
 | ||||
| 
 | ||||
if __name__ == '__main__':
    # Keyword stems per topic; they are passed to get_topics_by_str_lst(),
    # which searches for them inside the text column.
    topic_1 = ['demonstr', 'kundgeb']
    topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']
    topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']
    topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']

    # Forward slashes instead of a raw backslash path, which only resolves on
    # Windows. na_filter=False turns off pandas' missing-value detection for
    # this file.
    df_pm = pd.read_csv('data/2020-12_2021-05_presseportal.csv', na_filter=False)
    df_pm_col = 'content'

    # Only topic_3 is evaluated here; the other lists are currently unused.
    print(get_topics_by_str_lst(topic=topic_3, df=df_pm, col_name=df_pm_col).to_markdown())
|  | @ -0,0 +1,394 @@ | |||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "id": "7158ac22", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import matplotlib.pyplot as plt\n", | ||||
|     "import pyLDAvis.sklearn\n", | ||||
|     "from sklearn.decomposition import LatentDirichletAllocation\n", | ||||
|     "from sklearn.feature_extraction.text import CountVectorizer\n", | ||||
|     "import pandas as pd\n", | ||||
|     "import spacy\n", | ||||
|     "from multiprocess import Pool" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "69f33a46", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Funktionen zur Vorverarbeitung" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "id": "1c66c06c", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def filterSentencesByMinWordCount(text, minWordCount):\n", | ||||
|     "    sentenceList = []\n", | ||||
|     "    doc = nlp(text)\n", | ||||
|     "    for sent in doc.sents:\n", | ||||
|     "        wordList = []\n", | ||||
|     "        sent.text.rstrip()\n", | ||||
|     "        for word in sent:\n", | ||||
|     "            wordList.append(word)\n", | ||||
|     "        if len(wordList) >= minWordCount:\n", | ||||
|     "            sentenceList.append(sent.text.rstrip())\n", | ||||
|     "    return sentenceList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "id": "3b9d084d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def extractSentences(document):\n", | ||||
|     "    logging.debug('Extracting Sentences')\n", | ||||
|     "    text = extractBodyContent(document)\n", | ||||
|     "    sentenceList = filterSentencesByMinWordCount(text, 4)\n", | ||||
|     "    return sentenceList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "id": "7d85891e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def tokenizeSentence(doc):\n", | ||||
|     "    logging.debug('Tokenizing')\n", | ||||
|     "    tokenList = []\n", | ||||
|     "    for token in doc:\n", | ||||
|     "        childrenList = []\n", | ||||
|     "        for child in token.children:\n", | ||||
|     "            childToken = ScToken(child.text,\n", | ||||
|     "                                 child.lemma_,\n", | ||||
|     "                                 child.pos_, str(spacy.explain(child.pos_)),\n", | ||||
|     "                                 child.tag_, str(spacy.explain(child.tag_)),\n", | ||||
|     "                                 child.dep_, str(spacy.explain(child.dep_)),\n", | ||||
|     "                                 child.shape_, child.is_alpha, child.is_stop)\n", | ||||
|     "            childrenList.append(childToken)\n", | ||||
|     "\n", | ||||
|     "        scToken = ScToken(token.text,\n", | ||||
|     "                          token.lemma_,\n", | ||||
|     "                          token.pos_, str(spacy.explain(token.pos_)),\n", | ||||
|     "                          token.tag_, str(spacy.explain(token.tag_)),\n", | ||||
|     "                          token.dep_, str(spacy.explain(token.dep_)),\n", | ||||
|     "                          token.shape_, token.is_alpha, token.is_stop,\n", | ||||
|     "                          childrenList)\n", | ||||
|     "        tokenList.append(scToken)\n", | ||||
|     "    return tokenList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 9, | ||||
|    "id": "7564c883", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def chunkSentence(doc):\n", | ||||
|     "    logging.debug('Chunking')\n", | ||||
|     "    chunkList = []\n", | ||||
|     "    for chunk in doc.noun_chunks:\n", | ||||
|     "        scChunk = ScChunk(chunk.text, chunk.root.text,\n", | ||||
|     "                          chunk.root.dep_, chunk.root.head.text)\n", | ||||
|     "        chunkList.append(scChunk)\n", | ||||
|     "    return chunkList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 10, | ||||
|    "id": "5db74302", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def findEntitiesInSentence(doc):\n", | ||||
|     "    logging.debug('Extracting Named Entities')\n", | ||||
|     "    entityList = []\n", | ||||
|     "    for ent in doc.ents:\n", | ||||
|     "        entity = ScEntity(ent.text, ent.start_char, ent.end_char,\n", | ||||
|     "                          ent.label_, str(spacy.explain(ent.label_)))\n", | ||||
|     "        entityList.append(entity)\n", | ||||
|     "    return entityList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 11, | ||||
|    "id": "b6753a90", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def fillSentences(document):\n", | ||||
|     "    logging.info(\n", | ||||
|     "        'Building Sentences (Tokenizing, Chunking, Named Entity Recognition)')\n", | ||||
|     "    sentenceList = []\n", | ||||
|     "    sentences = extractSentences(document)\n", | ||||
|     "    for i, sentence in enumerate(sentences):\n", | ||||
|     "        doc = nlp(sentence)\n", | ||||
|     "        id = i\n", | ||||
|     "        tokens = tokenizeSentence(doc)\n", | ||||
|     "        chunks = chunkSentence(doc)\n", | ||||
|     "        entities = findEntitiesInSentence(doc)\n", | ||||
|     "        scSentence = ScSentence(id, sentence, tokens, chunks, entities)\n", | ||||
|     "        sentenceList.append(scSentence)\n", | ||||
|     "\n", | ||||
|     "    return sentenceList" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 12, | ||||
|    "id": "9af7a5c0", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def fillSentencesAsOneString(sentences):\n", | ||||
|     "    sentencesAsOneString = str()\n", | ||||
|     "    for sentence in sentences:\n", | ||||
|     "        sentencesAsOneString += sentence.text\n", | ||||
|     "    return sentencesAsOneString" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 13, | ||||
|    "id": "8f952c82", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def removeStopwords(text):\n", | ||||
|     "    doc = nlp(text)\n", | ||||
|     "    tokens = tokenizeSentence(doc)\n", | ||||
|     "    chunksNoStopwords = [\n", | ||||
|     "        t.text for t in tokens if (not t.isStopword)]\n", | ||||
|     "    return \" \".join(chunksNoStopwords)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 14, | ||||
|    "id": "28910141", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "ename": "NameError", | ||||
|      "evalue": "name 'ScToken' is not defined", | ||||
|      "output_type": "error", | ||||
|      "traceback": [ | ||||
|       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | ||||
|       "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)", | ||||
|       "\u001b[1;32m<ipython-input-14-7209b5cec518>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mnumberOfStopwords\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mScToken\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m     \u001b[0mcount\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misStopword\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m             \u001b[0mcount\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||||
|       "\u001b[1;31mNameError\u001b[0m: name 'ScToken' is not defined" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def numberOfStopwords(tokens: [ScToken]):\n", | ||||
|     "    count = 0\n", | ||||
|     "    for t in tokens:\n", | ||||
|     "        if t.isStopword:\n", | ||||
|     "            count += 1\n", | ||||
|     "    return count" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "ce7cc9c1", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Analyse" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 15, | ||||
|    "id": "1adb09b7", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     " tweet_csv = '../data/copbird_table_tweet.csv'" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 18, | ||||
|    "id": "c0936ecb", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "NUM_TOPICS=10\n", | ||||
|     "NUM_FEATURES=1000\n", | ||||
|     "NUM_TOP_WORDS=25" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 16, | ||||
|    "id": "dbf0281f", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "c:\\users\\chris\\pycharmprojects\\copbird-group-16\\copbird-venv\\lib\\site-packages\\ipykernel\\ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", | ||||
|       "  and should_run_async(code)\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "def get_tweets(path, limit=None):\n", | ||||
|     "    df_csv = pd.read_csv(path, nrows=limit, parse_dates=['created_at'],\n", | ||||
|     "                         encoding='utf-8-sig')\n", | ||||
|     "\n", | ||||
|     "    df_csv.drop(columns=['created_at', 'like_count', 'retweet_count', 'reply_count', 'quote_count'], inplace=True)\n", | ||||
|     "\n", | ||||
|     "    nlp = spacy.load(\"de_core_news_lg\")\n", | ||||
|     "    nlp.Defaults.stop_words |= {\"&\", \"amp\"}\n", | ||||
|     "    nlp.add_pipe('emoji', first=True)\n", | ||||
|     "    return list(\n", | ||||
|     "        nlp.pipe(df_csv['tweet_text'], disable=[\"tok2vec\", \"tagger\", \"parser\", \"attribute_ruler\"], n_process=-1))" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": null, | ||||
|    "id": "2c39f658", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "copbird-env", | ||||
|    "language": "python", | ||||
|    "name": "copbird-env" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.8.5" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 5 | ||||
| } | ||||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -0,0 +1,490 @@ | |||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "cce66876", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Interface Presseportal" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "f12d7022", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Das Presseportal bietet eine Plattform, bei der mittels GET-Requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...) in bestimmten Zeiträumen und in gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API." | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "b07aef9f", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "258338d0", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Da eine große Menge an Pressemitteilungen angefragt wird und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 1, | ||||
|    "id": "b07fac3c", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import requests\n", | ||||
|     "import calendar\n", | ||||
|     "import time\n", | ||||
|     "import os\n", | ||||
|     "import csv\n", | ||||
|     "\n", | ||||
|     "from tqdm.notebook import tqdm\n", | ||||
|     "from datetime import datetime\n", | ||||
|     "from bs4 import BeautifulSoup" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "0dfce15a", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "id": "6c0b30a8", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "class Pressemitteilung:\n", | ||||
|     "    def __init__(self, article_id, timestamp, location, text, bundesland):\n", | ||||
|     "        self.article_id = article_id\n", | ||||
|     "        self.timestamp = timestamp\n", | ||||
|     "        self.location = location\n", | ||||
|     "        self.text = text\n", | ||||
|     "        self.bundesland=bundesland\n", | ||||
|     "    \n", | ||||
|     "    def __str__(self):\n", | ||||
|     "        return f\"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}\"\n", | ||||
|     "    \n", | ||||
|     "    def to_row(self):\n", | ||||
|     "        return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "63cceebe", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "**Konstanten und Pfade**" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "id": "8bcc877f", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "REQUEST_HEADERS = {\n", | ||||
|     "    \"User-Agent\": (\n", | ||||
|     "        \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 \"\n", | ||||
|     "        \"(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36\"\n", | ||||
|     "    )\n", | ||||
|     "}" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "id": "c637ac38", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "DATA_FOLDER = os.path.join(\"..\", \"data\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 5, | ||||
|    "id": "f094dee0", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "BUNDESLAENDER = [\n", | ||||
|     "    \"baden-wuerttemberg\",\n", | ||||
|     "    \"bayern\",\n", | ||||
|     "    \"berlin-brandenburg\",\n", | ||||
|     "    \"bremen\",\n", | ||||
|     "    \"hamburg\",\n", | ||||
|     "    \"hessen\",\n", | ||||
|     "    \"mecklenburg-vorpommern\",\n", | ||||
|     "    \"niedersachsen\",\n", | ||||
|     "    \"nordrhein-westfalen\",\n", | ||||
|     "    \"rheinland-pfalz\",\n", | ||||
|     "    \"saarland\",\n", | ||||
|     "    \"sachsen\",\n", | ||||
|     "    \"sachsen-anhalt\",\n", | ||||
|     "    \"schleswig-holstein\",\n", | ||||
|     "    \"thueringen\",\n", | ||||
|     "]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "id": "84632391", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def requests_get(request):\n", | ||||
|     "    return requests.get(request, headers=REQUEST_HEADERS)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "id": "1af0bdbd", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def extract_response(response, bundesland=None):\n", | ||||
|     "    \"\"\"Extrahiere aus der Response einer Request alle Pressemitteilungen\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        response (:obj:`Response`)\n", | ||||
|     "        bundesland (:obj:`str`): Kann mit angegeben, falls es in der Suche relevant war. Default = None\n", | ||||
|     "    \n", | ||||
|     "    Returns:\n", | ||||
|     "        list of :obj:`Pressemitteilung`\n", | ||||
|     "    \"\"\"\n", | ||||
|     "    \n", | ||||
|     "    mitteilungen = []\n", | ||||
|     "    \n", | ||||
|     "    soup = BeautifulSoup(response.content, 'html.parser')\n", | ||||
|     "    for article in soup.find_all('article'):\n", | ||||
|     "        data_url = article['data-url']\n", | ||||
|     "        article_id = '-'.join(article['data-url'].split('/')[-2:])\n", | ||||
|     "        meta = article.find('div')\n", | ||||
|     "        \n", | ||||
|     "        timestamp_str = meta.find(class_=\"date\")\n", | ||||
|     "        \n", | ||||
|     "        if timestamp_str is not None:\n", | ||||
|     "            timestamp_str = timestamp_str.text\n", | ||||
|     "            timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y – %H:%M')\n", | ||||
|     "        else:\n", | ||||
|     "            timestamp = None\n", | ||||
|     "        \n", | ||||
|     "        location_str = meta.find(class_=\"news-topic\")\n", | ||||
|     "        location_str = location_str.text if location_str is not None else None\n", | ||||
|     "        \n", | ||||
|     "        p_texts = article.findAll('p')\n", | ||||
|     "        if len(p_texts) > 1:\n", | ||||
|     "            text = p_texts[1].text\n", | ||||
|     "        else:\n", | ||||
|     "            text = ''\n", | ||||
|     "        \n", | ||||
|     "        mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))\n", | ||||
|     "    \n", | ||||
|     "    return mitteilungen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "id": "c62c06c9", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def create_get_request(*, site=1, location=None, start_date=None, end_date=None):\n", | ||||
|     "    \"\"\"Simulation einer API: Erzeuge aus Parametern eine URL\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        site (int, default=1): Aktuelle Seite, auf der man sich befinden soll. Ist in der URL in 30er Schritten angegeben\n", | ||||
|     "        location (:obj:`str`, default=None): Bundesland bzw. Stadt\n", | ||||
|     "        start_date (:obj:`str`, default=None)\n", | ||||
|     "        end_date (:obj:`str`, default=None)\n", | ||||
|     "    Returns:\n", | ||||
|     "        str: URL\n", | ||||
|     "    \"\"\"\n", | ||||
|     "    url = f\"https://www.presseportal.de/blaulicht/d/polizei\"\n", | ||||
|     "    \n", | ||||
|     "    if location is not None:\n", | ||||
|     "        url += f\"/l/{location}\"\n", | ||||
|     "    \n", | ||||
|     "    if site > 1:\n", | ||||
|     "        url += f\"/{site*30}\"\n", | ||||
|     "    \n", | ||||
|     "    if start_date is not None or end_date is not None:\n", | ||||
|     "        url += \"?\"\n", | ||||
|     "    \n", | ||||
|     "        if start_date is not None:\n", | ||||
|     "            url += f\"startDate={start_date}\"\n", | ||||
|     "        \n", | ||||
|     "            if end_date is not None:\n", | ||||
|     "                url += \"&\"\n", | ||||
|     "        \n", | ||||
|     "        if end_date is not None:\n", | ||||
|     "            url += f\"endDate={end_date}\"\n", | ||||
|     "    \n", | ||||
|     "    return url" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "1c67c9bc", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Beispiel: Hamburg " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 9, | ||||
|    "id": "aff924d6", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "data": { | ||||
|       "text/plain": [ | ||||
|        "'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 9, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "url = create_get_request(location=\"hamburg\", site=3, start_date=\"2021-01-13\", end_date=\"2021-03-20\")\n", | ||||
|     "url" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 10, | ||||
|    "id": "6e2b9091", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n", | ||||
|       "[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n", | ||||
|       "[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n", | ||||
|       "[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n", | ||||
|       "[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "for mitteilung in extract_response(requests_get(url))[:5]:\n", | ||||
|     "    print(mitteilung)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "e50af557", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Effizientes Einlesen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "b4a9580a", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Um die Daten sinnvoll zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1 Sekunde pro Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt." | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 11, | ||||
|    "id": "da927e30", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):\n", | ||||
|     "    \"\"\"Suche alle Meldungen für ein Bundesland zu einem konkreten Tag\"\"\"\n", | ||||
|     "\n", | ||||
|     "    meldungen = []\n", | ||||
|     "    site = 1\n", | ||||
|     "    \n", | ||||
|     "    start_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n", | ||||
|     "    end_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n", | ||||
|     "    request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)\n", | ||||
|     "    \n", | ||||
|     "    new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n", | ||||
|     "    meldungen.extend(new_meldungen)\n", | ||||
|     "    \n", | ||||
|     "    pbar = tqdm(desc=bundesland)\n", | ||||
|     "    while len(new_meldungen) != 0:\n", | ||||
|     "        time.sleep(1)\n", | ||||
|     "        site += 1\n", | ||||
|     "        \n", | ||||
|     "        request = create_get_request(\n", | ||||
|     "            site=site, location=bundesland, start_date=start_date, end_date=end_date,\n", | ||||
|     "        )\n", | ||||
|     "        \n", | ||||
|     "        new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n", | ||||
|     "        meldungen.extend(new_meldungen)\n", | ||||
|     "        pbar.update(1)\n", | ||||
|     "    pbar.close()\n", | ||||
|     "        \n", | ||||
|     "    return meldungen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 12, | ||||
|    "id": "85508758", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def get_meldungen_for_date(year, month, day):\n", | ||||
|     "    \"\"\"Extrahiere alle Meldungen für einen Tag\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        year (int): Jahr\n", | ||||
|     "        month (int): Monat\n", | ||||
|     "        day (int): Tag\n", | ||||
|     "    \"\"\"\n", | ||||
|     "\n", | ||||
|     "    meldungen_dict = {}\n", | ||||
|     "    \n", | ||||
|     "    for bundesland in BUNDESLAENDER:\n", | ||||
|     "        meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)\n", | ||||
|     "        meldungen_dict[bundesland] = meldungen\n", | ||||
|     "    \n", | ||||
|     "    return meldungen_dict" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "f938d8a9", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Speichern der Daten in CSV-Dateien" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "67374d3b", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 13, | ||||
|    "id": "276e700d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def store_meldungen_in_csv(year, month, day):\n", | ||||
|     "    \"\"\"Speichere alle Meldungen für ein Datum in einer CSV. Im Namen der CSV steht das Datum.\"\"\"\n", | ||||
|     "\n", | ||||
|     "    filename = f\"{year}-{month}-{day}_presseportal.csv\"\n", | ||||
|     "    path = os.path.join(DATA_FOLDER, filename)\n", | ||||
|     "    meldungen_per_bundesland = get_meldungen_for_date(year, month, day)\n", | ||||
|     "    \n", | ||||
|     "    with open(path, 'w', newline='', encoding='UTF8') as f:\n", | ||||
|     "        writer = csv.writer(f)\n", | ||||
|     "        writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])\n", | ||||
|     "        \n", | ||||
|     "        for bundesland, meldungen in meldungen_per_bundesland.items():\n", | ||||
|     "            for meldung in meldungen:\n", | ||||
|     "                writer.writerow(meldung.to_row())\n", | ||||
|     "    \n", | ||||
|     "    print(f\"File '{filename}' created\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 14, | ||||
|    "id": "c5d0bdbd", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def store_month(year, month):\n", | ||||
|     "    month_end_day = calendar.monthrange(year, month)[1]\n", | ||||
|     "    \n", | ||||
|     "    for i in range(0, month_end_day):\n", | ||||
|     "        store_meldungen_in_csv(year, month, i+1)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "d9f3e24b", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Auswertung: Wie viele Einträge pro Bundesland?" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "9f600d3c", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 51, | ||||
|    "id": "b7c85078", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "counter = {}\n", | ||||
|     "\n", | ||||
|     "for filename in os.listdir('../data/'):\n", | ||||
|     "    if filename.endswith(\"_presseportal.csv\"):\n", | ||||
|     "        path = '../data/' + filename\n", | ||||
|     "        \n", | ||||
|     "        with open(path, 'r', encoding='UTF8') as f_in:\n", | ||||
|     "            reader = csv.reader(f_in)\n", | ||||
|     "            next(reader)\n", | ||||
|     "            for row in reader:\n", | ||||
|     "                bundesland = row[3]\n", | ||||
|     "                if bundesland not in counter:\n", | ||||
|     "                    counter[bundesland] = 1\n", | ||||
|     "                else:\n", | ||||
|     "                    counter[bundesland] += 1\n" | ||||
|    ] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "copbird-env", | ||||
|    "language": "python", | ||||
|    "name": "copbird-env" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.8.5" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 5 | ||||
| } | ||||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										1058
									
								
								ergebnisse_hackathon_repo/team-16/notebooks/keywords-tweets.ipynb
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1058
									
								
								ergebnisse_hackathon_repo/team-16/notebooks/keywords-tweets.ipynb
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -0,0 +1,490 @@ | |||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "cce66876", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Interface Presseportal" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "f12d7022", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Das Presseportal bietet eine Plattform, bei der mittels GET-Requests die Pressemitteilungen verschiedener Institutionen (Polizei, Feuerwehr, ...) in bestimmten Zeiträumen und in gegebenen Gebieten extrahiert werden können. Dafür gibt es auch eine API." | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "b07aef9f", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Beispiel URL: `https://www.presseportal.de/blaulicht/d/polizei/l/hessen/30?startDate=2021-05-04&endDate=2021-05-04`" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "258338d0", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Da eine große Menge an Pressemitteilungen angefragt wird und Requests ziemlich lange benötigen, muss die Anfrage optimiert werden:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 1, | ||||
|    "id": "b07fac3c", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import requests\n", | ||||
|     "import calendar\n", | ||||
|     "import time\n", | ||||
|     "import os\n", | ||||
|     "import csv\n", | ||||
|     "\n", | ||||
|     "from tqdm.notebook import tqdm\n", | ||||
|     "from datetime import datetime\n", | ||||
|     "from bs4 import BeautifulSoup" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "0dfce15a", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Um Pressemitteilungen sinnvoll zu speichern, werden sie als Klasse dargestellt:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "id": "6c0b30a8", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "class Pressemitteilung:\n", | ||||
|     "    def __init__(self, article_id, timestamp, location, text, bundesland):\n", | ||||
|     "        self.article_id = article_id\n", | ||||
|     "        self.timestamp = timestamp\n", | ||||
|     "        self.location = location\n", | ||||
|     "        self.text = text\n", | ||||
|     "        self.bundesland=bundesland\n", | ||||
|     "    \n", | ||||
|     "    def __str__(self):\n", | ||||
|     "        return f\"[{self.article_id}] {self.timestamp} {self.location} | {' '.join(self.text.split()[:6])}\"\n", | ||||
|     "    \n", | ||||
|     "    def to_row(self):\n", | ||||
|     "        return [self.article_id, self.timestamp, self.location, self.bundesland, self.text]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "63cceebe", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "**Konstanten und Pfade**" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "id": "8bcc877f", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "REQUEST_HEADERS = {\n", | ||||
|     "    \"User-Agent\": (\n", | ||||
|     "        \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 \"\n", | ||||
|     "        \"(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36\"\n", | ||||
|     "    )\n", | ||||
|     "}" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "id": "c637ac38", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "DATA_FOLDER = os.path.join(\"..\", \"data\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 5, | ||||
|    "id": "f094dee0", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "BUNDESLAENDER = [\n", | ||||
|     "    \"baden-wuerttemberg\",\n", | ||||
|     "    \"bayern\",\n", | ||||
|     "    \"berlin-brandenburg\",\n", | ||||
|     "    \"bremen\",\n", | ||||
|     "    \"hamburg\",\n", | ||||
|     "    \"hessen\",\n", | ||||
|     "    \"mecklenburg-vorpommern\",\n", | ||||
|     "    \"niedersachsen\",\n", | ||||
|     "    \"nordrhein-westfalen\",\n", | ||||
|     "    \"rheinland-pfalz\",\n", | ||||
|     "    \"saarland\",\n", | ||||
|     "    \"sachsen\",\n", | ||||
|     "    \"sachsen-anhalt\",\n", | ||||
|     "    \"schleswig-holstein\",\n", | ||||
|     "    \"thueringen\",\n", | ||||
|     "]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "id": "84632391", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def requests_get(request):\n", | ||||
|     "    return requests.get(request, headers=REQUEST_HEADERS)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "id": "1af0bdbd", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def extract_response(response, bundesland=None):\n", | ||||
|     "    \"\"\"Extrahiere aus der Response einer Request alle Pressemitteilungen\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        response (:obj:`Response`)\n", | ||||
|     "        bundesland (:obj:`str`): Kann mit angegeben werden, falls es in der Suche relevant war. Default = None\n", | ||||
|     "    \n", | ||||
|     "    Returns:\n", | ||||
|     "        list of :obj:`Pressemitteilung`\n", | ||||
|     "    \"\"\"\n", | ||||
|     "    \n", | ||||
|     "    mitteilungen = []\n", | ||||
|     "    \n", | ||||
|     "    soup = BeautifulSoup(response.content, 'html.parser')\n", | ||||
|     "    for article in soup.find_all('article'):\n", | ||||
|     "        data_url = article['data-url']\n", | ||||
|     "        article_id = '-'.join(article['data-url'].split('/')[-2:])\n", | ||||
|     "        meta = article.find('div')\n", | ||||
|     "        \n", | ||||
|     "        timestamp_str = meta.find(class_=\"date\")\n", | ||||
|     "        \n", | ||||
|     "        if timestamp_str is not None:\n", | ||||
|     "            timestamp_str = timestamp_str.text\n", | ||||
|     "            timestamp = datetime.strptime(timestamp_str, '%d.%m.%Y – %H:%M')\n", | ||||
|     "        else:\n", | ||||
|     "            timestamp = None\n", | ||||
|     "        \n", | ||||
|     "        location_str = meta.find(class_=\"news-topic\")\n", | ||||
|     "        location_str = location_str.text if location_str is not None else None\n", | ||||
|     "        \n", | ||||
|     "        p_texts = article.findAll('p')\n", | ||||
|     "        if len(p_texts) > 1:\n", | ||||
|     "            text = p_texts[1].text\n", | ||||
|     "        else:\n", | ||||
|     "            text = ''\n", | ||||
|     "        \n", | ||||
|     "        mitteilungen.append(Pressemitteilung(article_id, timestamp, location_str, text, bundesland))\n", | ||||
|     "    \n", | ||||
|     "    return mitteilungen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "id": "c62c06c9", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def create_get_request(*, site=1, location=None, start_date=None, end_date=None):\n", | ||||
|     "    \"\"\"Simulation einer API: Erzeuge aus Parametern eine URL\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        site (int, default=1): Aktuelle Seite, auf der man sich befinden soll. Ist in der URL in 30er Schritten angegeben\n", | ||||
|     "        location (:obj:`str`, default=None): Bundesland bzw. Stadt\n", | ||||
|     "        start_date (:obj:`str`, default=None)\n", | ||||
|     "        end_date (:obj:`str`, default=None)\n", | ||||
|     "    Returns:\n", | ||||
|     "        str: URL\n", | ||||
|     "    \"\"\"\n", | ||||
|     "    url = f\"https://www.presseportal.de/blaulicht/d/polizei\"\n", | ||||
|     "    \n", | ||||
|     "    if location is not None:\n", | ||||
|     "        url += f\"/l/{location}\"\n", | ||||
|     "    \n", | ||||
|     "    if site > 1:\n", | ||||
|     "        url += f\"/{site*30}\"\n", | ||||
|     "    \n", | ||||
|     "    if start_date is not None or end_date is not None:\n", | ||||
|     "        url += \"?\"\n", | ||||
|     "    \n", | ||||
|     "        if start_date is not None:\n", | ||||
|     "            url += f\"startDate={start_date}\"\n", | ||||
|     "        \n", | ||||
|     "            if end_date is not None:\n", | ||||
|     "                url += \"&\"\n", | ||||
|     "        \n", | ||||
|     "        if end_date is not None:\n", | ||||
|     "            url += f\"endDate={end_date}\"\n", | ||||
|     "    \n", | ||||
|     "    return url" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "1c67c9bc", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Beispiel: Hamburg " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 9, | ||||
|    "id": "aff924d6", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "data": { | ||||
|       "text/plain": [ | ||||
|        "'https://www.presseportal.de/blaulicht/d/polizei/l/hamburg/90?startDate=2021-01-13&endDate=2021-03-20'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 9, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "url = create_get_request(location=\"hamburg\", site=3, start_date=\"2021-01-13\", end_date=\"2021-03-20\")\n", | ||||
|     "url" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 10, | ||||
|    "id": "6e2b9091", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "[6337-4840243] 2021-02-16 17:41:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 08:15\n", | ||||
|       "[6337-4839937] 2021-02-16 13:14:00 Hamburg | Hamburg (ots) - Tatzeiten: a. 15.02.2021,\n", | ||||
|       "[6337-4839709] 2021-02-16 11:33:00 Hamburg | Hamburg (ots) - Tatzeit: 15.02.2021, 18:25\n", | ||||
|       "[6337-4839544] 2021-02-16 10:31:00 Hamburg | Hamburg (ots) - Zeit: 15.02.2021, 01:34\n", | ||||
|       "[6337-4838489] 2021-02-15 11:48:00 Hamburg | Hamburg (ots) - Tatzeit: 14.02.2021; 19:17\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "for mitteilung in extract_response(requests_get(url))[:5]:\n", | ||||
|     "    print(mitteilung)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "e50af557", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Effizientes Einlesen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "b4a9580a", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Um die Dateien sinnhaft zu extrahieren, ohne auf einen Schlag zu viele Anfragen zu tätigen, läuft das Programm synchron mit Pausen (1Sek / Anfrage). Die Hauptfunktion sucht für einen gegebenen Tag alle Pressemeldungen der Polizei und sortiert diese nach Bundesland bzw. Stadt." | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 11, | ||||
|    "id": "da927e30", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def _get_meldungen_for_date_and_bundesland(year, month, day, bundesland):\n", | ||||
|     "    \"\"\"Suche alle Meldungen für ein Bundesland zu einem konkreten Tag\"\"\"\n", | ||||
|     "\n", | ||||
|     "    meldungen = []\n", | ||||
|     "    site = 1\n", | ||||
|     "    \n", | ||||
|     "    start_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n", | ||||
|     "    end_date = datetime(year, month, day).strftime(\"%Y-%m-%d\")\n", | ||||
|     "    request = create_get_request(site=site, location=bundesland, start_date=start_date, end_date=end_date)\n", | ||||
|     "    \n", | ||||
|     "    new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n", | ||||
|     "    meldungen.extend(new_meldungen)\n", | ||||
|     "    \n", | ||||
|     "    pbar = tqdm(desc=bundesland)\n", | ||||
|     "    while len(new_meldungen) != 0:\n", | ||||
|     "        time.sleep(1)\n", | ||||
|     "        site += 1\n", | ||||
|     "        \n", | ||||
|     "        request = create_get_request(\n", | ||||
|     "            site=site, location=bundesland, start_date=start_date, end_date=end_date,\n", | ||||
|     "        )\n", | ||||
|     "        \n", | ||||
|     "        new_meldungen = extract_response(requests_get(request), bundesland=bundesland)\n", | ||||
|     "        meldungen.extend(new_meldungen)\n", | ||||
|     "        pbar.update(1)\n", | ||||
|     "    pbar.close()\n", | ||||
|     "        \n", | ||||
|     "    return meldungen" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 12, | ||||
|    "id": "85508758", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def get_meldungen_for_date(year, month, day):\n", | ||||
|     "    \"\"\"Extrahiere alle Meldungen für einen Tag\n", | ||||
|     "    \n", | ||||
|     "    Args:\n", | ||||
|     "        year (int): Jahr\n", | ||||
|     "        month (int): Monat\n", | ||||
|     "        day (int): Tag\n", | ||||
|     "    \"\"\"\n", | ||||
|     "\n", | ||||
|     "    meldungen_dict = {}\n", | ||||
|     "    \n", | ||||
|     "    for bundesland in BUNDESLAENDER:\n", | ||||
|     "        meldungen = _get_meldungen_for_date_and_bundesland(year, month, day, bundesland)\n", | ||||
|     "        meldungen_dict[bundesland] = meldungen\n", | ||||
|     "    \n", | ||||
|     "    return meldungen_dict" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "f938d8a9", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Speichern der Daten in CSV-Dateien" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "67374d3b", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Zur sinnvollen Speicherung werden alle Daten eines Tages in genau einer CSV-Datei gespeichert. Diese können danach (manuell) als ZIP des Monats zusammengefasst werden. " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 13, | ||||
|    "id": "276e700d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def store_meldungen_in_csv(year, month, day):\n", | ||||
|     "    \"\"\"Speichere alle Meldungen für ein Datum in einer CSV. Im Namen der CSV steht das Datum.\"\"\"\n", | ||||
|     "\n", | ||||
|     "    filename = f\"{year}-{month}-{day}_presseportal.csv\"\n", | ||||
|     "    path = os.path.join(DATA_FOLDER, filename)\n", | ||||
|     "    meldungen_per_bundesland = get_meldungen_for_date(year, month, day)\n", | ||||
|     "    \n", | ||||
|     "    with open(path, 'w', newline='', encoding='UTF8') as f:\n", | ||||
|     "        writer = csv.writer(f)\n", | ||||
|     "        writer.writerow(['article_id', 'timestamp', 'location', 'bundesland', 'content'])\n", | ||||
|     "        \n", | ||||
|     "        for bundesland, meldungen in meldungen_per_bundesland.items():\n", | ||||
|     "            for meldung in meldungen:\n", | ||||
|     "                writer.writerow(meldung.to_row())\n", | ||||
|     "    \n", | ||||
|     "    print(f\"File '{filename}' created\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 14, | ||||
|    "id": "c5d0bdbd", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def store_month(year, month):\n", | ||||
|     "    month_end_day = calendar.monthrange(year, month)[1]\n", | ||||
|     "    \n", | ||||
|     "    for i in range(0, month_end_day):\n", | ||||
|     "        store_meldungen_in_csv(year, month, i+1)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "d9f3e24b", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Auswertung: Wie viele Einträge pro Bundesland?" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "9f600d3c", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "Für fortführende Visualisierung und um zu testen, ob der Algorithmus richtig funktioniert, werden hier alle Pressemitteilungen aller Bundesländer ausgezählt:" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 51, | ||||
|    "id": "b7c85078", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "counter = {}\n", | ||||
|     "\n", | ||||
|     "for filename in os.listdir('../data/'):\n", | ||||
|     "    if filename.endswith(\"_presseportal.csv\"):\n", | ||||
|     "        path = '../data/' + filename\n", | ||||
|     "        \n", | ||||
|     "        with open(path, 'r', encoding='UTF8') as f_in:\n", | ||||
|     "            reader = csv.reader(f_in)\n", | ||||
|     "            next(reader)\n", | ||||
|     "            for row in reader:\n", | ||||
|     "                bundesland = row[3]\n", | ||||
|     "                if bundesland not in counter:\n", | ||||
|     "                    counter[bundesland] = 1\n", | ||||
|     "                else:\n", | ||||
|     "                    counter[bundesland] += 1\n" | ||||
|    ] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "python-scientific kernel", | ||||
|    "language": "python", | ||||
|    "name": "python-scientific" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.10.9" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 5 | ||||
| } | ||||
							
								
								
									
										1787
									
								
								ergebnisse_hackathon_repo/team-16/notebooks/simons-notebook.ipynb
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1787
									
								
								ergebnisse_hackathon_repo/team-16/notebooks/simons-notebook.ipynb
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							|  | @ -0,0 +1,40 @@ | |||
| """ | ||||
| Aim: Building a connection between all tweets (tweet-id) and the state (Bundesland; Stadt) of the corresponding | ||||
| police station (user_id; name; handle) | ||||
| """ | ||||
| 
 | ||||
| import pandas as pd | ||||
| from match_blaulich_tw_accounts import extend_blaulicht_data | ||||
| 
 | ||||
# Input tables, loaded once when the module is imported.
# Forward slashes keep the relative paths valid on POSIX systems as well as on
# Windows (a backslash is an ordinary file-name character outside Windows).
tw_tweets = pd.read_csv('data/copbird_table_tweet.csv')
tw_user_data = pd.read_csv('data/copbird_table_user.csv')
# The geo table is tab-separated.
tw_pol_geo_data = pd.read_csv('geolocations/polizei_accounts_geo.csv', delimiter='\t')
| 
 | ||||
| 
 | ||||
def get_tweets_by_user_id():
    """Group the tweet ids of the module-level tweet table by author.

    Returns:
        A pandas ``groupby`` object keyed by ``user_id`` over a string-typed
        frame with the columns ``user_id`` and ``tweet_id``.
    """
    id_pairs = pd.DataFrame(tw_tweets, columns=['user_id', 'id'], dtype=str)
    id_pairs = id_pairs.rename(columns={"id": "tweet_id"})
    return id_pairs.groupby('user_id')
| 
 | ||||
| 
 | ||||
def add_state_to_user_df():
    """Attach city and state from the geo table to every user row.

    The user table's ``id`` column is renamed to ``user_id``; the geo table's
    ``Name``/``Stadt``/``Bundesland`` columns are lower-cased and joined onto
    the users by ``name`` with a left join, so users without a geo entry are
    kept.

    Returns:
        pandas.DataFrame: The user table extended by ``stadt`` and
        ``bundesland``.
    """
    geo_column_names = {"Name": "name", "Bundesland": "bundesland", "Stadt": "stadt"}
    geo = tw_pol_geo_data.rename(columns=geo_column_names)
    users = tw_user_data.rename(columns={"id": "user_id"})
    return pd.merge(users, geo[['name', 'stadt', 'bundesland']], on='name', how='left')
| 
 | ||||
| 
 | ||||
def add_state_to_tweets_df():
    """Attach author name, handle, city and state to every tweet.

    Tweets are left-joined with the extended user table on ``user_id``.

    Returns:
        pandas.DataFrame: Columns ``tweet_id``, ``tweet_text``, ``created_at``,
        ``user_id``, ``user_name``, ``handle``, ``stadt`` and ``bundesland``.
    """
    user_info = add_state_to_user_df()[['user_id', 'stadt', 'bundesland', 'name', 'handle']]
    merged = pd.merge(tw_tweets, user_info, on='user_id', how='left')
    selected = merged[['id', 'tweet_text', 'created_at', 'user_id', 'name', 'handle', 'stadt', 'bundesland']]
    return selected.rename(columns={'id': 'tweet_id', 'name': 'user_name'})
| 
 | ||||
| 
 | ||||
def save_to_csv(df: pd.DataFrame, file_name: str):
    """Write a DataFrame to ``<file_name>.csv`` without the index column.

    Args:
        df: Frame to write.
        file_name: Target path without extension; ``.csv`` is appended.
    """
    df.to_csv(path_or_buf=f'{file_name}.csv', index=False)
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     save_to_csv(extend_blaulicht_data(), '2020-12_2021-05_presseportal') | ||||
|  | @ -0,0 +1,44 @@ | |||
| import pandas as pd | ||||
| from os import listdir | ||||
| from os.path import join, isdir | ||||
| 
 | ||||
| df_tw_user = pd.read_csv('copbird_table_user_ext.csv').rename(columns={'name': 'user_name'}) | ||||
| dir_blaulicht = 'data/presseportal' | ||||
| 
 | ||||
def concat_blaulicht_dfs(base_dir=None):
    """Load every press-release CSV below the data directory into one frame.

    Each sub-directory of ``base_dir`` is scanned and every file in it is read
    with ``pd.read_csv``. Entries of ``base_dir`` that are not directories are
    ignored.

    Args:
        base_dir (str, optional): Directory to scan. Defaults to the
            module-level ``dir_blaulicht``.

    Returns:
        pandas.DataFrame: Rows of all files in the order they were read, each
        file keeping its own row index. Empty if no file was found.
    """
    if base_dir is None:
        base_dir = dir_blaulicht

    frames = []
    for entry in listdir(base_dir):
        sub_dir = join(base_dir, entry)
        if not isdir(sub_dir):
            continue
        for file_name in listdir(sub_dir):
            frames.append(pd.read_csv(join(sub_dir, file_name)))

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every call; a single concat does the same work once.
    # pd.concat raises on an empty list, so keep the previous empty-frame result.
    return pd.concat(frames) if frames else pd.DataFrame()
| 
 | ||||
def extend_blaulicht_data():
    """Return all press releases with a ``tw_user_id`` column added.

    The new column holds, per row, the Twitter user id that
    ``map_bl_tw_citys`` associates with the row's ``location``, or ``""`` when
    the location has no associated account.
    """
    releases = concat_blaulicht_dfs()
    lookup = map_bl_tw_citys()

    def _account_for(location):
        return find_location(location, lookup)

    releases['tw_user_id'] = releases['location'].apply(_account_for)
    return releases
| 
 | ||||
def find_location(txt, mp):
    """Look up the Twitter user id stored for a press-release location.

    Args:
        txt: Location value used as the lookup key.
        mp (dict): Mapping from location to a two-element entry whose second
            element is returned.

    Returns:
        The second element of the mapped entry, or ``""`` when ``txt`` is not
        a key of ``mp``.
    """
    entry = mp.get(txt, "")
    if entry == "":
        return ""
    return entry[1]
| 
 | ||||
def map_bl_tw_citys(df_blaulicht=None, df_users=None):
    """Map press-release locations to the police Twitter account of a city.

    A location matches an account when the account's city name occurs in the
    location string as a whole word, compared case-insensitively. The input
    frames are not modified.

    Args:
        df_blaulicht (pandas.DataFrame, optional): Press releases with a
            ``location`` column. Defaults to ``concat_blaulicht_dfs()``.
        df_users (pandas.DataFrame, optional): Accounts with ``stadt`` and
            ``user_id`` columns. Defaults to the module-level ``df_tw_user``.

    Returns:
        dict: ``{location: [city, user_id]}`` for every location that matched
        at least one city. If several cities match, the account listed last in
        ``df_users`` wins.
    """
    import re

    if df_blaulicht is None:
        df_blaulicht = concat_blaulicht_dfs()
    if df_users is None:
        df_users = df_tw_user

    # One whole-word pattern per account, compiled once instead of being
    # rebuilt for every (location, account) pair.
    # Accounts without a city hold NaN; str(NaN) is 'nan', which would pass the
    # length filter and match any location containing the word "nan".
    city_patterns = [
        (re.compile(r'\b' + re.escape(str(city).lower()) + r'\b'), city, user_id)
        for city, user_id in df_users[['stadt', 'user_id']].itertuples(index=False, name=None)
        if pd.notna(city) and len(str(city)) > 1
    ]

    bl_tw_locations = {}
    for bl_loc in {str(loc) for loc in df_blaulicht['location'].values}:
        haystack = bl_loc.lower()
        for pattern, city, user_id in city_patterns:
            if pattern.search(haystack):
                # No break: a later match overwrites an earlier one, which
                # preserves the original last-match-wins behaviour.
                bl_tw_locations[bl_loc] = [city, user_id]
    return bl_tw_locations
| 
 | ||||
| if __name__ == '__main__': | ||||
|     extend_blaulicht_data() | ||||
|  | @ -0,0 +1,40 @@ | |||
| """ | ||||
| Aim: Building a connection between all tweets (tweet-id) and the state (Bundesland; Stadt) of the corresponding | ||||
| police station (user_id; name; handle) | ||||
| """ | ||||
| 
 | ||||
| import pandas as pd | ||||
| from match_blaulich_tw_accounts import extend_blaulicht_data | ||||
| 
 | ||||
# Input tables, loaded once when the module is imported.
# Forward slashes keep the relative paths valid on POSIX systems as well as on
# Windows (a backslash is an ordinary file-name character outside Windows).
tw_tweets = pd.read_csv('data/copbird_table_tweet.csv')
tw_user_data = pd.read_csv('data/copbird_table_user.csv')
# The geo table is tab-separated.
tw_pol_geo_data = pd.read_csv('geolocations/polizei_accounts_geo.csv', delimiter='\t')
| 
 | ||||
| 
 | ||||
def get_tweets_by_user_id():
    """Group the tweet ids of the module-level tweet table by author.

    Returns:
        A pandas ``groupby`` object keyed by ``user_id`` over a string-typed
        frame with the columns ``user_id`` and ``tweet_id``.
    """
    id_pairs = pd.DataFrame(tw_tweets, columns=['user_id', 'id'], dtype=str)
    id_pairs = id_pairs.rename(columns={"id": "tweet_id"})
    return id_pairs.groupby('user_id')
| 
 | ||||
| 
 | ||||
def add_state_to_user_df():
    """Attach city and state from the geo table to every user row.

    The user table's ``id`` column is renamed to ``user_id``; the geo table's
    ``Name``/``Stadt``/``Bundesland`` columns are lower-cased and joined onto
    the users by ``name`` with a left join, so users without a geo entry are
    kept.

    Returns:
        pandas.DataFrame: The user table extended by ``stadt`` and
        ``bundesland``.
    """
    geo_column_names = {"Name": "name", "Bundesland": "bundesland", "Stadt": "stadt"}
    geo = tw_pol_geo_data.rename(columns=geo_column_names)
    users = tw_user_data.rename(columns={"id": "user_id"})
    return pd.merge(users, geo[['name', 'stadt', 'bundesland']], on='name', how='left')
| 
 | ||||
| 
 | ||||
def add_state_to_tweets_df():
    """Attach author name, handle, city and state to every tweet.

    Tweets are left-joined with the extended user table on ``user_id``.

    Returns:
        pandas.DataFrame: Columns ``tweet_id``, ``tweet_text``, ``created_at``,
        ``user_id``, ``user_name``, ``handle``, ``stadt`` and ``bundesland``.
    """
    user_info = add_state_to_user_df()[['user_id', 'stadt', 'bundesland', 'name', 'handle']]
    merged = pd.merge(tw_tweets, user_info, on='user_id', how='left')
    selected = merged[['id', 'tweet_text', 'created_at', 'user_id', 'name', 'handle', 'stadt', 'bundesland']]
    return selected.rename(columns={'id': 'tweet_id', 'name': 'user_name'})
| 
 | ||||
| 
 | ||||
def save_to_csv(df: pd.DataFrame, file_name: str):
    """Write a DataFrame to ``<file_name>.csv`` without the index column.

    Args:
        df: Frame to write.
        file_name: Target path without extension; ``.csv`` is appended.
    """
    df.to_csv(path_or_buf=f'{file_name}.csv', index=False)
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     save_to_csv(extend_blaulicht_data(), '2020-12_2021-05_presseportal') | ||||
|  | @ -0,0 +1,44 @@ | |||
| import pandas as pd | ||||
| from os import listdir | ||||
| from os.path import join, isdir | ||||
| 
 | ||||
| df_tw_user = pd.read_csv('copbird_table_user_ext.csv').rename(columns={'name': 'user_name'}) | ||||
| dir_blaulicht = 'data/presseportal' | ||||
| 
 | ||||
def concat_blaulicht_dfs(base_dir=None):
    """Load every press-release CSV below the data directory into one frame.

    Each sub-directory of ``base_dir`` is scanned and every file in it is read
    with ``pd.read_csv``. Entries of ``base_dir`` that are not directories are
    ignored.

    Args:
        base_dir (str, optional): Directory to scan. Defaults to the
            module-level ``dir_blaulicht``.

    Returns:
        pandas.DataFrame: Rows of all files in the order they were read, each
        file keeping its own row index. Empty if no file was found.
    """
    if base_dir is None:
        base_dir = dir_blaulicht

    frames = []
    for entry in listdir(base_dir):
        sub_dir = join(base_dir, entry)
        if not isdir(sub_dir):
            continue
        for file_name in listdir(sub_dir):
            frames.append(pd.read_csv(join(sub_dir, file_name)))

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every call; a single concat does the same work once.
    # pd.concat raises on an empty list, so keep the previous empty-frame result.
    return pd.concat(frames) if frames else pd.DataFrame()
| 
 | ||||
def extend_blaulicht_data():
    """Return all press releases with a ``tw_user_id`` column added.

    The new column holds, per row, the Twitter user id that
    ``map_bl_tw_citys`` associates with the row's ``location``, or ``""`` when
    the location has no associated account.
    """
    releases = concat_blaulicht_dfs()
    lookup = map_bl_tw_citys()

    def _account_for(location):
        return find_location(location, lookup)

    releases['tw_user_id'] = releases['location'].apply(_account_for)
    return releases
| 
 | ||||
def find_location(txt, mp):
    """Look up the Twitter user id stored for a press-release location.

    Args:
        txt: Location value used as the lookup key.
        mp (dict): Mapping from location to a two-element entry whose second
            element is returned.

    Returns:
        The second element of the mapped entry, or ``""`` when ``txt`` is not
        a key of ``mp``.
    """
    entry = mp.get(txt, "")
    if entry == "":
        return ""
    return entry[1]
| 
 | ||||
def map_bl_tw_citys(df_blaulicht=None, df_users=None):
    """Map press-release locations to the police Twitter account of a city.

    A location matches an account when the account's city name occurs in the
    location string as a whole word, compared case-insensitively. The input
    frames are not modified.

    Args:
        df_blaulicht (pandas.DataFrame, optional): Press releases with a
            ``location`` column. Defaults to ``concat_blaulicht_dfs()``.
        df_users (pandas.DataFrame, optional): Accounts with ``stadt`` and
            ``user_id`` columns. Defaults to the module-level ``df_tw_user``.

    Returns:
        dict: ``{location: [city, user_id]}`` for every location that matched
        at least one city. If several cities match, the account listed last in
        ``df_users`` wins.
    """
    import re

    if df_blaulicht is None:
        df_blaulicht = concat_blaulicht_dfs()
    if df_users is None:
        df_users = df_tw_user

    # One whole-word pattern per account, compiled once instead of being
    # rebuilt for every (location, account) pair.
    # Accounts without a city hold NaN; str(NaN) is 'nan', which would pass the
    # length filter and match any location containing the word "nan".
    city_patterns = [
        (re.compile(r'\b' + re.escape(str(city).lower()) + r'\b'), city, user_id)
        for city, user_id in df_users[['stadt', 'user_id']].itertuples(index=False, name=None)
        if pd.notna(city) and len(str(city)) > 1
    ]

    bl_tw_locations = {}
    for bl_loc in {str(loc) for loc in df_blaulicht['location'].values}:
        haystack = bl_loc.lower()
        for pattern, city, user_id in city_patterns:
            if pattern.search(haystack):
                # No break: a later match overwrites an earlier one, which
                # preserves the original last-match-wins behaviour.
                bl_tw_locations[bl_loc] = [city, user_id]
    return bl_tw_locations
| 
 | ||||
| if __name__ == '__main__': | ||||
|     extend_blaulicht_data() | ||||
							
								
								
									
										512
									
								
								ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										512
									
								
								ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,512 @@ | |||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv") | ||||
| tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv") | ||||
| pm_text <- pm$content | ||||
| pm_text <- pm_text[-which(is.na(pm_text))]  # remove missing values | ||||
| length(grep("(ots)", pm_text)) == length(pm_text)  # every report contains "ots" | ||||
| length(grep("(ots)", pm_text)) == length(pm_text)  # every report contains "ots" | ||||
| pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - > | ||||
| pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - > | ||||
| pm_text <- gsub("( \\.\\.\\.)$", "", pm_text)  # remove < ...> | ||||
| pm_text <- gsub("( \\.\\.\\.)$", "", pm_text)  # remove < ...> | ||||
| content_ber <- rep(NA, nrow(pm)) | ||||
| content_ber <- rep(NA, nrow(pm)) | ||||
| content_ber[which(!is.na(pm$content))] <- pm_text | ||||
| content_ber[which(!is.na(pm$content))] <- pm_text | ||||
| pm <- cbind(pm, content_ber) | ||||
| pm_text <- gsub("[^[:alnum:] ]", "", pm_text) | ||||
| pm_text <- gsub("[^[:alnum:] ]", "", pm_text) | ||||
| content_ber_satzzeichen <- rep(NA, nrow(pm)) | ||||
| content_ber_satzzeichen <- rep(NA, nrow(pm)) | ||||
| content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text | ||||
| pm <- cbind(pm, content_ber_satzzeichen) | ||||
| head(pm) | ||||
| pm_text <- pm_demo$content | ||||
| pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - > | ||||
| pm_text <- gsub("( \\.\\.\\.)$", "", pm_text)  # remove < ...> | ||||
| content_ber <- rep(NA, nrow(pm_demo)) | ||||
| content_ber[which(!is.na(pm_demo$content))] <- pm_text | ||||
| pm_demo <- cbind(pm_demo, content_ber) | ||||
| pm_text <- gsub("[^[:alnum:] ]", "", pm_text) | ||||
| content_ber_satzzeichen <- rep(NA, nrow(pm_demo)) | ||||
| content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text | ||||
| pm_demo <- cbind(pm_demo, content_ber_satzzeichen) | ||||
| head(pm_demo) | ||||
| readAndflattenSentiWS <- function(filename) { | ||||
| words = readLines(filename, encoding="UTF-8") | ||||
| words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words) | ||||
| words <- unlist(strsplit(words, ",")) | ||||
| words <- tolower(words) | ||||
| return(words) | ||||
| } | ||||
| pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/positive-words.txt")) | ||||
| neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/negative-words.txt")) | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | ||||
| { | ||||
| #require(plyr) | ||||
| require(stringr) | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # we just want a TRUE/FALSE: | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | ||||
| { | ||||
| #require(plyr) | ||||
| require(stringr) | ||||
| scores = lapply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # we just want a TRUE/FALSE: | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | ||||
| { | ||||
| #require(plyr) | ||||
| require(stringr) | ||||
| scores = lapply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # we just want a TRUE/FALSE: | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| library(plyr) | ||||
| # Score each text as (number of positive words) - (number of negative words). | ||||
| #   sentences : character vector of texts to score | ||||
| #   pos.words : character vector of positive dictionary words (lower case) | ||||
| #   neg.words : character vector of negative dictionary words (lower case) | ||||
| #   .progress : name of the plyr progress bar, passed on to laply() | ||||
| # Returns a data.frame with columns `score` and `text`, one row per sentence. | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | ||||
| { | ||||
| require(plyr) | ||||
| require(stringr) | ||||
| # laply() (plyr), not base lapply(): only laply() understands .progress and | ||||
| # simplifies the per-sentence results to a vector usable as one column. | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # we just want a TRUE/FALSE: | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| # Score each text as (number of positive words) - (number of negative words). | ||||
| #   sentences : character vector of texts to score | ||||
| #   pos.words : character vector of positive dictionary words (lower case) | ||||
| #   neg.words : character vector of negative dictionary words (lower case) | ||||
| #   .progress : name of the plyr progress bar, passed on to laply() | ||||
| # Returns a data.frame with columns `score` and `text`, one row per sentence. | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | ||||
| { | ||||
| require(plyr) | ||||
| require(stringr) | ||||
| # laply() (plyr), not base lapply(): only laply() understands .progress and | ||||
| # simplifies the per-sentence results to a vector usable as one column. | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # we just want a TRUE/FALSE: | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { | ||||
| require(plyr) | ||||
| require(stringr) | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # I don't just want a TRUE/FALSE! How can I do this? | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| ggplot(score_tw_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
| theme_minimal() | ||||
| View(score_tw_demo) | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| library(tidyverse) | ||||
| library(stringi) | ||||
| pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv") | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv") | ||||
| tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv") | ||||
| pm_text <- pm$content | ||||
| # logical index instead of -which(): -which() would drop EVERY element when no NA is present | ||||
| pm_text <- pm_text[!is.na(pm_text)]  # remove missing values | ||||
| length(grep("(ots)", pm_text)) == length(pm_text)  # every report contains "ots" | ||||
| pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - > | ||||
| pm_text <- gsub("( \\.\\.\\.)$", "", pm_text)  # remove < ...> | ||||
| content_ber <- rep(NA, nrow(pm)) | ||||
| content_ber[which(!is.na(pm$content))] <- pm_text | ||||
| pm <- cbind(pm, content_ber) | ||||
| pm_text <- gsub("[^[:alnum:] ]", "", pm_text) | ||||
| content_ber_satzzeichen <- rep(NA, nrow(pm)) | ||||
| content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text | ||||
| pm <- cbind(pm, content_ber_satzzeichen) | ||||
| head(pm) | ||||
| # csvpath <- <your path> | ||||
| # write_csv(pm, str_c(csvpath, "/pressemeldungen.csv")) | ||||
| # Add two cleaned text columns to pm_demo: content_ber (location prefix and | ||||
| # trailing " ..." removed) and content_ber_satzzeichen (additionally stripped | ||||
| # of everything except alphanumerics and spaces). | ||||
| pm_text <- pm_demo$content | ||||
| pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - > | ||||
| pm_text <- gsub("( \\.\\.\\.)$", "", pm_text)  # remove < ...> | ||||
| # gsub() keeps NA as NA, so pm_text is still row-aligned with pm_demo. | ||||
| # Assigning the full-length vector into only the non-NA rows would shift | ||||
| # texts to the wrong rows as soon as content contains a missing value. | ||||
| content_ber <- pm_text | ||||
| pm_demo <- cbind(pm_demo, content_ber) | ||||
| pm_text <- gsub("[^[:alnum:] ]", "", pm_text) | ||||
| content_ber_satzzeichen <- pm_text | ||||
| pm_demo <- cbind(pm_demo, content_ber_satzzeichen) | ||||
| head(pm_demo) | ||||
| readAndflattenSentiWS <- function(filename) { | ||||
| words = readLines(filename, encoding="UTF-8") | ||||
| words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words) | ||||
| words <- unlist(strsplit(words, ",")) | ||||
| words <- tolower(words) | ||||
| return(words) | ||||
| } | ||||
| pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/positive-words.txt")) | ||||
| neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/negative-words.txt")) | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { | ||||
| require(plyr) | ||||
| require(stringr) | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # I don't just want a TRUE/FALSE! How can I do this? | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| ggplot(score_tw_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
| theme_minimal() | ||||
| View(score_tw_demo) | ||||
| Ciew(score_pm_demo) | ||||
| View(score_pm_demo) | ||||
| score_pm_demo$text[3] | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| library(tidyverse) | ||||
| library(stringi) | ||||
| # Read in data | ||||
| pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv") | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| summary(pm) | ||||
| tweets <- read_csv("data/copbird_table_tweet.csv") | ||||
| tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4] | ||||
| usersX <- read_csv("data/copbird_table_user_ext.csv") | ||||
| # tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv") | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Polizei Oldenburg-Stadt/Ammerl"] <- "Oldenburg" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Polizei Mecklenburgische Seenp"] <- "Neubrandenburg" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Polizei Wilhelmshaven/Frieslan"] <- "Wilhelmshaven" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Baden-Württember"] <- "Stuttgart" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Landeskriminalamt Rheinland-Pf"] <- "Mainz" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Mitteldeutschlan"] <- "Pirna" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Polizei Delmenhorst/Oldenburg-"] <- "Delmenhorst" | ||||
| # tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Flughafen Frankf"] <- "Frankfurt" | ||||
| # blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv") | ||||
| # users <- read_csv("data/copbird_table_user.csv") | ||||
| # str(users) | ||||
| # users$name <- as.factor(users$name) | ||||
| # users$handle <- as.factor(users$handle) | ||||
| pm_orte <- pm %>% group_by(bundesland) %>% count(location) | ||||
| head(pm_orte) | ||||
| head(pm_orte %>% arrange(desc(n)), n = 20) | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| library(tidyverse) | ||||
| library(stringi) | ||||
| pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv") | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| tweets <- read_csv("data/copbird_table_tweet.csv") | ||||
| tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4] | ||||
| usersX <- read_csv("data/copbird_table_user_ext.csv") | ||||
| tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv") | ||||
| blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv") | ||||
| pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv") | ||||
| tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv") | ||||
| pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv") | ||||
| tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv") | ||||
| pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv") | ||||
| tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv") | ||||
| head(usersX) | ||||
| head(tweetXstate[, 5:8]) | ||||
| blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id) | ||||
| head(blaulicht[, -c(2, 5)]) | ||||
| land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id") | ||||
| land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_ | ||||
| land_tw <- land_tw %>% group_by(bundesland) %>% count() | ||||
| land_tw$bundesland <- as.factor(land_tw$bundesland) | ||||
| land_pm <- pm %>% group_by(bundesland) %>% count() | ||||
| land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin" | ||||
| land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland) | ||||
| land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland) | ||||
| land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland)) | ||||
| land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland") | ||||
| names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter") | ||||
| # logical index instead of -which(): -which() would drop EVERY row when no bundesland is NA | ||||
| land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ] | ||||
| land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0 | ||||
| land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland) | ||||
| ggplot(land_pm_tw) + | ||||
| geom_col(aes(x = bundesland, y = count, fill = Plattform)) + | ||||
| scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
| facet_wrap(~Plattform) + | ||||
| coord_flip() + | ||||
| guides(fill = FALSE) + | ||||
| labs(title = "Anzahl der Pressemeldungen und Tweets", | ||||
| subtitle = "Im Zeitraum April bis Mai 2021") + | ||||
| theme_minimal() | ||||
| ggplot(land_pm_tw) + | ||||
| geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") + | ||||
| scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
| coord_flip() + | ||||
| labs(title = "Anzahl der Pressemeldungen und Tweets", | ||||
| subtitle = "Im Zeitraum April bis Mai 2021") + | ||||
| theme_minimal() | ||||
| readAndflattenSentiWS <- function(filename) { | ||||
| words = readLines(filename, encoding="UTF-8") | ||||
| words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words) | ||||
| words <- unlist(strsplit(words, ",")) | ||||
| words <- tolower(words) | ||||
| return(words) | ||||
| } | ||||
| pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/positive-words.txt")) | ||||
| neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T), | ||||
| readAndflattenSentiWS("data/negative-words.txt")) | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { | ||||
| require(plyr) | ||||
| require(stringr) | ||||
| scores = laply(sentences, function(sentence, pos.words, neg.words) | ||||
| { | ||||
| # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
| sentence = gsub('[[:punct:]]', '', sentence) | ||||
| sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
| sentence = gsub('\\d+', '', sentence) | ||||
| # and convert to lower case: | ||||
| sentence = tolower(sentence) | ||||
| # split into words. str_split is in the stringr package | ||||
| word.list = str_split(sentence, '\\s+') | ||||
| # sometimes a list() is one level of hierarchy too much | ||||
| words = unlist(word.list) | ||||
| # compare our words to the dictionaries of positive & negative terms | ||||
| pos.matches = match(words, pos.words) | ||||
| neg.matches = match(words, neg.words) | ||||
| # match() returns the position of the matched term or NA | ||||
| # I don't just want a TRUE/FALSE! How can I do this? | ||||
| pos.matches = !is.na(pos.matches) | ||||
| neg.matches = !is.na(neg.matches) | ||||
| # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | ||||
| score = sum(pos.matches) - sum(neg.matches) | ||||
| return(score) | ||||
| }, | ||||
| pos.words, neg.words, .progress=.progress ) | ||||
| scores.df = data.frame(score=scores, text=sentences) | ||||
| return(scores.df) | ||||
| } | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| ggplot(score_tw_demo) + | ||||
| geom_bar(aes(x = score), fill = "blue") + | ||||
| labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
| theme_minimal() | ||||
| # score the drug-topic tables (previously the demonstration tables were scored again by mistake) | ||||
| score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words) | ||||
| score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_drogen) + | ||||
| geom_bar(aes(x = score), fill = "darkgreen") + | ||||
| labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| ggplot(score_tw_drogen) + | ||||
| geom_bar(aes(x = score), fill = "darkgreen") + | ||||
| labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
| theme_minimal() | ||||
| score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words) | ||||
| score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words) | ||||
| ggplot(score_pm_rass) + | ||||
| geom_bar(aes(x = score), fill = "purple") + | ||||
| labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
| theme_minimal() | ||||
| ggplot(score_tw_rass) + | ||||
| geom_bar(aes(x = score), fill = "purple") + | ||||
| labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") + | ||||
| theme_minimal() | ||||
|  | @ -0,0 +1,216 @@ | |||
| --- | ||||
| title: "Team 16" | ||||
| author: "Christian, Simon und Cuca" | ||||
| date: "23 5 2021" | ||||
| output: pdf_document | ||||
| --- | ||||
| 
 | ||||
| ```{r setup, include=FALSE} | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| ``` | ||||
| 
 | ||||
| # Daten einlesen | ||||
| ```{r, message = FALSE} | ||||
| library(tidyverse) | ||||
| library(stringi) | ||||
| 
 | ||||
| pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv") | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| 
 | ||||
| tweets <- read_csv("data/copbird_table_tweet.csv") | ||||
| tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4] | ||||
| usersX <- read_csv("data/copbird_table_user_ext.csv") | ||||
| tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv") | ||||
| blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv") | ||||
| 
 | ||||
| pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv") | ||||
| tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv") | ||||
| 
 | ||||
| pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv") | ||||
| tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv") | ||||
| 
 | ||||
| pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv") | ||||
| tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv") | ||||
| ``` | ||||
| 
 | ||||
| 
 | ||||
| # Scrapen der Pressemeldungen (seit Dezember 2020) | ||||
| 
 | ||||
| # Zuordnung von Orten der Pressemeldungen und Tweets | ||||
| ```{r} | ||||
| head(usersX) | ||||
| head(tweetXstate[, 5:8]) | ||||
| blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id) | ||||
| head(blaulicht[, -c(2, 5)]) | ||||
| ``` | ||||
| 
 | ||||
| # Anzahl Pressemeldungen vs. Tweets | ||||
| ```{r} | ||||
| land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id") | ||||
| land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_ | ||||
| land_tw <- land_tw %>% group_by(bundesland) %>% count() | ||||
| land_tw$bundesland <- as.factor(land_tw$bundesland) | ||||
| 
 | ||||
| land_pm <- pm %>% group_by(bundesland) %>% count() | ||||
| land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin" | ||||
| land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland) | ||||
| land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland) | ||||
| land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland)) | ||||
| 
 | ||||
| land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland") | ||||
| names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter") | ||||
| # logical index instead of -which(): -which() would drop EVERY row when no bundesland is NA | ||||
| land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ] | ||||
| land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0 | ||||
| land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland) | ||||
| 
 | ||||
| # Bar chart of counts per Bundesland, one facet per platform | ||||
| ggplot(land_pm_tw) + | ||||
|   geom_col(aes(x = bundesland, y = count, fill = Plattform)) + | ||||
|   scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
|   facet_wrap(~Plattform) + | ||||
|   coord_flip() + | ||||
|   # "none" hides the fill legend; guides(fill = FALSE) is deprecated in ggplot2 >= 3.3.4 | ||||
|   guides(fill = "none") + | ||||
|   labs(title = "Anzahl der Pressemeldungen und Tweets",  | ||||
|        subtitle = "Im Zeitraum April bis Mai 2021") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(land_pm_tw) + | ||||
|   geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") + | ||||
|   scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
|   coord_flip() + | ||||
|   labs(title = "Anzahl der Pressemeldungen und Tweets",  | ||||
|        subtitle = "Im Zeitraum April bis Mai 2021") + | ||||
|   theme_minimal() | ||||
| ``` | ||||
| 
 | ||||
| # Topic modelling | ||||
| ```{r, message=FALSE} | ||||
| # library(quanteda) | ||||
| # library(tidyverse) | ||||
| # library(topicmodels) | ||||
| # library(ldatuning) | ||||
| # library(stm) | ||||
| # library(wordcloud) | ||||
| #  | ||||
| # pm <- pm[!is.na(pm$content), ] | ||||
| # tok <- tokens(pm$content_ber_satzzeichen) | ||||
| # mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german")) | ||||
| # mydfm.trim <-  dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65) | ||||
| # # mydfm.trim | ||||
| #  | ||||
| # anzahl.themen <- 10 | ||||
| # anzahl.woerter <- 10 | ||||
| # dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels") | ||||
| # lda.modell <- LDA(dfm2topicmodels, anzahl.themen) | ||||
| # lda.modell | ||||
| # topmod <- as.data.frame(terms(lda.modell, anzahl.woerter)) | ||||
| # topmod | ||||
| #  | ||||
| # write_csv(topmod, "data/topicmodel.csv") | ||||
| ``` | ||||
| 
 | ||||
| ### Auswahl der Keywords | ||||
| `topic_1 = ['demonstr', 'kundgeb']` | ||||
| 
 | ||||
| `topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']` | ||||
| 
 | ||||
| `topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']` | ||||
| 
 | ||||
| `topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']` | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # Sentiment Analyse | ||||
| ```{r} | ||||
| # Read a SentiWS-style word list and flatten it into one lower-case character | ||||
| # vector. On each line the first "|TAG<TAB>number<TAB>" part is replaced by a | ||||
| # comma, then every line is split on commas. | ||||
| #   filename : path of the word list file | ||||
| # Returns a character vector of all comma-separated entries, lower-cased. | ||||
| readAndflattenSentiWS <- function(filename) { | ||||
|   raw_lines <- readLines(filename, encoding="UTF-8") | ||||
|   comma_lines <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines) | ||||
|   tolower(unlist(strsplit(comma_lines, ","))) | ||||
| } | ||||
| 
 | ||||
| pos.words <- c(scan("SentiWS/positive-words.txt",what='character', comment.char=';', quiet=T),  | ||||
|                readAndflattenSentiWS("SentiWS/positive-words.txt")) | ||||
| neg.words <- c(scan("SentiWS/negative-words.txt",what='character', comment.char=';', quiet=T),  | ||||
|               readAndflattenSentiWS("SentiWS/negative-words.txt")) | ||||
| 
 | ||||
| # Score each text as (number of positive words) - (number of negative words). | ||||
| #   sentences : character vector of texts to score | ||||
| #   pos.words : character vector of positive dictionary words (lower case) | ||||
| #   neg.words : character vector of negative dictionary words (lower case) | ||||
| #   .progress : name of the plyr progress bar, passed on to plyr::laply() | ||||
| # Returns a data.frame with columns `score` and `text`, one row per sentence. | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { | ||||
|   # Use plyr/stringr through their namespaces instead of require(): attaching | ||||
|   # plyr after the tidyverse masks dplyr verbs such as count() for all later | ||||
|   # code, and require() only warns when a package is missing. | ||||
|   stopifnot(requireNamespace("plyr", quietly = TRUE), | ||||
|             requireNamespace("stringr", quietly = TRUE)) | ||||
|   scores = plyr::laply(sentences, function(sentence, pos.words, neg.words) | ||||
|   { | ||||
|     # clean up sentences with R's regex-driven global substitute, gsub(): | ||||
|     sentence = gsub('[[:punct:]]', '', sentence) | ||||
|     sentence = gsub('[[:cntrl:]]', '', sentence) | ||||
|     sentence = gsub('\\d+', '', sentence) | ||||
|     # and convert to lower case: | ||||
|     sentence = tolower(sentence) | ||||
|     # split into words on runs of whitespace | ||||
|     word.list = stringr::str_split(sentence, '\\s+') | ||||
|     # str_split() returns a list; flatten it to a plain character vector | ||||
|     words = unlist(word.list) | ||||
|     # compare our words to the dictionaries of positive & negative terms | ||||
|     pos.matches = match(words, pos.words) | ||||
|     neg.matches = match(words, neg.words) | ||||
|     # match() returns the position of the matched term or NA; | ||||
|     # only hit / no hit per word is needed for the count: | ||||
|     pos.matches = !is.na(pos.matches) | ||||
|     neg.matches = !is.na(neg.matches) | ||||
|     # TRUE/FALSE are treated as 1/0 by sum(): | ||||
|     score = sum(pos.matches) - sum(neg.matches) | ||||
|     return(score) | ||||
|   }, | ||||
|   pos.words, neg.words, .progress=.progress ) | ||||
|   scores.df = data.frame(score=scores, text=sentences) | ||||
|   return(scores.df) | ||||
| } | ||||
| 
 | ||||
| score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words) | ||||
| score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words) | ||||
| 
 | ||||
| ggplot(score_pm_demo) + | ||||
|   geom_bar(aes(x = score), fill = "blue") + | ||||
|   labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(score_tw_demo) + | ||||
|   geom_bar(aes(x = score), fill = "blue") + | ||||
|   labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| # score the drug-topic tables (previously the demonstration tables were scored again by mistake) | ||||
| score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words) | ||||
| score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words) | ||||
| 
 | ||||
| ggplot(score_pm_drogen) + | ||||
|   geom_bar(aes(x = score), fill = "darkgreen") + | ||||
|   labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(score_tw_drogen) + | ||||
|   geom_bar(aes(x = score), fill = "darkgreen") + | ||||
|   labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words) | ||||
| score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words) | ||||
| 
 | ||||
| ggplot(score_pm_rass) + | ||||
|   geom_bar(aes(x = score), fill = "purple") + | ||||
|   labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(score_tw_rass) + | ||||
|   geom_bar(aes(x = score), fill = "purple") + | ||||
|   labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") + | ||||
|   theme_minimal() | ||||
| ``` | ||||
| 
 | ||||
| ```{r} | ||||
| sessionInfo() | ||||
| ``` | ||||
|  | @ -0,0 +1,216 @@ | |||
| --- | ||||
| title: "Team 16" | ||||
| author: "Christian, Simon und Cuca" | ||||
| date: "23 5 2021" | ||||
| output: pdf_document | ||||
| --- | ||||
| 
 | ||||
| ```{r setup, include=FALSE} | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| ``` | ||||
| 
 | ||||
| # Daten einlesen | ||||
| ```{r, message = FALSE} | ||||
| library(tidyverse) | ||||
| library(stringi) | ||||
| 
 | ||||
| pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv") | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv")) | ||||
| pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv")) | ||||
| pm_list <- lapply(pm_csv, read_csv) | ||||
| pm <- do.call(rbind, pm_list) | ||||
| 
 | ||||
| tweets <- read_csv("data/copbird_table_tweet.csv") | ||||
| tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4] | ||||
| usersX <- read_csv("data/copbird_table_user_ext.csv") | ||||
| tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv") | ||||
| blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv") | ||||
| 
 | ||||
| pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv") | ||||
| tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv") | ||||
| 
 | ||||
| pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv") | ||||
| tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv") | ||||
| 
 | ||||
| pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv") | ||||
| tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv") | ||||
| ``` | ||||
| 
 | ||||
| 
 | ||||
| # Scrapen der Pressemeldungen (seit Dezember 2020) | ||||
| 
 | ||||
| # Zuordnung von Orten der Pressemeldungen und Tweets | ||||
| ```{r} | ||||
| head(usersX) | ||||
| head(tweetXstate[, 5:8]) | ||||
| blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id) | ||||
| head(blaulicht[, -c(2, 5)]) | ||||
| ``` | ||||
| 
 | ||||
| # Anzahl Pressemeldungen vs. Tweets | ||||
| ```{r} | ||||
| # --- Tweets per federal state ------------------------------------------------ | ||||
| # Attach columns 1 and 4 of usersX to the tweets via user_id. | ||||
| # NOTE(review): full_join() also keeps accounts that have no tweet in `tweets`; | ||||
| # each of them contributes one row and is therefore counted once. Confirm that | ||||
| # this is intended before switching to left_join(). | ||||
| land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id") | ||||
| # The placeholder "-" is treated as a missing state. | ||||
| land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_ | ||||
| land_tw <- land_tw %>% group_by(bundesland) %>% count() | ||||
| land_tw$bundesland <- as.factor(land_tw$bundesland) | ||||
| 
 | ||||
| # --- Press releases per federal state ---------------------------------------- | ||||
| land_pm <- pm %>% group_by(bundesland) %>% count() | ||||
| # Harmonise the spelling with the Twitter table: merge the combined | ||||
| # "berlin-brandenburg" label into "berlin", title-case, and restore umlauts. | ||||
| land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin" | ||||
| land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland) | ||||
| land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland) | ||||
| # Use the factor levels of the Twitter table; names that do not match become NA. | ||||
| land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland)) | ||||
| 
 | ||||
| # --- Combine both sources into long format for plotting ---------------------- | ||||
| land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland") | ||||
| names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter") | ||||
| # Drop rows without a state. A logical index is used on purpose: | ||||
| # x[-which(cond), ] returns zero rows when no element satisfies cond, | ||||
| # because -integer(0) selects nothing. | ||||
| land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ] | ||||
| # States without any press release get a count of 0 instead of NA. | ||||
| land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0 | ||||
| land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland) | ||||
| 
 | ||||
| # Absolute number of press releases and tweets per state, one facet per platform. | ||||
| ggplot(land_pm_tw) + | ||||
|   geom_col(aes(x = bundesland, y = count, fill = Plattform)) + | ||||
|   scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
|   facet_wrap(~Plattform) + | ||||
|   coord_flip() + | ||||
|   # The facets already name the platform, so the fill legend is hidden. | ||||
|   # "none" replaces the deprecated `guides(fill = FALSE)`. | ||||
|   guides(fill = "none") + | ||||
|   labs(title = "Anzahl der Pressemeldungen und Tweets", | ||||
|        subtitle = "Im Zeitraum April bis Mai 2021") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| # Same data as a stacked bar per state; position = "fill" scales every bar to 1, | ||||
| # so the chart shows the share of each platform. | ||||
| ggplot(land_pm_tw, aes(x = bundesland, y = count, fill = Plattform)) + | ||||
|   geom_col(position = "fill") + | ||||
|   scale_fill_manual(values = c("#CC6699", "#0099CC")) + | ||||
|   coord_flip() + | ||||
|   labs( | ||||
|     title = "Anzahl der Pressemeldungen und Tweets", | ||||
|     subtitle = "Im Zeitraum April bis Mai 2021" | ||||
|   ) + | ||||
|   theme_minimal() | ||||
| ``` | ||||
| 
 | ||||
| # Topic modelling | ||||
| ```{r, message=FALSE} | ||||
| # library(quanteda) | ||||
| # library(tidyverse) | ||||
| # library(topicmodels) | ||||
| # library(ldatuning) | ||||
| # library(stm) | ||||
| # library(wordcloud) | ||||
| #  | ||||
| # pm <- pm[!is.na(pm$content), ] | ||||
| # tok <- tokens(pm$content_ber_satzzeichen) | ||||
| # mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german")) | ||||
| # mydfm.trim <-  dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65) | ||||
| # # mydfm.trim | ||||
| #  | ||||
| # anzahl.themen <- 10 | ||||
| # anzahl.woerter <- 10 | ||||
| # dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels") | ||||
| # lda.modell <- LDA(dfm2topicmodels, anzahl.themen) | ||||
| # lda.modell | ||||
| # topmod <- as.data.frame(terms(lda.modell, anzahl.woerter)) | ||||
| # topmod | ||||
| #  | ||||
| # write_csv(topmod, "data/topicmodel.csv") | ||||
| ``` | ||||
| 
 | ||||
| ### Auswahl der Keywords | ||||
| `topic_1 = ['demonstr', 'kundgeb']` | ||||
| 
 | ||||
| `topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']` | ||||
| 
 | ||||
| `topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']` | ||||
| 
 | ||||
| `topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']` | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # Sentiment Analyse | ||||
| ```{r} | ||||
| # Read a word list and flatten it into one lower-case character vector. | ||||
| # On every line the first "|TAG<tab>number<tab>" part is replaced by a comma, | ||||
| # then the line is split at commas, so the leading word and the comma-separated | ||||
| # forms that follow it all become individual entries. | ||||
| # | ||||
| # filename: path of the file to read (read as UTF-8). | ||||
| # Returns: character vector of lower-cased entries. | ||||
| readAndflattenSentiWS <- function(filename) { | ||||
|   raw_lines <- readLines(filename, encoding="UTF-8") | ||||
|   comma_separated <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines) | ||||
|   tolower(unlist(strsplit(comma_separated, ","))) | ||||
| } | ||||
| 
 | ||||
| # Build the positive and negative dictionaries. Each file is read twice and the | ||||
| # results are concatenated: once with scan() (whitespace-separated tokens, text | ||||
| # after ';' treated as a comment) and once with readAndflattenSentiWS(). | ||||
| # NOTE(review): the scan() tokens are neither split at '|' or ',' nor | ||||
| # lower-cased, so they presumably add nothing beyond the flattened list; confirm | ||||
| # against the actual file format before removing the scan() calls. | ||||
| pos.words <- c(scan("SentiWS/positive-words.txt",what='character', comment.char=';', quiet=T),  | ||||
|                readAndflattenSentiWS("SentiWS/positive-words.txt")) | ||||
| neg.words <- c(scan("SentiWS/negative-words.txt",what='character', comment.char=';', quiet=T),  | ||||
|               readAndflattenSentiWS("SentiWS/negative-words.txt")) | ||||
| 
 | ||||
| # Dictionary-based sentiment score per text: | ||||
| #   score = (number of words found in pos.words) - (number found in neg.words) | ||||
| # | ||||
| # sentences:  character vector of texts to score. | ||||
| # pos.words:  character vector of positive terms (compared against lower-cased text). | ||||
| # neg.words:  character vector of negative terms (compared against lower-cased text). | ||||
| # .progress:  progress-bar name, passed through to plyr::laply(). | ||||
| # Returns:    data.frame with the columns `score` and `text` (the unmodified input). | ||||
| score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { | ||||
|   # plyr and stringr are called via `::` instead of require(): attaching plyr | ||||
|   # from inside this function would mask dplyr verbs (count, summarise, | ||||
|   # mutate, ...) for every chunk that runs afterwards, and require() only | ||||
|   # warns when a package is missing. | ||||
|   scores = plyr::laply(sentences, function(sentence, pos.words, neg.words) | ||||
|   { | ||||
|     # remove punctuation | ||||
|     sentence = gsub('[[:punct:]]', '', sentence) | ||||
|     # control characters include newline and tab; replace them with a space | ||||
|     # so that the words on either side are not fused into one token | ||||
|     sentence = gsub('[[:cntrl:]]', ' ', sentence) | ||||
|     # remove digits | ||||
|     sentence = gsub('\\d+', '', sentence) | ||||
|     # the dictionaries are matched against lower-cased text | ||||
|     sentence = tolower(sentence) | ||||
|     # split at whitespace; str_split() returns a list, hence unlist() | ||||
|     words = unlist(stringr::str_split(sentence, '\\s+')) | ||||
|     # match() yields a position or NA; only "found / not found" is needed | ||||
|     pos.matches = !is.na(match(words, pos.words)) | ||||
|     neg.matches = !is.na(match(words, neg.words)) | ||||
|     # TRUE/FALSE count as 1/0 in sum() | ||||
|     score = sum(pos.matches) - sum(neg.matches) | ||||
|     return(score) | ||||
|   }, | ||||
|   pos.words, neg.words, .progress=.progress ) | ||||
|   scores.df = data.frame(score=scores, text=sentences) | ||||
|   return(scores.df) | ||||
| } | ||||
| 
 | ||||
| # Topic "Demonstrationen": score the press releases and the tweets, then show | ||||
| # the distribution of the scores as bar charts. | ||||
| score_pm_demo <- score.sentiment(sentences = pm_demo$content, pos.words = pos.words, neg.words = neg.words) | ||||
| score_tw_demo <- score.sentiment(sentences = tw_demo$tweet_text, pos.words = pos.words, neg.words = neg.words) | ||||
| 
 | ||||
| ggplot(score_pm_demo, aes(x = score)) + | ||||
|   geom_bar(fill = "blue") + | ||||
|   labs( | ||||
|     title = "Topic: Demonstrationen", | ||||
|     subtitle = "Sentiment-Analyse der Pressemeldungen" | ||||
|   ) + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(score_tw_demo, aes(x = score)) + | ||||
|   geom_bar(fill = "blue") + | ||||
|   labs( | ||||
|     title = "Topic: Demonstrationen", | ||||
|     subtitle = "Sentiment-Analyse der Tweets" | ||||
|   ) + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| # Topic "Drogen": score the drug-related press releases and tweets. | ||||
| # The inputs must be pm_drogen / tw_drogen; scoring pm_demo / tw_demo here | ||||
| # would merely repeat the demonstration results under the "Drogen" title. | ||||
| score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words) | ||||
| score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words) | ||||
| 
 | ||||
| # Distribution of the scores for the press releases. | ||||
| ggplot(score_pm_drogen) + | ||||
|   geom_bar(aes(x = score), fill = "darkgreen") + | ||||
|   labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| # Distribution of the scores for the tweets. | ||||
| ggplot(score_tw_drogen) + | ||||
|   geom_bar(aes(x = score), fill = "darkgreen") + | ||||
|   labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| # Topic "Rassismus": score the press releases and the tweets, then show the | ||||
| # distribution of the scores as bar charts. | ||||
| score_pm_rass <- score.sentiment(sentences = pm_rass$content, pos.words = pos.words, neg.words = neg.words) | ||||
| score_tw_rass <- score.sentiment(sentences = tw_rass$tweet_text, pos.words = pos.words, neg.words = neg.words) | ||||
| 
 | ||||
| ggplot(score_pm_rass, aes(x = score)) + | ||||
|   geom_bar(fill = "purple") + | ||||
|   labs( | ||||
|     title = "Topic: Rassismus", | ||||
|     subtitle = "Sentiment-Analyse der Pressemeldungen" | ||||
|   ) + | ||||
|   theme_minimal() | ||||
| 
 | ||||
| ggplot(score_tw_rass, aes(x = score)) + | ||||
|   geom_bar(fill = "purple") + | ||||
|   labs( | ||||
|     title = "Topic: Rassismus", | ||||
|     subtitle = "Sentiment-Analyse der Tweets" | ||||
|   ) + | ||||
|   theme_minimal() | ||||
| ``` | ||||
| 
 | ||||
| ```{r} | ||||
| sessionInfo() | ||||
| ``` | ||||
|  | @ -0,0 +1,13 @@ | |||
| Version: 1.0 | ||||
| 
 | ||||
| RestoreWorkspace: Default | ||||
| SaveWorkspace: Default | ||||
| AlwaysSaveHistory: Default | ||||
| 
 | ||||
| EnableCodeIndexing: Yes | ||||
| UseSpacesForTab: Yes | ||||
| NumSpacesForTab: 2 | ||||
| Encoding: UTF-8 | ||||
| 
 | ||||
| RnwWeave: Sweave | ||||
| LaTeX: pdfLaTeX | ||||
							
								
								
									
										131
									
								
								ergebnisse_hackathon_repo/team-16/requirements.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								ergebnisse_hackathon_repo/team-16/requirements.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,131 @@ | |||
| appdirs==1.4.4 | ||||
| argon2-cffi==20.1.0 | ||||
| async-generator==1.10 | ||||
| attrs==21.2.0 | ||||
| backcall==0.2.0 | ||||
| beautifulsoup4==4.9.3 | ||||
| bleach==3.3.0 | ||||
| blis==0.7.4 | ||||
| branca==0.4.2 | ||||
| bs4==0.0.1 | ||||
| catalogue==2.0.4 | ||||
| certifi==2020.12.5 | ||||
| cffi==1.14.5 | ||||
| chardet==4.0.0 | ||||
| click==7.1.2 | ||||
| cssselect==1.1.0 | ||||
| cycler==0.10.0 | ||||
| cymem==2.0.5 | ||||
| decorator==4.4.2 | ||||
| defusedxml==0.7.1 | ||||
| dill==0.3.3 | ||||
| docker==4.4.4 | ||||
| emoji==0.6.0 | ||||
| entrypoints==0.3 | ||||
| fake-useragent==0.1.11 | ||||
| filelock==3.0.12 | ||||
| folium==0.12.1 | ||||
| funcy==1.16 | ||||
| future==0.18.2 | ||||
| germansentiment==1.0.5 | ||||
| huggingface-hub==0.0.8 | ||||
| idna==2.10 | ||||
| ipykernel==5.5.5 | ||||
| ipython==7.23.1 | ||||
| ipython-genutils==0.2.0 | ||||
| ipywidgets==7.6.3 | ||||
| jedi==0.18.0 | ||||
| Jinja2 | ||||
| joblib==1.0.1 | ||||
| jsonpickle==2.0.0 | ||||
| jsonschema==3.2.0 | ||||
| jupyter==1.0.0 | ||||
| jupyter-client==6.1.12 | ||||
| jupyter-console==6.4.0 | ||||
| jupyter-core==4.7.1 | ||||
| jupyterlab-pygments==0.1.2 | ||||
| jupyterlab-widgets==1.0.0 | ||||
| kiwisolver==1.3.1 | ||||
| loguru==0.5.3 | ||||
| lxml==4.6.3 | ||||
| MarkupSafe==2.0.1 | ||||
| matplotlib==3.4.2 | ||||
| matplotlib-inline==0.1.2 | ||||
| mistune==0.8.4 | ||||
| multiprocess==0.70.11.1 | ||||
| murmurhash==1.0.5 | ||||
| nbclient==0.5.3 | ||||
| nbconvert==6.0.7 | ||||
| nbformat==5.1.3 | ||||
| nest-asyncio==1.5.1 | ||||
| networkx==2.5.1 | ||||
| nitter-scraper==0.5.0 | ||||
| notebook==6.4.0 | ||||
| numexpr==2.7.3 | ||||
| numpy==1.20.3 | ||||
| packaging==20.9 | ||||
| pandas==1.2.4 | ||||
| pandocfilters==1.4.3 | ||||
| parse==1.19.0 | ||||
| parso==0.8.2 | ||||
| pathy==0.5.2 | ||||
| pendulum==2.1.2 | ||||
| pexpect==4.8.0 | ||||
| pickleshare==0.7.5 | ||||
| Pillow==8.2.0 | ||||
| preshed==3.0.5 | ||||
| prometheus-client==0.10.1 | ||||
| prompt-toolkit==3.0.18 | ||||
| ptyprocess==0.7.0 | ||||
| pycparser==2.20 | ||||
| pydantic==1.7.4 | ||||
| pyee==8.1.0 | ||||
| Pygments==2.9.0 | ||||
| pyLDAvis==3.3.1 | ||||
| pyparsing==2.4.7 | ||||
| pyppeteer==0.2.5 | ||||
| pyquery==1.4.3 | ||||
| pyrsistent==0.17.3 | ||||
| python-dateutil==2.8.1 | ||||
| pytz==2021.1 | ||||
| pytzdata==2020.1 | ||||
| pyvis==0.1.9 | ||||
| pyzmq==22.0.3 | ||||
| qtconsole==5.1.0 | ||||
| QtPy==1.9.0 | ||||
| regex==2021.4.4 | ||||
| requests==2.25.1 | ||||
| requests-html==0.10.0 | ||||
| sacremoses==0.0.45 | ||||
| scikit-learn==0.24.2 | ||||
| scipy==1.6.3 | ||||
| seaborn==0.11.1 | ||||
| Send2Trash==1.5.0 | ||||
| six==1.16.0 | ||||
| # sklearn==0.0  (disabled: deprecated placeholder package on PyPI whose installation now fails; scikit-learn==0.24.2 is already pinned in this file) | ||||
| smart-open | ||||
| soupsieve==2.2.1 | ||||
| spacy==3.0.6 | ||||
| spacy-legacy==3.0.5 | ||||
| spacymoji==3.0.1 | ||||
| srsly==2.4.1 | ||||
| terminado==0.10.0 | ||||
| testpath==0.5.0 | ||||
| thinc==8.0.3 | ||||
| threadpoolctl==2.1.0 | ||||
| tokenizers==0.10.2 | ||||
| torch==1.8.1 | ||||
| tornado==6.1 | ||||
| tqdm==4.60.0 | ||||
| traitlets==5.0.5 | ||||
| transformers==4.6.0 | ||||
| typer==0.3.2 | ||||
| typing-extensions==3.10.0.0 | ||||
| urllib3==1.26.4 | ||||
| w3lib==1.22.0 | ||||
| wasabi==0.8.2 | ||||
| wcwidth==0.2.5 | ||||
| webencodings==0.5.1 | ||||
| websocket-client==1.0.0 | ||||
| websockets==8.1 | ||||
| widgetsnbextension==3.5.1 | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue