Compare commits
No commits in common. "36de4fdf81e939243787fdbffedc366da4437e9f" and "abe05ce248d72f9d7027dba1972c8b163a771630" have entirely different histories.
36de4fdf81
...
abe05ce248
8 changed files with 529 additions and 334150 deletions
|
@ -1,50 +0,0 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Merging the different tables of the old (~2021) and new (~2022) scraper
|
||||
|
||||
## cols: hashtag, url, mention (same for both)
|
||||
tweets_meta = pd.concat([pd.read_csv("data/entity_old.tsv", sep = "\t"), # data from old scraper
|
||||
pd.read_csv("data/tweets.csv")]) # data from new scraper
|
||||
|
||||
## cols: id, tweet_text, created_at, user_id; only subset from old table (same tsv used in next step)
|
||||
tweets_text = pd.concat([pd.read_csv("data/tweet_old.tsv", sep = "\t")[['id','tweet_text', 'created_at', 'user_id']].rename(columns = {"id":"tweet_id"}),
|
||||
pd.read_csv("data/tweets-1679742698645.csv")])
|
||||
|
||||
## cols: id, like_count, retweet_count, reply_count, quote_count; only subset from old table
|
||||
tweets_statistics = pd.concat([pd.read_csv("data/tweet_old.tsv", sep = "\t")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns = {"id":"tweet_id"}),
|
||||
pd.read_csv("data/tweets-1679742620302.csv")])
|
||||
|
||||
## cols: user_id, handle, user_name; column names do not match between old and new data. Even username and handle seem to be mixed up in the new data set (inverse order)
|
||||
## Info: Only a small number of user_ids appear in both data sets, but where they do, the username has occasionally changed and therefore the tables cannot easily be merged
|
||||
tweets_user = pd.read_csv("data/user_old.tsv",
|
||||
sep = "\t").rename(columns = {"id":"user_id","name": "user_name"} # uniform names
|
||||
).merge(pd.read_csv("data/tweets-1679742702794.csv" # merge with renamed new data
|
||||
).rename(columns = {"username":"handle", "handle": "user_name"}), # reverse col names
|
||||
on = "user_id", # user_id as matching column
|
||||
how = "outer", # keep all unique uer_ids
|
||||
suffixes = ["_2021", "_2022"]) # identify column where username and label came from
|
||||
|
||||
## Some usernames corresponding to one user_id have changed over time. For easier handling, only the latest username and handle are kept.
|
||||
tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),
|
||||
user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)
|
||||
).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # no longer needed
|
||||
|
||||
## Additional information concerning the police stations
|
||||
## cols: handle, name, typ, bundesland, stadt, lat, long
|
||||
police_stations = pd.read_csv("data/polizei_accounts_geo.csv", sep = "\t"
|
||||
).rename(columns = {"Polizei Account": "handle"})
|
||||
|
||||
# Merge statistics, tweet text and user information in one data frame
|
||||
tweets_combined = pd.merge(tweets_statistics,
|
||||
tweets_text,
|
||||
on = 'tweet_id').merge(tweets_user, on = 'user_id'
|
||||
).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)
|
||||
|
||||
# Convert datatypes to appropriate one
|
||||
# Cast the engagement counters to pandas' nullable integer dtype.
# The previous ``fillna(np.NAN).astype(int)`` could not handle a missing
# counter: filling NaN with NaN is a no-op, and a plain ``int`` cast raises
# on NaN. ``Int64`` keeps missing values as <NA> instead of failing or
# inventing a sentinel value.
count_columns = ['like_count', 'retweet_count', 'reply_count', 'quote_count']
tweets_combined[count_columns] = tweets_combined[count_columns].astype('Int64')
|
||||
tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format
|
||||
created_at = pd.to_datetime(tweets_combined['created_at']),
|
||||
handle = tweets_combined['handle'].str.lower(), # handle to lower case
|
||||
is_deleted = tweets_combined['is_deleted'].astype('boolean')) # is deleted column as boolean variable
|
||||
# Reference the combined frame; the CSV export stays commented out, as before.
# The previous ``tweets_combined.#to_csv(...)`` was a syntax error: the comment
# marker cut the attribute access off right after the dot.
tweets_combined  # .to_csv("data/tweets_all_combined.csv")
|
|
@ -12,7 +12,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -37,7 +37,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": 117,
|
||||
"id": "fcc48831-7999-4d79-b722-736715b1ced6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -46,50 +46,48 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((479991, 3), (151690, 8), (151690, 4), (13327, 3), (163, 7))"
|
||||
"((479991, 3), (151690, 8), (151690, 4), (13327, 3))"
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"execution_count": 117,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Merging different table of old (~2021) and new (~2022) scraper\n",
|
||||
"\n",
|
||||
"## cols: hashtag, url, mention (same for both)\n",
|
||||
"tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n",
|
||||
" pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n",
|
||||
"\n",
|
||||
"## cols: id, tweet_text, created_at, user_id; only subset from old table (same tsv used in next step)\n",
|
||||
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id','tweet_text', 'created_at', 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
|
||||
" 'tweet_text', \n",
|
||||
" 'created_at', \n",
|
||||
" 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
" pd.read_csv(\"data/tweets-1679742698645.csv\")])\n",
|
||||
"\n",
|
||||
"## cols: id, like_count, retweet_count, reply_count, quote_count; only subset from old table\n",
|
||||
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
|
||||
" 'like_count', \n",
|
||||
" 'retweet_count', \n",
|
||||
" 'reply_count', \n",
|
||||
" 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
" pd.read_csv(\"data/tweets-1679742620302.csv\")])\n",
|
||||
"\n",
|
||||
"## cols: user_id, handle, user_name; colnames do not match betweend old an new data. Even username and handle seem to be mixed up in new data set (inverse order)\n",
|
||||
"## Info: Only a small amount of user_ids appear in both data sets, but if so username occasionaly have changed an therefore can not easily be merged\n",
|
||||
"tweets_user = pd.read_csv(\"data/user_old.tsv\", \n",
|
||||
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} # uniform names\n",
|
||||
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\" # merge with renamed new data\n",
|
||||
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}), # reverse col names\n",
|
||||
" on = \"user_id\", # user_id as matching column\n",
|
||||
" how = \"outer\", # keep all unique uer_ids\n",
|
||||
" suffixes = [\"_2021\", \"_2022\"]) # identify column where username and label came from\n",
|
||||
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"}\n",
|
||||
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\"\n",
|
||||
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}),\n",
|
||||
" on = \"user_id\",\n",
|
||||
" how = \"outer\",\n",
|
||||
" suffixes = [\"_2021\", \"_2022\"])\n",
|
||||
"\n",
|
||||
"## Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept.\n",
|
||||
"# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n",
|
||||
"tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n",
|
||||
" user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n",
|
||||
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # no longer needed\n",
|
||||
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n",
|
||||
"\n",
|
||||
"## addiditional information concerning the police stations\n",
|
||||
"## cols: handle, name, typ, bundesland, stadt, lat, long\n",
|
||||
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" \n",
|
||||
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n",
|
||||
" ).rename(columns = {\"Polizei Account\": \"handle\"})\n",
|
||||
"\n",
|
||||
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape, police_stations.shape"
|
||||
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -102,14 +100,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 118,
|
||||
"id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Merge statistics, tweet text and user information in one data frame\n",
|
||||
"# Merge like statistics, tweet text and user information in one data frame\n",
|
||||
"tweets_combined = pd.merge(tweets_statistics, \n",
|
||||
" tweets_text,\n",
|
||||
" on = 'tweet_id').merge(tweets_user, on = 'user_id'\n",
|
||||
|
@ -119,12 +117,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"execution_count": 119,
|
||||
"id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
|
||||
" output = repr(obj)\n",
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
|
||||
" return method()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
|
@ -169,8 +177,8 @@
|
|||
" <td>2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>@mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@mahanna196 Da die Stadt keine Ausnahme für Ra...</td>\n",
|
||||
" <td>2020-10-27 09:29:13</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -184,7 +192,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@mahanna196 Ja. *sr</td>\n",
|
||||
" <td>2020-10-27 10:35:38</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
|
@ -199,8 +207,8 @@
|
|||
" <td>3</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausge...</td>\n",
|
||||
" <td>2020-10-27 12:36:26</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -214,8 +222,8 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/...</td>\n",
|
||||
" <td>2020-10-27 12:59:06</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -229,8 +237,8 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>In der vergangenen Woche wurde die Wohnung des...</td>\n",
|
||||
" <td>2020-10-27 13:57:32</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -260,7 +268,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-19 13:40:36</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze...</td>\n",
|
||||
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...</td>\n",
|
||||
" <td>2023-02-15 12:06:07</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -275,7 +283,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-25 13:14:49</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp; Drogen + ...</td>\n",
|
||||
" <td>Unser Präventionsteam vom #A44 berät heute und...</td>\n",
|
||||
" <td>2023-02-21 12:10:00</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -290,7 +298,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-25 13:14:49</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp; Drogen im Straßenverkehr beraten...</td>\n",
|
||||
" <td>Auch unser #A52 war heute aktiv und hat zum Th...</td>\n",
|
||||
" <td>2023-02-21 12:12:48</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -305,7 +313,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-26 13:15:05</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb...</td>\n",
|
||||
" <td>Gestern führte unser #A13 in einer Wohnsiedlun...</td>\n",
|
||||
" <td>2023-02-22 11:15:58</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -320,7 +328,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-27 12:17:33</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk...</td>\n",
|
||||
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr....</td>\n",
|
||||
" <td>2023-02-23 10:53:07</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -346,11 +354,11 @@
|
|||
"151689 1628709531998998529 10 1 0 \n",
|
||||
"\n",
|
||||
" quote_count measured_at is_deleted \\\n",
|
||||
"0 0 NaT <NA> \n",
|
||||
"1 0 NaT <NA> \n",
|
||||
"2 0 NaT <NA> \n",
|
||||
"3 0 NaT <NA> \n",
|
||||
"4 0 NaT <NA> \n",
|
||||
"0 0 NaT NaN \n",
|
||||
"1 0 NaT NaN \n",
|
||||
"2 0 NaT NaN \n",
|
||||
"3 0 NaT NaN \n",
|
||||
"4 0 NaT NaN \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 0 2023-02-19 13:40:36 False \n",
|
||||
"151686 0 2023-02-25 13:14:49 False \n",
|
||||
|
@ -358,31 +366,31 @@
|
|||
"151688 0 2023-02-26 13:15:05 False \n",
|
||||
"151689 0 2023-02-27 12:17:33 False \n",
|
||||
"\n",
|
||||
" tweet_text \\\n",
|
||||
"0 @mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a... \n",
|
||||
"1 @mahanna196 Ja. *sr \n",
|
||||
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F... \n",
|
||||
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr \n",
|
||||
"4 In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be... \n",
|
||||
"... ... \n",
|
||||
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze... \n",
|
||||
"151686 Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol & Drogen + ... \n",
|
||||
"151687 Auch unser #A52 war heute aktiv und hat zum Thema Alkohol & Drogen im Straßenverkehr beraten... \n",
|
||||
"151688 Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb... \n",
|
||||
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk... \n",
|
||||
" tweet_text created_at \\\n",
|
||||
"0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n",
|
||||
"1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n",
|
||||
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n",
|
||||
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n",
|
||||
"4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n",
|
||||
"... ... ... \n",
|
||||
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n",
|
||||
"151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n",
|
||||
"151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n",
|
||||
"151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n",
|
||||
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n",
|
||||
"\n",
|
||||
" created_at user_id handle \\\n",
|
||||
"0 2020-10-27 09:29:13 778895426007203840 polizei_ol \n",
|
||||
"1 2020-10-27 10:35:38 778895426007203840 polizei_ol \n",
|
||||
"2 2020-10-27 12:36:26 778895426007203840 polizei_ol \n",
|
||||
"3 2020-10-27 12:59:06 778895426007203840 polizei_ol \n",
|
||||
"4 2020-10-27 13:57:32 778895426007203840 polizei_ol \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 2023-02-15 12:06:07 1168873095614160896 polizeiberlin_p \n",
|
||||
"151686 2023-02-21 12:10:00 1168873095614160896 polizeiberlin_p \n",
|
||||
"151687 2023-02-21 12:12:48 1168873095614160896 polizeiberlin_p \n",
|
||||
"151688 2023-02-22 11:15:58 1168873095614160896 polizeiberlin_p \n",
|
||||
"151689 2023-02-23 10:53:07 1168873095614160896 polizeiberlin_p \n",
|
||||
" user_id handle \\\n",
|
||||
"0 778895426007203840 polizei_ol \n",
|
||||
"1 778895426007203840 polizei_ol \n",
|
||||
"2 778895426007203840 polizei_ol \n",
|
||||
"3 778895426007203840 polizei_ol \n",
|
||||
"4 778895426007203840 polizei_ol \n",
|
||||
"... ... ... \n",
|
||||
"151685 1168873095614160896 polizeiberlin_p \n",
|
||||
"151686 1168873095614160896 polizeiberlin_p \n",
|
||||
"151687 1168873095614160896 polizeiberlin_p \n",
|
||||
"151688 1168873095614160896 polizeiberlin_p \n",
|
||||
"151689 1168873095614160896 polizeiberlin_p \n",
|
||||
"\n",
|
||||
" user_name \n",
|
||||
"0 Polizei Oldenburg-Stadt/Ammerland \n",
|
||||
|
@ -400,19 +408,19 @@
|
|||
"[151690 rows x 12 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"execution_count": 119,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Convert datatypes to appropriate one\n",
|
||||
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(np.NAN).astype(int)\n",
|
||||
"# Convert Counts to integer values\n",
|
||||
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n",
|
||||
"tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n",
|
||||
" created_at = pd.to_datetime(tweets_combined['created_at']),\n",
|
||||
" handle = tweets_combined['handle'].str.lower(), # handle to lower case\n",
|
||||
" is_deleted = tweets_combined['is_deleted'].astype('boolean')) # is deleted column as boolean variable\n",
|
||||
"tweets_combined#.to_csv(\"data/tweets_all_combined.csv\")"
|
||||
" handle = tweets_combined['handle'].str.lower(),\n",
|
||||
" is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n",
|
||||
"tweets_combined"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -464,7 +472,7 @@
|
|||
"source": [
|
||||
"## Metadaten \n",
|
||||
"\n",
|
||||
"Welche Daten bilden die Grundlage?\n"
|
||||
"Welche Daten bilden die Grundlage?"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -529,7 +537,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 114,
|
||||
"id": "4f1e8c6c-3610-436e-899e-4d0307259230",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -560,7 +568,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 122,
|
||||
"id": "9373552e-6baf-46df-ae16-c63603e20a83",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -673,7 +681,7 @@
|
|||
"61 Hamburg 53.550341 10.000654 "
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 122,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -702,23 +710,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 123,
|
||||
"id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n",
|
||||
" for col_name, dtype in df.dtypes.iteritems():\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"\n",
|
||||
"<div id=\"altair-viz-7b78525a62b243eca7b1f4044a328f47\"></div>\n",
|
||||
"<div id=\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\"></div>\n",
|
||||
"<script type=\"text/javascript\">\n",
|
||||
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
|
||||
" (function(spec, embedOpt){\n",
|
||||
" let outputDiv = document.currentScript.previousElementSibling;\n",
|
||||
" if (outputDiv.id !== \"altair-viz-7b78525a62b243eca7b1f4044a328f47\") {\n",
|
||||
" outputDiv = document.getElementById(\"altair-viz-7b78525a62b243eca7b1f4044a328f47\");\n",
|
||||
" if (outputDiv.id !== \"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\") {\n",
|
||||
" outputDiv = document.getElementById(\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\");\n",
|
||||
" }\n",
|
||||
" const paths = {\n",
|
||||
" \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
|
||||
|
@ -764,14 +780,14 @@
|
|||
" .catch(showError)\n",
|
||||
" .then(() => displayChart(vegaEmbed));\n",
|
||||
" }\n",
|
||||
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"ordinal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
|
||||
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"nominal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
|
||||
"</script>"
|
||||
],
|
||||
"text/plain": [
|
||||
"alt.Chart(...)"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 123,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -779,7 +795,7 @@
|
|||
"source": [
|
||||
"barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n",
|
||||
" x = 'count:Q',\n",
|
||||
" y = alt.Y('handle:O', sort = '-x'),\n",
|
||||
" y = alt.Y('handle:N', sort = '-x'),\n",
|
||||
")\n",
|
||||
"barchart "
|
||||
]
|
||||
|
@ -796,7 +812,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 125,
|
||||
"id": "d0549250-b11f-4762-8500-1134c53303b4",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -804,377 +820,22 @@
|
|||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>index</th>\n",
|
||||
" <th>tweet_id</th>\n",
|
||||
" <th>like_count</th>\n",
|
||||
" <th>retweet_count</th>\n",
|
||||
" <th>reply_count</th>\n",
|
||||
" <th>quote_count</th>\n",
|
||||
" <th>measured_at</th>\n",
|
||||
" <th>is_deleted</th>\n",
|
||||
" <th>tweet_text</th>\n",
|
||||
" <th>created_at</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>handle</th>\n",
|
||||
" <th>user_name</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Typ</th>\n",
|
||||
" <th>Bundesland</th>\n",
|
||||
" <th>Stadt</th>\n",
|
||||
" <th>LAT</th>\n",
|
||||
" <th>LONG</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>3053</td>\n",
|
||||
" <td>1609539240458878979</td>\n",
|
||||
" <td>21455</td>\n",
|
||||
" <td>1845</td>\n",
|
||||
" <td>3643</td>\n",
|
||||
" <td>341</td>\n",
|
||||
" <td>2023-01-05 14:44:34</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une...</td>\n",
|
||||
" <td>2023-01-01 13:17:13</td>\n",
|
||||
" <td>2397974054</td>\n",
|
||||
" <td>polizeiberlin</td>\n",
|
||||
" <td>Polizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1331</td>\n",
|
||||
" <td>1355179228396879872</td>\n",
|
||||
" <td>19186</td>\n",
|
||||
" <td>3386</td>\n",
|
||||
" <td>1203</td>\n",
|
||||
" <td>628</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...</td>\n",
|
||||
" <td>2021-01-29 15:41:20</td>\n",
|
||||
" <td>2397974054</td>\n",
|
||||
" <td>polizeiberlin</td>\n",
|
||||
" <td>Polizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>91693</td>\n",
|
||||
" <td>1505620459148173316</td>\n",
|
||||
" <td>15708</td>\n",
|
||||
" <td>7098</td>\n",
|
||||
" <td>186</td>\n",
|
||||
" <td>540</td>\n",
|
||||
" <td>2022-03-24 20:15:08</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...</td>\n",
|
||||
" <td>2022-03-20 19:01:05</td>\n",
|
||||
" <td>2389161066</td>\n",
|
||||
" <td>polizei_nrw_bn</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Bonn</td>\n",
|
||||
" <td>50.735851</td>\n",
|
||||
" <td>7.10066</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>91695</td>\n",
|
||||
" <td>1505620666476896259</td>\n",
|
||||
" <td>10337</td>\n",
|
||||
" <td>1539</td>\n",
|
||||
" <td>59</td>\n",
|
||||
" <td>35</td>\n",
|
||||
" <td>2022-03-24 20:15:08</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...</td>\n",
|
||||
" <td>2022-03-20 19:01:54</td>\n",
|
||||
" <td>2389161066</td>\n",
|
||||
" <td>polizei_nrw_bn</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Bonn</td>\n",
|
||||
" <td>50.735851</td>\n",
|
||||
" <td>7.10066</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>122631</td>\n",
|
||||
" <td>1359098196434292739</td>\n",
|
||||
" <td>9471</td>\n",
|
||||
" <td>642</td>\n",
|
||||
" <td>128</td>\n",
|
||||
" <td>102</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2</td>\n",
|
||||
" <td>2021-02-09 11:13:55</td>\n",
|
||||
" <td>4876039738</td>\n",
|
||||
" <td>bpol_b</td>\n",
|
||||
" <td>Bundespolizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151685</th>\n",
|
||||
" <td>7569</td>\n",
|
||||
" <td>1332625325654757377</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...</td>\n",
|
||||
" <td>2020-11-28 10:00:11</td>\n",
|
||||
" <td>223758384</td>\n",
|
||||
" <td>polizeisachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Sachsen</td>\n",
|
||||
" <td>Dresden</td>\n",
|
||||
" <td>51.0493286</td>\n",
|
||||
" <td>13.7381437</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151686</th>\n",
|
||||
" <td>7572</td>\n",
|
||||
" <td>1332738525507186692</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...</td>\n",
|
||||
" <td>2020-11-28 17:30:00</td>\n",
|
||||
" <td>223758384</td>\n",
|
||||
" <td>polizeisachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Sachsen</td>\n",
|
||||
" <td>Dresden</td>\n",
|
||||
" <td>51.0493286</td>\n",
|
||||
" <td>13.7381437</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151687</th>\n",
|
||||
" <td>144702</td>\n",
|
||||
" <td>1465679768494526467</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...</td>\n",
|
||||
" <td>2021-11-30 13:51:02</td>\n",
|
||||
" <td>4876085224</td>\n",
|
||||
" <td>bpol_nord</td>\n",
|
||||
" <td>Bundespolizei Nord</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151688</th>\n",
|
||||
" <td>144701</td>\n",
|
||||
" <td>1464124290605977600</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...</td>\n",
|
||||
" <td>2021-11-26 06:50:07</td>\n",
|
||||
" <td>4876085224</td>\n",
|
||||
" <td>bpol_nord</td>\n",
|
||||
" <td>Bundespolizei Nord</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151689</th>\n",
|
||||
" <td>66854</td>\n",
|
||||
" <td>1376453040283209728</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>#Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...</td>\n",
|
||||
" <td>2021-03-29 08:35:52</td>\n",
|
||||
" <td>2389263558</td>\n",
|
||||
" <td>polizei_nrw_un</td>\n",
|
||||
" <td>Polizei NRW UN</td>\n",
|
||||
" <td>Polizei NRW UN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Unna</td>\n",
|
||||
" <td>51.5348835</td>\n",
|
||||
" <td>7.689014</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>151690 rows × 19 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" index tweet_id like_count retweet_count reply_count \\\n",
|
||||
"0 3053 1609539240458878979 21455 1845 3643 \n",
|
||||
"1 1331 1355179228396879872 19186 3386 1203 \n",
|
||||
"2 91693 1505620459148173316 15708 7098 186 \n",
|
||||
"3 91695 1505620666476896259 10337 1539 59 \n",
|
||||
"4 122631 1359098196434292739 9471 642 128 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"151685 7569 1332625325654757377 -99 -99 -99 \n",
|
||||
"151686 7572 1332738525507186692 -99 -99 -99 \n",
|
||||
"151687 144702 1465679768494526467 -99 -99 -99 \n",
|
||||
"151688 144701 1464124290605977600 -99 -99 -99 \n",
|
||||
"151689 66854 1376453040283209728 -99 -99 -99 \n",
|
||||
"\n",
|
||||
" quote_count measured_at is_deleted \\\n",
|
||||
"0 341 2023-01-05 14:44:34 False \n",
|
||||
"1 628 NaT NaN \n",
|
||||
"2 540 2022-03-24 20:15:08 False \n",
|
||||
"3 35 2022-03-24 20:15:08 False \n",
|
||||
"4 102 NaT NaN \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 -99 NaT NaN \n",
|
||||
"151686 -99 NaT NaN \n",
|
||||
"151687 -99 NaT NaN \n",
|
||||
"151688 -99 NaT NaN \n",
|
||||
"151689 -99 NaT NaN \n",
|
||||
"\n",
|
||||
" tweet_text \\\n",
|
||||
"0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n",
|
||||
"1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n",
|
||||
"2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n",
|
||||
"3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n",
|
||||
"4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n",
|
||||
"... ... \n",
|
||||
" ... \n",
|
||||
"151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n",
|
||||
"151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n",
|
||||
"151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n",
|
||||
"151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n",
|
||||
"151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n",
|
||||
"\n",
|
||||
" created_at user_id handle user_name \\\n",
|
||||
"0 2023-01-01 13:17:13 2397974054 polizeiberlin Polizei Berlin \n",
|
||||
"1 2021-01-29 15:41:20 2397974054 polizeiberlin Polizei Berlin \n",
|
||||
"2 2022-03-20 19:01:05 2389161066 polizei_nrw_bn Polizei NRW BN \n",
|
||||
"3 2022-03-20 19:01:54 2389161066 polizei_nrw_bn Polizei NRW BN \n",
|
||||
"4 2021-02-09 11:13:55 4876039738 bpol_b Bundespolizei Berlin \n",
|
||||
"... ... ... ... ... \n",
|
||||
"151685 2020-11-28 10:00:11 223758384 polizeisachsen Polizei Sachsen \n",
|
||||
"151686 2020-11-28 17:30:00 223758384 polizeisachsen Polizei Sachsen \n",
|
||||
"151687 2021-11-30 13:51:02 4876085224 bpol_nord Bundespolizei Nord \n",
|
||||
"151688 2021-11-26 06:50:07 4876085224 bpol_nord Bundespolizei Nord \n",
|
||||
"151689 2021-03-29 08:35:52 2389263558 polizei_nrw_un Polizei NRW UN \n",
|
||||
"\n",
|
||||
" Name Typ Bundesland Stadt LAT \\\n",
|
||||
"0 NaN NaN NaN NaN NaN \n",
|
||||
"1 NaN NaN NaN NaN NaN \n",
|
||||
"2 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
|
||||
"3 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
|
||||
"4 NaN NaN NaN NaN NaN \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"151685 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
|
||||
"151686 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
|
||||
"151687 NaN NaN NaN NaN NaN \n",
|
||||
"151688 NaN NaN NaN NaN NaN \n",
|
||||
"151689 Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 \n",
|
||||
"\n",
|
||||
" LONG \n",
|
||||
"0 NaN \n",
|
||||
"1 NaN \n",
|
||||
"2 7.10066 \n",
|
||||
"3 7.10066 \n",
|
||||
"4 NaN \n",
|
||||
"... ... \n",
|
||||
"151685 13.7381437 \n",
|
||||
"151686 13.7381437 \n",
|
||||
"151687 NaN \n",
|
||||
"151688 NaN \n",
|
||||
"151689 7.689014 \n",
|
||||
"\n",
|
||||
"[151690 rows x 19 columns]"
|
||||
"Name: tweet_text, Length: 151690, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 125,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1184,14 +845,14 @@
|
|||
" on = \"handle\",\n",
|
||||
" how = \"left\")\n",
|
||||
"pd.options.display.max_colwidth = 100\n",
|
||||
"tweets_attention.sort_values('like_count', ascending = False).reset_index()\n",
|
||||
"tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "621a3b74-e909-435c-8820-b38b63aa4893",
|
||||
"execution_count": 90,
|
||||
"id": "97952234-7957-421e-bd2c-2c8261992c5a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
|
@ -1311,12 +972,144 @@
|
|||
"[11559 rows x 3 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"execution_count": 90,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
"source": [
|
||||
"old = pd.read_csv(\"data/user_old.tsv\",sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} )\n",
|
||||
"new = pd.read_csv(\"data/tweets-1679742702794.csv\").rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"})\n",
|
||||
"new"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 121,
|
||||
"id": "ed86b45e-9dd8-436d-9c96-15500ed93985",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th>count</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>user_name</th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>223758384</th>\n",
|
||||
" <th>Polizei Sachsen</th>\n",
|
||||
" <td>5340</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>259607457</th>\n",
|
||||
" <th>Polizei NRW K</th>\n",
|
||||
" <td>2544</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>424895827</th>\n",
|
||||
" <th>Polizei Stuttgart</th>\n",
|
||||
" <td>1913</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>769128278</th>\n",
|
||||
" <th>Polizei NRW DO</th>\n",
|
||||
" <td>4895</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>775664780</th>\n",
|
||||
" <th>Polizei Rostock</th>\n",
|
||||
" <td>604</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1169206134189830145</th>\n",
|
||||
" <th>Polizei Stendal</th>\n",
|
||||
" <td>842</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1184022676488314880</th>\n",
|
||||
" <th>Polizei Pforzheim</th>\n",
|
||||
" <td>283</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1184024283342950401</th>\n",
|
||||
" <th>Polizei Ravensburg</th>\n",
|
||||
" <td>460</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1232548941889228808</th>\n",
|
||||
" <th>Systemstratege:</th>\n",
|
||||
" <td>168</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1295978598034284546</th>\n",
|
||||
" <th>Polizei ZPD NI</th>\n",
|
||||
" <td>133</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>163 rows × 1 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" count\n",
|
||||
"user_id user_name \n",
|
||||
"223758384 Polizei Sachsen 5340\n",
|
||||
"259607457 Polizei NRW K 2544\n",
|
||||
"424895827 Polizei Stuttgart 1913\n",
|
||||
"769128278 Polizei NRW DO 4895\n",
|
||||
"775664780 Polizei Rostock 604\n",
|
||||
"... ...\n",
|
||||
"1169206134189830145 Polizei Stendal 842\n",
|
||||
"1184022676488314880 Polizei Pforzheim 283\n",
|
||||
"1184024283342950401 Polizei Ravensburg 460\n",
|
||||
"1232548941889228808 Systemstratege: 168\n",
|
||||
"1295978598034284546 Polizei ZPD NI 133\n",
|
||||
"\n",
|
||||
"[163 rows x 1 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 121,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n",
|
||||
" )[\"user_id\"].aggregate(['count']\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
Binary file not shown.
|
@ -1 +1 @@
|
|||
{"data":{"layout-restorer:data":{"main":{"dock":{"type":"split-area","orientation":"horizontal","sizes":[0.9093610698365527,0.09063893016344725],"children":[{"type":"tab-area","currentIndex":0,"widgets":["notebook:zusammenfassung.ipynb","editor:merge_police_tweets.py"]},{"type":"tab-area","currentIndex":0,"widgets":["inspector:inspector"]}]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.26146046543024176,0.7385395345697582,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}},"inspector:inspector":{"data":{}},"editor:merge_police_tweets.py":{"data":{"path":"merge_police_tweets.py","factory":"Editor"}}},"metadata":{"id":"default"}}
|
||||
{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:zusammenfassung.ipynb"]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17943235504652827,0.8205676449534718,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}}
|
12
README.md
12
README.md
|
@ -1,12 +0,0 @@
|
|||
# Copbird Website Project
|
||||
|
||||
In diesem Repository sollen relevante Daten und Code für die Copbird-Webseite gesammelt werden. Alle Daten befinden sich im Verzeichnis `data/`. Im Verzeichnis `ergebnisse_hackathon_repo/` befindet sich ein Fork des Ergebnis-Repos vom Hackathon. Im Jupyter-Notebook `zusammenfassung.ipynb` sollen relevante Analyseschritte dokumentiert werden; bisher ist dort aber vornehmlich eine erneute Datenaufbereitung dokumentiert. Zur besseren Interoperabilität ist der Code zur Datenaufbereitung auch im Skript `merge_police_tweets.py` ausgeführt (identisch).
|
||||
|
||||
## Requirements
|
||||
|
||||
Um die Jupyter-Umgebung dieses Repositorys zu nutzen, empfiehlt es sich, auf die bereits initialisierte Nix-Flake zurückzugreifen. Installationsanweisungen sind auf der Webseite https://jupyenv.io/documentation/getting-started/ dokumentiert. Im Wesentlichen benötigt man:
|
||||
|
||||
- einen Nix daemon
|
||||
- Nix-Flakes (zu aktivieren in der nix.conf)
|
||||
|
||||
und kann dann `nix run` ausführen.
|
333095
data/tweets_all_combined.csv
333095
data/tweets_all_combined.csv
File diff suppressed because it is too large
Load diff
|
@ -1,50 +0,0 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Merge the tables produced by the old (~2021) and the new (~2022) scraper
# into one combined tweet table and write it to data/tweets_all_combined.csv.

# cols: hashtag, url, mention (same layout for both scrapers)
tweets_meta = pd.concat([
    pd.read_csv("data/entity_old.tsv", sep="\t"),  # data from old scraper
    pd.read_csv("data/tweets.csv"),                # data from new scraper
])

# The old tweet table feeds both the text frame and the statistics frame,
# so it is parsed only once here.
_tweets_old = pd.read_csv("data/tweet_old.tsv", sep="\t")

# cols: tweet_id, tweet_text, created_at, user_id (only a subset of the old table)
tweets_text = pd.concat([
    _tweets_old[["id", "tweet_text", "created_at", "user_id"]]
    .rename(columns={"id": "tweet_id"}),
    pd.read_csv("data/tweets-1679742698645.csv"),
])

# cols: tweet_id, like_count, retweet_count, reply_count, quote_count
# (only a subset of the old table)
tweets_statistics = pd.concat([
    _tweets_old[["id", "like_count", "retweet_count", "reply_count", "quote_count"]]
    .rename(columns={"id": "tweet_id"}),
    pd.read_csv("data/tweets-1679742620302.csv"),
])

# cols: user_id, handle, user_name
# Column names do not match between the old and the new data. In the new data
# set `username` and `handle` even seem to be swapped, hence the crossed rename.
# Only a small number of user_ids appear in both data sets, and where they do
# the user name has occasionally changed, so both versions are kept side by
# side (suffix _2021 / _2022) before being coalesced below.
tweets_user = (
    pd.read_csv("data/user_old.tsv", sep="\t")
    .rename(columns={"id": "user_id", "name": "user_name"})  # uniform names
    .merge(
        pd.read_csv("data/tweets-1679742702794.csv")
        .rename(columns={"username": "handle", "handle": "user_name"}),
        on="user_id",                 # user_id is the matching column
        how="outer",                  # keep every unique user_id
        suffixes=("_2021", "_2022"),  # mark which scraper a value came from
    )
)

# Some user names belonging to one user_id changed over time. For easier
# handling only the latest handle / user name is kept: take the 2022 value and
# fall back to the 2021 value where the 2022 one is missing.
tweets_user = tweets_user.assign(
    handle=tweets_user["handle_2022"].fillna(tweets_user["handle_2021"]),
    user_name=tweets_user["user_name_2022"].fillna(tweets_user["user_name_2021"]),
).drop(columns=["handle_2021", "handle_2022", "user_name_2021", "user_name_2022"])

# Additional information concerning the police stations
# cols: handle, name, typ, bundesland, stadt, lat, long
police_stations = (
    pd.read_csv("data/polizei_accounts_geo.csv", sep="\t")
    .rename(columns={"Polizei Account": "handle"})
)

# Merge statistics, tweet text and user information into one data frame.
tweets_combined = (
    pd.merge(tweets_statistics, tweets_text, on="tweet_id")
    .merge(tweets_user, on="user_id")
    .drop(columns=["id"])  # drop unnecessary id column (redundant to the index)
)

# Convert columns to appropriate dtypes.
# The counters use the nullable "Int64" dtype: the previous
# `.fillna(np.NAN).astype(int)` left missing values untouched and therefore
# raised as soon as a single count was missing.
_count_columns = ["like_count", "retweet_count", "reply_count", "quote_count"]
tweets_combined[_count_columns] = tweets_combined[_count_columns].astype("Int64")
tweets_combined = tweets_combined.assign(
    measured_at=pd.to_datetime(tweets_combined["measured_at"]),  # parse timestamps
    created_at=pd.to_datetime(tweets_combined["created_at"]),
    handle=tweets_combined["handle"].str.lower(),                 # handles in lower case
    is_deleted=tweets_combined["is_deleted"].astype("boolean"),  # nullable boolean flag
)

# Persist the combined table. The original line `tweets_combined.#to_csv(...)`
# was a syntax error that kept the whole script from running.
tweets_combined.to_csv("data/tweets_all_combined.csv")
|
|
@ -12,7 +12,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -37,7 +37,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": 117,
|
||||
"id": "fcc48831-7999-4d79-b722-736715b1ced6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -46,50 +46,48 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((479991, 3), (151690, 8), (151690, 4), (13327, 3), (163, 7))"
|
||||
"((479991, 3), (151690, 8), (151690, 4), (13327, 3))"
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"execution_count": 117,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Merging different table of old (~2021) and new (~2022) scraper\n",
|
||||
"\n",
|
||||
"## cols: hashtag, url, mention (same for both)\n",
|
||||
"tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n",
|
||||
" pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n",
|
||||
"\n",
|
||||
"## cols: id, tweet_text, created_at, user_id; only subset from old table (same tsv used in next step)\n",
|
||||
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id','tweet_text', 'created_at', 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
|
||||
" 'tweet_text', \n",
|
||||
" 'created_at', \n",
|
||||
" 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
" pd.read_csv(\"data/tweets-1679742698645.csv\")])\n",
|
||||
"\n",
|
||||
"## cols: id, like_count, retweet_count, reply_count, quote_count; only subset from old table\n",
|
||||
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
|
||||
" 'like_count', \n",
|
||||
" 'retweet_count', \n",
|
||||
" 'reply_count', \n",
|
||||
" 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
|
||||
" pd.read_csv(\"data/tweets-1679742620302.csv\")])\n",
|
||||
"\n",
|
||||
"## cols: user_id, handle, user_name; colnames do not match betweend old an new data. Even username and handle seem to be mixed up in new data set (inverse order)\n",
|
||||
"## Info: Only a small amount of user_ids appear in both data sets, but if so username occasionaly have changed an therefore can not easily be merged\n",
|
||||
"tweets_user = pd.read_csv(\"data/user_old.tsv\", \n",
|
||||
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} # uniform names\n",
|
||||
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\" # merge with renamed new data\n",
|
||||
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}), # reverse col names\n",
|
||||
" on = \"user_id\", # user_id as matching column\n",
|
||||
" how = \"outer\", # keep all unique uer_ids\n",
|
||||
" suffixes = [\"_2021\", \"_2022\"]) # identify column where username and label came from\n",
|
||||
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"}\n",
|
||||
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\"\n",
|
||||
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}),\n",
|
||||
" on = \"user_id\",\n",
|
||||
" how = \"outer\",\n",
|
||||
" suffixes = [\"_2021\", \"_2022\"])\n",
|
||||
"\n",
|
||||
"## Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept.\n",
|
||||
"# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n",
|
||||
"tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n",
|
||||
" user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n",
|
||||
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # no longer needed\n",
|
||||
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n",
|
||||
"\n",
|
||||
"## addiditional information concerning the police stations\n",
|
||||
"## cols: handle, name, typ, bundesland, stadt, lat, long\n",
|
||||
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" \n",
|
||||
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n",
|
||||
" ).rename(columns = {\"Polizei Account\": \"handle\"})\n",
|
||||
"\n",
|
||||
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape, police_stations.shape"
|
||||
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -102,14 +100,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 118,
|
||||
"id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Merge statistics, tweet text and user information in one data frame\n",
|
||||
"# Merge like statistics, tweet text and user information in one data frame\n",
|
||||
"tweets_combined = pd.merge(tweets_statistics, \n",
|
||||
" tweets_text,\n",
|
||||
" on = 'tweet_id').merge(tweets_user, on = 'user_id'\n",
|
||||
|
@ -119,12 +117,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"execution_count": 119,
|
||||
"id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
|
||||
" output = repr(obj)\n",
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
|
||||
" return method()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
|
@ -169,8 +177,8 @@
|
|||
" <td>2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>@mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@mahanna196 Da die Stadt keine Ausnahme für Ra...</td>\n",
|
||||
" <td>2020-10-27 09:29:13</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -184,7 +192,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@mahanna196 Ja. *sr</td>\n",
|
||||
" <td>2020-10-27 10:35:38</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
|
@ -199,8 +207,8 @@
|
|||
" <td>3</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausge...</td>\n",
|
||||
" <td>2020-10-27 12:36:26</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -214,8 +222,8 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/...</td>\n",
|
||||
" <td>2020-10-27 12:59:06</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -229,8 +237,8 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>In der vergangenen Woche wurde die Wohnung des...</td>\n",
|
||||
" <td>2020-10-27 13:57:32</td>\n",
|
||||
" <td>778895426007203840</td>\n",
|
||||
" <td>polizei_ol</td>\n",
|
||||
|
@ -260,7 +268,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-19 13:40:36</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze...</td>\n",
|
||||
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...</td>\n",
|
||||
" <td>2023-02-15 12:06:07</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -275,7 +283,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-25 13:14:49</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp; Drogen + ...</td>\n",
|
||||
" <td>Unser Präventionsteam vom #A44 berät heute und...</td>\n",
|
||||
" <td>2023-02-21 12:10:00</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -290,7 +298,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-25 13:14:49</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp; Drogen im Straßenverkehr beraten...</td>\n",
|
||||
" <td>Auch unser #A52 war heute aktiv und hat zum Th...</td>\n",
|
||||
" <td>2023-02-21 12:12:48</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -305,7 +313,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-26 13:15:05</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb...</td>\n",
|
||||
" <td>Gestern führte unser #A13 in einer Wohnsiedlun...</td>\n",
|
||||
" <td>2023-02-22 11:15:58</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -320,7 +328,7 @@
|
|||
" <td>0</td>\n",
|
||||
" <td>2023-02-27 12:17:33</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk...</td>\n",
|
||||
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr....</td>\n",
|
||||
" <td>2023-02-23 10:53:07</td>\n",
|
||||
" <td>1168873095614160896</td>\n",
|
||||
" <td>polizeiberlin_p</td>\n",
|
||||
|
@ -346,11 +354,11 @@
|
|||
"151689 1628709531998998529 10 1 0 \n",
|
||||
"\n",
|
||||
" quote_count measured_at is_deleted \\\n",
|
||||
"0 0 NaT <NA> \n",
|
||||
"1 0 NaT <NA> \n",
|
||||
"2 0 NaT <NA> \n",
|
||||
"3 0 NaT <NA> \n",
|
||||
"4 0 NaT <NA> \n",
|
||||
"0 0 NaT NaN \n",
|
||||
"1 0 NaT NaN \n",
|
||||
"2 0 NaT NaN \n",
|
||||
"3 0 NaT NaN \n",
|
||||
"4 0 NaT NaN \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 0 2023-02-19 13:40:36 False \n",
|
||||
"151686 0 2023-02-25 13:14:49 False \n",
|
||||
|
@ -358,31 +366,31 @@
|
|||
"151688 0 2023-02-26 13:15:05 False \n",
|
||||
"151689 0 2023-02-27 12:17:33 False \n",
|
||||
"\n",
|
||||
" tweet_text \\\n",
|
||||
"0 @mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a... \n",
|
||||
"1 @mahanna196 Ja. *sr \n",
|
||||
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F... \n",
|
||||
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr \n",
|
||||
"4 In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be... \n",
|
||||
"... ... \n",
|
||||
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze... \n",
|
||||
"151686 Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol & Drogen + ... \n",
|
||||
"151687 Auch unser #A52 war heute aktiv und hat zum Thema Alkohol & Drogen im Straßenverkehr beraten... \n",
|
||||
"151688 Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb... \n",
|
||||
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk... \n",
|
||||
" tweet_text created_at \\\n",
|
||||
"0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n",
|
||||
"1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n",
|
||||
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n",
|
||||
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n",
|
||||
"4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n",
|
||||
"... ... ... \n",
|
||||
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n",
|
||||
"151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n",
|
||||
"151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n",
|
||||
"151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n",
|
||||
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n",
|
||||
"\n",
|
||||
" created_at user_id handle \\\n",
|
||||
"0 2020-10-27 09:29:13 778895426007203840 polizei_ol \n",
|
||||
"1 2020-10-27 10:35:38 778895426007203840 polizei_ol \n",
|
||||
"2 2020-10-27 12:36:26 778895426007203840 polizei_ol \n",
|
||||
"3 2020-10-27 12:59:06 778895426007203840 polizei_ol \n",
|
||||
"4 2020-10-27 13:57:32 778895426007203840 polizei_ol \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 2023-02-15 12:06:07 1168873095614160896 polizeiberlin_p \n",
|
||||
"151686 2023-02-21 12:10:00 1168873095614160896 polizeiberlin_p \n",
|
||||
"151687 2023-02-21 12:12:48 1168873095614160896 polizeiberlin_p \n",
|
||||
"151688 2023-02-22 11:15:58 1168873095614160896 polizeiberlin_p \n",
|
||||
"151689 2023-02-23 10:53:07 1168873095614160896 polizeiberlin_p \n",
|
||||
" user_id handle \\\n",
|
||||
"0 778895426007203840 polizei_ol \n",
|
||||
"1 778895426007203840 polizei_ol \n",
|
||||
"2 778895426007203840 polizei_ol \n",
|
||||
"3 778895426007203840 polizei_ol \n",
|
||||
"4 778895426007203840 polizei_ol \n",
|
||||
"... ... ... \n",
|
||||
"151685 1168873095614160896 polizeiberlin_p \n",
|
||||
"151686 1168873095614160896 polizeiberlin_p \n",
|
||||
"151687 1168873095614160896 polizeiberlin_p \n",
|
||||
"151688 1168873095614160896 polizeiberlin_p \n",
|
||||
"151689 1168873095614160896 polizeiberlin_p \n",
|
||||
"\n",
|
||||
" user_name \n",
|
||||
"0 Polizei Oldenburg-Stadt/Ammerland \n",
|
||||
|
@ -400,19 +408,19 @@
|
|||
"[151690 rows x 12 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"execution_count": 119,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Convert datatypes to appropriate one\n",
|
||||
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(np.NAN).astype(int)\n",
|
||||
"# Convert Counts to integer values\n",
|
||||
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n",
|
||||
"tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n",
|
||||
" created_at = pd.to_datetime(tweets_combined['created_at']),\n",
|
||||
" handle = tweets_combined['handle'].str.lower(), # handle to lower case\n",
|
||||
" is_deleted = tweets_combined['is_deleted'].astype('boolean')) # is deleted column as boolean variable\n",
|
||||
"tweets_combined#.to_csv(\"data/tweets_all_combined.csv\")"
|
||||
" handle = tweets_combined['handle'].str.lower(),\n",
|
||||
" is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n",
|
||||
"tweets_combined"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -464,7 +472,7 @@
|
|||
"source": [
|
||||
"## Metadaten \n",
|
||||
"\n",
|
||||
"Welche Daten bilden die Grundlage?\n"
|
||||
"Welche Daten bilden die Grundlage?"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -529,7 +537,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 114,
|
||||
"id": "4f1e8c6c-3610-436e-899e-4d0307259230",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -560,7 +568,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 122,
|
||||
"id": "9373552e-6baf-46df-ae16-c63603e20a83",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -673,7 +681,7 @@
|
|||
"61 Hamburg 53.550341 10.000654 "
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 122,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -702,23 +710,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 123,
|
||||
"id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n",
|
||||
" for col_name, dtype in df.dtypes.iteritems():\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"\n",
|
||||
"<div id=\"altair-viz-7b78525a62b243eca7b1f4044a328f47\"></div>\n",
|
||||
"<div id=\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\"></div>\n",
|
||||
"<script type=\"text/javascript\">\n",
|
||||
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
|
||||
" (function(spec, embedOpt){\n",
|
||||
" let outputDiv = document.currentScript.previousElementSibling;\n",
|
||||
" if (outputDiv.id !== \"altair-viz-7b78525a62b243eca7b1f4044a328f47\") {\n",
|
||||
" outputDiv = document.getElementById(\"altair-viz-7b78525a62b243eca7b1f4044a328f47\");\n",
|
||||
" if (outputDiv.id !== \"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\") {\n",
|
||||
" outputDiv = document.getElementById(\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\");\n",
|
||||
" }\n",
|
||||
" const paths = {\n",
|
||||
" \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
|
||||
|
@ -764,14 +780,14 @@
|
|||
" .catch(showError)\n",
|
||||
" .then(() => displayChart(vegaEmbed));\n",
|
||||
" }\n",
|
||||
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"ordinal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
|
||||
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"nominal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
|
||||
"</script>"
|
||||
],
|
||||
"text/plain": [
|
||||
"alt.Chart(...)"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 123,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -779,7 +795,7 @@
|
|||
"source": [
|
||||
"barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n",
|
||||
" x = 'count:Q',\n",
|
||||
" y = alt.Y('handle:O', sort = '-x'),\n",
|
||||
" y = alt.Y('handle:N', sort = '-x'),\n",
|
||||
")\n",
|
||||
"barchart "
|
||||
]
|
||||
|
@ -796,7 +812,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 125,
|
||||
"id": "d0549250-b11f-4762-8500-1134c53303b4",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
|
@ -804,377 +820,22 @@
|
|||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>index</th>\n",
|
||||
" <th>tweet_id</th>\n",
|
||||
" <th>like_count</th>\n",
|
||||
" <th>retweet_count</th>\n",
|
||||
" <th>reply_count</th>\n",
|
||||
" <th>quote_count</th>\n",
|
||||
" <th>measured_at</th>\n",
|
||||
" <th>is_deleted</th>\n",
|
||||
" <th>tweet_text</th>\n",
|
||||
" <th>created_at</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>handle</th>\n",
|
||||
" <th>user_name</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Typ</th>\n",
|
||||
" <th>Bundesland</th>\n",
|
||||
" <th>Stadt</th>\n",
|
||||
" <th>LAT</th>\n",
|
||||
" <th>LONG</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>3053</td>\n",
|
||||
" <td>1609539240458878979</td>\n",
|
||||
" <td>21455</td>\n",
|
||||
" <td>1845</td>\n",
|
||||
" <td>3643</td>\n",
|
||||
" <td>341</td>\n",
|
||||
" <td>2023-01-05 14:44:34</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une...</td>\n",
|
||||
" <td>2023-01-01 13:17:13</td>\n",
|
||||
" <td>2397974054</td>\n",
|
||||
" <td>polizeiberlin</td>\n",
|
||||
" <td>Polizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1331</td>\n",
|
||||
" <td>1355179228396879872</td>\n",
|
||||
" <td>19186</td>\n",
|
||||
" <td>3386</td>\n",
|
||||
" <td>1203</td>\n",
|
||||
" <td>628</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...</td>\n",
|
||||
" <td>2021-01-29 15:41:20</td>\n",
|
||||
" <td>2397974054</td>\n",
|
||||
" <td>polizeiberlin</td>\n",
|
||||
" <td>Polizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>91693</td>\n",
|
||||
" <td>1505620459148173316</td>\n",
|
||||
" <td>15708</td>\n",
|
||||
" <td>7098</td>\n",
|
||||
" <td>186</td>\n",
|
||||
" <td>540</td>\n",
|
||||
" <td>2022-03-24 20:15:08</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...</td>\n",
|
||||
" <td>2022-03-20 19:01:05</td>\n",
|
||||
" <td>2389161066</td>\n",
|
||||
" <td>polizei_nrw_bn</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Bonn</td>\n",
|
||||
" <td>50.735851</td>\n",
|
||||
" <td>7.10066</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>91695</td>\n",
|
||||
" <td>1505620666476896259</td>\n",
|
||||
" <td>10337</td>\n",
|
||||
" <td>1539</td>\n",
|
||||
" <td>59</td>\n",
|
||||
" <td>35</td>\n",
|
||||
" <td>2022-03-24 20:15:08</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...</td>\n",
|
||||
" <td>2022-03-20 19:01:54</td>\n",
|
||||
" <td>2389161066</td>\n",
|
||||
" <td>polizei_nrw_bn</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei NRW BN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Bonn</td>\n",
|
||||
" <td>50.735851</td>\n",
|
||||
" <td>7.10066</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>122631</td>\n",
|
||||
" <td>1359098196434292739</td>\n",
|
||||
" <td>9471</td>\n",
|
||||
" <td>642</td>\n",
|
||||
" <td>128</td>\n",
|
||||
" <td>102</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2</td>\n",
|
||||
" <td>2021-02-09 11:13:55</td>\n",
|
||||
" <td>4876039738</td>\n",
|
||||
" <td>bpol_b</td>\n",
|
||||
" <td>Bundespolizei Berlin</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151685</th>\n",
|
||||
" <td>7569</td>\n",
|
||||
" <td>1332625325654757377</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...</td>\n",
|
||||
" <td>2020-11-28 10:00:11</td>\n",
|
||||
" <td>223758384</td>\n",
|
||||
" <td>polizeisachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Sachsen</td>\n",
|
||||
" <td>Dresden</td>\n",
|
||||
" <td>51.0493286</td>\n",
|
||||
" <td>13.7381437</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151686</th>\n",
|
||||
" <td>7572</td>\n",
|
||||
" <td>1332738525507186692</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...</td>\n",
|
||||
" <td>2020-11-28 17:30:00</td>\n",
|
||||
" <td>223758384</td>\n",
|
||||
" <td>polizeisachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei Sachsen</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Sachsen</td>\n",
|
||||
" <td>Dresden</td>\n",
|
||||
" <td>51.0493286</td>\n",
|
||||
" <td>13.7381437</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151687</th>\n",
|
||||
" <td>144702</td>\n",
|
||||
" <td>1465679768494526467</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...</td>\n",
|
||||
" <td>2021-11-30 13:51:02</td>\n",
|
||||
" <td>4876085224</td>\n",
|
||||
" <td>bpol_nord</td>\n",
|
||||
" <td>Bundespolizei Nord</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151688</th>\n",
|
||||
" <td>144701</td>\n",
|
||||
" <td>1464124290605977600</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>@gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...</td>\n",
|
||||
" <td>2021-11-26 06:50:07</td>\n",
|
||||
" <td>4876085224</td>\n",
|
||||
" <td>bpol_nord</td>\n",
|
||||
" <td>Bundespolizei Nord</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>151689</th>\n",
|
||||
" <td>66854</td>\n",
|
||||
" <td>1376453040283209728</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>-99</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>#Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...</td>\n",
|
||||
" <td>2021-03-29 08:35:52</td>\n",
|
||||
" <td>2389263558</td>\n",
|
||||
" <td>polizei_nrw_un</td>\n",
|
||||
" <td>Polizei NRW UN</td>\n",
|
||||
" <td>Polizei NRW UN</td>\n",
|
||||
" <td>Polizei</td>\n",
|
||||
" <td>Nordrhein-Westfalen</td>\n",
|
||||
" <td>Unna</td>\n",
|
||||
" <td>51.5348835</td>\n",
|
||||
" <td>7.689014</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>151690 rows × 19 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" index tweet_id like_count retweet_count reply_count \\\n",
|
||||
"0 3053 1609539240458878979 21455 1845 3643 \n",
|
||||
"1 1331 1355179228396879872 19186 3386 1203 \n",
|
||||
"2 91693 1505620459148173316 15708 7098 186 \n",
|
||||
"3 91695 1505620666476896259 10337 1539 59 \n",
|
||||
"4 122631 1359098196434292739 9471 642 128 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"151685 7569 1332625325654757377 -99 -99 -99 \n",
|
||||
"151686 7572 1332738525507186692 -99 -99 -99 \n",
|
||||
"151687 144702 1465679768494526467 -99 -99 -99 \n",
|
||||
"151688 144701 1464124290605977600 -99 -99 -99 \n",
|
||||
"151689 66854 1376453040283209728 -99 -99 -99 \n",
|
||||
"\n",
|
||||
" quote_count measured_at is_deleted \\\n",
|
||||
"0 341 2023-01-05 14:44:34 False \n",
|
||||
"1 628 NaT NaN \n",
|
||||
"2 540 2022-03-24 20:15:08 False \n",
|
||||
"3 35 2022-03-24 20:15:08 False \n",
|
||||
"4 102 NaT NaN \n",
|
||||
"... ... ... ... \n",
|
||||
"151685 -99 NaT NaN \n",
|
||||
"151686 -99 NaT NaN \n",
|
||||
"151687 -99 NaT NaN \n",
|
||||
"151688 -99 NaT NaN \n",
|
||||
"151689 -99 NaT NaN \n",
|
||||
"\n",
|
||||
" tweet_text \\\n",
|
||||
"0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n",
|
||||
"1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n",
|
||||
"2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n",
|
||||
"3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n",
|
||||
"4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n",
|
||||
"... ... \n",
|
||||
" ... \n",
|
||||
"151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n",
|
||||
"151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n",
|
||||
"151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n",
|
||||
"151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n",
|
||||
"151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n",
|
||||
"\n",
|
||||
" created_at user_id handle user_name \\\n",
|
||||
"0 2023-01-01 13:17:13 2397974054 polizeiberlin Polizei Berlin \n",
|
||||
"1 2021-01-29 15:41:20 2397974054 polizeiberlin Polizei Berlin \n",
|
||||
"2 2022-03-20 19:01:05 2389161066 polizei_nrw_bn Polizei NRW BN \n",
|
||||
"3 2022-03-20 19:01:54 2389161066 polizei_nrw_bn Polizei NRW BN \n",
|
||||
"4 2021-02-09 11:13:55 4876039738 bpol_b Bundespolizei Berlin \n",
|
||||
"... ... ... ... ... \n",
|
||||
"151685 2020-11-28 10:00:11 223758384 polizeisachsen Polizei Sachsen \n",
|
||||
"151686 2020-11-28 17:30:00 223758384 polizeisachsen Polizei Sachsen \n",
|
||||
"151687 2021-11-30 13:51:02 4876085224 bpol_nord Bundespolizei Nord \n",
|
||||
"151688 2021-11-26 06:50:07 4876085224 bpol_nord Bundespolizei Nord \n",
|
||||
"151689 2021-03-29 08:35:52 2389263558 polizei_nrw_un Polizei NRW UN \n",
|
||||
"\n",
|
||||
" Name Typ Bundesland Stadt LAT \\\n",
|
||||
"0 NaN NaN NaN NaN NaN \n",
|
||||
"1 NaN NaN NaN NaN NaN \n",
|
||||
"2 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
|
||||
"3 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
|
||||
"4 NaN NaN NaN NaN NaN \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"151685 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
|
||||
"151686 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
|
||||
"151687 NaN NaN NaN NaN NaN \n",
|
||||
"151688 NaN NaN NaN NaN NaN \n",
|
||||
"151689 Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 \n",
|
||||
"\n",
|
||||
" LONG \n",
|
||||
"0 NaN \n",
|
||||
"1 NaN \n",
|
||||
"2 7.10066 \n",
|
||||
"3 7.10066 \n",
|
||||
"4 NaN \n",
|
||||
"... ... \n",
|
||||
"151685 13.7381437 \n",
|
||||
"151686 13.7381437 \n",
|
||||
"151687 NaN \n",
|
||||
"151688 NaN \n",
|
||||
"151689 7.689014 \n",
|
||||
"\n",
|
||||
"[151690 rows x 19 columns]"
|
||||
"Name: tweet_text, Length: 151690, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 125,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1184,14 +845,14 @@
|
|||
" on = \"handle\",\n",
|
||||
" how = \"left\")\n",
|
||||
"pd.options.display.max_colwidth = 100\n",
|
||||
"tweets_attention.sort_values('like_count', ascending = False).reset_index()\n",
|
||||
"tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "621a3b74-e909-435c-8820-b38b63aa4893",
|
||||
"execution_count": 90,
|
||||
"id": "97952234-7957-421e-bd2c-2c8261992c5a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
|
@ -1311,12 +972,144 @@
|
|||
"[11559 rows x 3 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"execution_count": 90,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
"source": [
|
||||
"old = pd.read_csv(\"data/user_old.tsv\",sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} )\n",
|
||||
"new = pd.read_csv(\"data/tweets-1679742702794.csv\").rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"})\n",
|
||||
"new"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 121,
|
||||
"id": "ed86b45e-9dd8-436d-9c96-15500ed93985",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th>count</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>user_name</th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>223758384</th>\n",
|
||||
" <th>Polizei Sachsen</th>\n",
|
||||
" <td>5340</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>259607457</th>\n",
|
||||
" <th>Polizei NRW K</th>\n",
|
||||
" <td>2544</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>424895827</th>\n",
|
||||
" <th>Polizei Stuttgart</th>\n",
|
||||
" <td>1913</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>769128278</th>\n",
|
||||
" <th>Polizei NRW DO</th>\n",
|
||||
" <td>4895</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>775664780</th>\n",
|
||||
" <th>Polizei Rostock</th>\n",
|
||||
" <td>604</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1169206134189830145</th>\n",
|
||||
" <th>Polizei Stendal</th>\n",
|
||||
" <td>842</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1184022676488314880</th>\n",
|
||||
" <th>Polizei Pforzheim</th>\n",
|
||||
" <td>283</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1184024283342950401</th>\n",
|
||||
" <th>Polizei Ravensburg</th>\n",
|
||||
" <td>460</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1232548941889228808</th>\n",
|
||||
" <th>Systemstratege:</th>\n",
|
||||
" <td>168</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1295978598034284546</th>\n",
|
||||
" <th>Polizei ZPD NI</th>\n",
|
||||
" <td>133</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>163 rows × 1 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" count\n",
|
||||
"user_id user_name \n",
|
||||
"223758384 Polizei Sachsen 5340\n",
|
||||
"259607457 Polizei NRW K 2544\n",
|
||||
"424895827 Polizei Stuttgart 1913\n",
|
||||
"769128278 Polizei NRW DO 4895\n",
|
||||
"775664780 Polizei Rostock 604\n",
|
||||
"... ...\n",
|
||||
"1169206134189830145 Polizei Stendal 842\n",
|
||||
"1184022676488314880 Polizei Pforzheim 283\n",
|
||||
"1184024283342950401 Polizei Ravensburg 460\n",
|
||||
"1232548941889228808 Systemstratege: 168\n",
|
||||
"1295978598034284546 Polizei ZPD NI 133\n",
|
||||
"\n",
|
||||
"[163 rows x 1 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 121,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n",
|
||||
" )[\"user_id\"].aggregate(['count']\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
Loading…
Reference in a new issue