Compare commits

..

No commits in common. "36de4fdf81e939243787fdbffedc366da4437e9f" and "abe05ce248d72f9d7027dba1972c8b163a771630" have entirely different histories.

8 changed files with 529 additions and 334150 deletions

View file

@ -1,50 +0,0 @@
import numpy as np
import pandas as pd
# Merge the tables produced by the old (~2021) and the new (~2022) scraper.

## cols: hashtag, url, mention (same for both scrapers)
tweets_meta = pd.concat([
    pd.read_csv("data/entity_old.tsv", sep="\t"),  # data from old scraper
    pd.read_csv("data/tweets.csv"),  # data from new scraper
])

# The old tweet table feeds both the text frame and the statistics frame, so
# it is parsed once here and sliced twice below instead of being read twice.
tweets_old = pd.read_csv("data/tweet_old.tsv", sep="\t")

## cols: id, tweet_text, created_at, user_id; only a subset of the old table
tweets_text = pd.concat([
    tweets_old[['id', 'tweet_text', 'created_at', 'user_id']].rename(columns={"id": "tweet_id"}),
    pd.read_csv("data/tweets-1679742698645.csv"),
])

## cols: id, like_count, retweet_count, reply_count, quote_count; only a subset of the old table
tweets_statistics = pd.concat([
    tweets_old[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns={"id": "tweet_id"}),
    pd.read_csv("data/tweets-1679742620302.csv"),
])
## cols: user_id, handle, user_name. Column names do not match between the old
## and the new data; in the new data set username and handle even appear to be
## swapped, which is why they are renamed crosswise below.
## Info: only a small number of user_ids appear in both data sets, and where
## they do the user name has occasionally changed, so the two sources cannot
## simply be stacked.
tweets_user = pd.read_csv("data/user_old.tsv", sep="\t").rename(
    columns={"id": "user_id", "name": "user_name"}  # uniform names
).merge(
    pd.read_csv("data/tweets-1679742702794.csv").rename(
        columns={"username": "handle", "handle": "user_name"}  # swap the two column names
    ),
    on="user_id",  # user_id as matching column
    how="outer",  # keep all unique user_ids
    suffixes=("_2021", "_2022"),  # mark which data set handle / user_name came from
)

## Some user names belonging to one user_id have changed over time. For easier
## handling only the latest handle and user name are kept: the 2022 value where
## it is present, otherwise the 2021 value. Series.fillna with a Series argument
## fills element-wise by index, which replaces the former row-wise apply.
tweets_user = tweets_user.assign(
    handle=tweets_user['handle_2022'].fillna(tweets_user['handle_2021']),
    user_name=tweets_user['user_name_2022'].fillna(tweets_user['user_name_2021']),
).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis=1)  # no longer needed
## Additional information about the police stations.
## cols (per the original note): handle, name, typ, bundesland, stadt, lat, long
police_stations = pd.read_csv("data/polizei_accounts_geo.csv", sep="\t")
# Name the account column like the handle column of the tweet tables.
police_stations = police_stations.rename(columns={"Polizei Account": "handle"})
# Combine statistics, tweet text and user information into one data frame.
tweets_combined = (
    tweets_statistics
    .merge(tweets_text, on='tweet_id')  # one row per tweet: counts + text
    .merge(tweets_user, on='user_id')  # attach handle and user name
    .drop(columns=['id'])  # unnecessary id column (noted as redundant to the index)
)
# Convert columns to appropriate dtypes.
# The counts use the nullable 'Int64' dtype: a plain astype(int) raises as soon
# as a count is missing, and filling NaN with NaN beforehand does not change
# that. Missing counts stay <NA>, like the nullable 'boolean' used below.
count_columns = ['like_count', 'retweet_count', 'reply_count', 'quote_count']
tweets_combined = tweets_combined.astype({column: 'Int64' for column in count_columns})
tweets_combined = tweets_combined.assign(
    measured_at=pd.to_datetime(tweets_combined['measured_at']),  # parse timestamps
    created_at=pd.to_datetime(tweets_combined['created_at']),
    handle=tweets_combined['handle'].str.lower(),  # handle to lower case
    is_deleted=tweets_combined['is_deleted'].astype('boolean'),  # nullable boolean, keeps missing values
)
# Export is disabled; uncomment the next line to write the combined table.
# tweets_combined.to_csv("data/tweets_all_combined.csv")

View file

@ -12,7 +12,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2", "id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -37,7 +37,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 117,
"id": "fcc48831-7999-4d79-b722-736715b1ced6", "id": "fcc48831-7999-4d79-b722-736715b1ced6",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -46,50 +46,48 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"((479991, 3), (151690, 8), (151690, 4), (13327, 3), (163, 7))" "((479991, 3), (151690, 8), (151690, 4), (13327, 3))"
] ]
}, },
"execution_count": 45, "execution_count": 117,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Merging different table of old (~2021) and new (~2022) scraper\n",
"\n",
"## cols: hashtag, url, mention (same for both)\n",
"tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n", "tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n",
" pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n", " pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n",
"\n", "\n",
"## cols: id, tweet_text, created_at, user_id; only subset from old table (same tsv used in next step)\n", "tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id','tweet_text', 'created_at', 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n", " 'tweet_text', \n",
" 'created_at', \n",
" 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
" pd.read_csv(\"data/tweets-1679742698645.csv\")])\n", " pd.read_csv(\"data/tweets-1679742698645.csv\")])\n",
"\n", "\n",
"## cols: id, like_count, retweet_count, reply_count, quote_count; only subset from old table\n", "tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n", " 'like_count', \n",
" 'retweet_count', \n",
" 'reply_count', \n",
" 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
" pd.read_csv(\"data/tweets-1679742620302.csv\")])\n", " pd.read_csv(\"data/tweets-1679742620302.csv\")])\n",
"\n", "\n",
"## cols: user_id, handle, user_name; colnames do not match betweend old an new data. Even username and handle seem to be mixed up in new data set (inverse order)\n",
"## Info: Only a small amount of user_ids appear in both data sets, but if so username occasionaly have changed an therefore can not easily be merged\n",
"tweets_user = pd.read_csv(\"data/user_old.tsv\", \n", "tweets_user = pd.read_csv(\"data/user_old.tsv\", \n",
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} # uniform names\n", " sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"}\n",
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\" # merge with renamed new data\n", " ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\"\n",
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}), # reverse col names\n", " ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}),\n",
" on = \"user_id\", # user_id as matching column\n", " on = \"user_id\",\n",
" how = \"outer\", # keep all unique uer_ids\n", " how = \"outer\",\n",
" suffixes = [\"_2021\", \"_2022\"]) # identify column where username and label came from\n", " suffixes = [\"_2021\", \"_2022\"])\n",
"\n", "\n",
"## Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept.\n", "# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n",
"tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n", "tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n",
" user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n", " user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n",
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # no longer needed\n", " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n",
"\n", "\n",
"## addiditional information concerning the police stations\n", "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n",
"## cols: handle, name, typ, bundesland, stadt, lat, long\n",
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" \n",
" ).rename(columns = {\"Polizei Account\": \"handle\"})\n", " ).rename(columns = {\"Polizei Account\": \"handle\"})\n",
"\n", "\n",
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape, police_stations.shape" "tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape"
] ]
}, },
{ {
@ -102,14 +100,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 118,
"id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3", "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# Merge statistics, tweet text and user information in one data frame\n", "# Merge like statistics, tweet text and user information in one data frame\n",
"tweets_combined = pd.merge(tweets_statistics, \n", "tweets_combined = pd.merge(tweets_statistics, \n",
" tweets_text,\n", " tweets_text,\n",
" on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n",
@ -119,12 +117,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 119,
"id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d", "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
" output = repr(obj)\n",
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
" return method()\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
@ -169,8 +177,8 @@
" <td>2</td>\n", " <td>2</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a...</td>\n", " <td>@mahanna196 Da die Stadt keine Ausnahme für Ra...</td>\n",
" <td>2020-10-27 09:29:13</td>\n", " <td>2020-10-27 09:29:13</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -184,7 +192,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@mahanna196 Ja. *sr</td>\n", " <td>@mahanna196 Ja. *sr</td>\n",
" <td>2020-10-27 10:35:38</td>\n", " <td>2020-10-27 10:35:38</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
@ -199,8 +207,8 @@
" <td>3</td>\n", " <td>3</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F...</td>\n", " <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausge...</td>\n",
" <td>2020-10-27 12:36:26</td>\n", " <td>2020-10-27 12:36:26</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -214,8 +222,8 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr</td>\n", " <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/...</td>\n",
" <td>2020-10-27 12:59:06</td>\n", " <td>2020-10-27 12:59:06</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -229,8 +237,8 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be...</td>\n", " <td>In der vergangenen Woche wurde die Wohnung des...</td>\n",
" <td>2020-10-27 13:57:32</td>\n", " <td>2020-10-27 13:57:32</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -260,7 +268,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-19 13:40:36</td>\n", " <td>2023-02-19 13:40:36</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze...</td>\n", " <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...</td>\n",
" <td>2023-02-15 12:06:07</td>\n", " <td>2023-02-15 12:06:07</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -275,7 +283,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-25 13:14:49</td>\n", " <td>2023-02-25 13:14:49</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp;amp; Drogen + ...</td>\n", " <td>Unser Präventionsteam vom #A44 berät heute und...</td>\n",
" <td>2023-02-21 12:10:00</td>\n", " <td>2023-02-21 12:10:00</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -290,7 +298,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-25 13:14:49</td>\n", " <td>2023-02-25 13:14:49</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp;amp; Drogen im Straßenverkehr beraten...</td>\n", " <td>Auch unser #A52 war heute aktiv und hat zum Th...</td>\n",
" <td>2023-02-21 12:12:48</td>\n", " <td>2023-02-21 12:12:48</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -305,7 +313,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-26 13:15:05</td>\n", " <td>2023-02-26 13:15:05</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb...</td>\n", " <td>Gestern führte unser #A13 in einer Wohnsiedlun...</td>\n",
" <td>2023-02-22 11:15:58</td>\n", " <td>2023-02-22 11:15:58</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -320,7 +328,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-27 12:17:33</td>\n", " <td>2023-02-27 12:17:33</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk...</td>\n", " <td>Auf dem Gelände der @BUFAStudios (Oberlandstr....</td>\n",
" <td>2023-02-23 10:53:07</td>\n", " <td>2023-02-23 10:53:07</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -346,11 +354,11 @@
"151689 1628709531998998529 10 1 0 \n", "151689 1628709531998998529 10 1 0 \n",
"\n", "\n",
" quote_count measured_at is_deleted \\\n", " quote_count measured_at is_deleted \\\n",
"0 0 NaT <NA> \n", "0 0 NaT NaN \n",
"1 0 NaT <NA> \n", "1 0 NaT NaN \n",
"2 0 NaT <NA> \n", "2 0 NaT NaN \n",
"3 0 NaT <NA> \n", "3 0 NaT NaN \n",
"4 0 NaT <NA> \n", "4 0 NaT NaN \n",
"... ... ... ... \n", "... ... ... ... \n",
"151685 0 2023-02-19 13:40:36 False \n", "151685 0 2023-02-19 13:40:36 False \n",
"151686 0 2023-02-25 13:14:49 False \n", "151686 0 2023-02-25 13:14:49 False \n",
@ -358,31 +366,31 @@
"151688 0 2023-02-26 13:15:05 False \n", "151688 0 2023-02-26 13:15:05 False \n",
"151689 0 2023-02-27 12:17:33 False \n", "151689 0 2023-02-27 12:17:33 False \n",
"\n", "\n",
" tweet_text \\\n", " tweet_text created_at \\\n",
"0 @mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a... \n", "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n",
"1 @mahanna196 Ja. *sr \n", "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n",
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F... \n", "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n",
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr \n", "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n",
"4 In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be... \n", "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n",
"... ... \n", "... ... ... \n",
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze... \n", "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n",
"151686 Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp; Drogen + ... \n", "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n",
"151687 Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp; Drogen im Straßenverkehr beraten... \n", "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n",
"151688 Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb... \n", "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n",
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk... \n", "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n",
"\n", "\n",
" created_at user_id handle \\\n", " user_id handle \\\n",
"0 2020-10-27 09:29:13 778895426007203840 polizei_ol \n", "0 778895426007203840 polizei_ol \n",
"1 2020-10-27 10:35:38 778895426007203840 polizei_ol \n", "1 778895426007203840 polizei_ol \n",
"2 2020-10-27 12:36:26 778895426007203840 polizei_ol \n", "2 778895426007203840 polizei_ol \n",
"3 2020-10-27 12:59:06 778895426007203840 polizei_ol \n", "3 778895426007203840 polizei_ol \n",
"4 2020-10-27 13:57:32 778895426007203840 polizei_ol \n", "4 778895426007203840 polizei_ol \n",
"... ... ... ... \n", "... ... ... \n",
"151685 2023-02-15 12:06:07 1168873095614160896 polizeiberlin_p \n", "151685 1168873095614160896 polizeiberlin_p \n",
"151686 2023-02-21 12:10:00 1168873095614160896 polizeiberlin_p \n", "151686 1168873095614160896 polizeiberlin_p \n",
"151687 2023-02-21 12:12:48 1168873095614160896 polizeiberlin_p \n", "151687 1168873095614160896 polizeiberlin_p \n",
"151688 2023-02-22 11:15:58 1168873095614160896 polizeiberlin_p \n", "151688 1168873095614160896 polizeiberlin_p \n",
"151689 2023-02-23 10:53:07 1168873095614160896 polizeiberlin_p \n", "151689 1168873095614160896 polizeiberlin_p \n",
"\n", "\n",
" user_name \n", " user_name \n",
"0 Polizei Oldenburg-Stadt/Ammerland \n", "0 Polizei Oldenburg-Stadt/Ammerland \n",
@ -400,19 +408,19 @@
"[151690 rows x 12 columns]" "[151690 rows x 12 columns]"
] ]
}, },
"execution_count": 49, "execution_count": 119,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Convert datatypes to appropriate one\n", "# Convert Counts to integer values\n",
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(np.NAN).astype(int)\n", "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n",
"tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n",
" created_at = pd.to_datetime(tweets_combined['created_at']),\n", " created_at = pd.to_datetime(tweets_combined['created_at']),\n",
" handle = tweets_combined['handle'].str.lower(), # handle to lower case\n", " handle = tweets_combined['handle'].str.lower(),\n",
" is_deleted = tweets_combined['is_deleted'].astype('boolean')) # is deleted column as boolean variable\n", " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n",
"tweets_combined#.to_csv(\"data/tweets_all_combined.csv\")" "tweets_combined"
] ]
}, },
{ {
@ -464,7 +472,7 @@
"source": [ "source": [
"## Metadaten \n", "## Metadaten \n",
"\n", "\n",
"Welche Daten bilden die Grundlage?\n" "Welche Daten bilden die Grundlage?"
] ]
}, },
{ {
@ -529,7 +537,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 114,
"id": "4f1e8c6c-3610-436e-899e-4d0307259230", "id": "4f1e8c6c-3610-436e-899e-4d0307259230",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -544,7 +552,7 @@
} }
], ],
"source": [ "source": [
"print(\"Die Tweets wurden vom\", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n",
"# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag"
] ]
}, },
@ -560,7 +568,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 122,
"id": "9373552e-6baf-46df-ae16-c63603e20a83", "id": "9373552e-6baf-46df-ae16-c63603e20a83",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -673,7 +681,7 @@
"61 Hamburg 53.550341 10.000654 " "61 Hamburg 53.550341 10.000654 "
] ]
}, },
"execution_count": 11, "execution_count": 122,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -702,23 +710,31 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 123,
"id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f", "id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n",
" for col_name, dtype in df.dtypes.iteritems():\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
"\n", "\n",
"<div id=\"altair-viz-7b78525a62b243eca7b1f4044a328f47\"></div>\n", "<div id=\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\"></div>\n",
"<script type=\"text/javascript\">\n", "<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n", " var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n", " (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n", " let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-7b78525a62b243eca7b1f4044a328f47\") {\n", " if (outputDiv.id !== \"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\") {\n",
" outputDiv = document.getElementById(\"altair-viz-7b78525a62b243eca7b1f4044a328f47\");\n", " outputDiv = document.getElementById(\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\");\n",
" }\n", " }\n",
" const paths = {\n", " const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
@ -764,14 +780,14 @@
" .catch(showError)\n", " .catch(showError)\n",
" .then(() => displayChart(vegaEmbed));\n", " .then(() => displayChart(vegaEmbed));\n",
" }\n", " }\n",
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"ordinal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n", " })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"nominal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": 
\"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": \"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, 
\"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
"</script>" "</script>"
], ],
"text/plain": [ "text/plain": [
"alt.Chart(...)" "alt.Chart(...)"
] ]
}, },
"execution_count": 13, "execution_count": 123,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -779,7 +795,7 @@
"source": [ "source": [
"barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n", "barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n",
" x = 'count:Q',\n", " x = 'count:Q',\n",
" y = alt.Y('handle:O', sort = '-x'),\n", " y = alt.Y('handle:N', sort = '-x'),\n",
")\n", ")\n",
"barchart " "barchart "
] ]
@ -796,7 +812,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 125,
"id": "d0549250-b11f-4762-8500-1134c53303b4", "id": "d0549250-b11f-4762-8500-1134c53303b4",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -804,377 +820,22 @@
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>tweet_id</th>\n",
" <th>like_count</th>\n",
" <th>retweet_count</th>\n",
" <th>reply_count</th>\n",
" <th>quote_count</th>\n",
" <th>measured_at</th>\n",
" <th>is_deleted</th>\n",
" <th>tweet_text</th>\n",
" <th>created_at</th>\n",
" <th>user_id</th>\n",
" <th>handle</th>\n",
" <th>user_name</th>\n",
" <th>Name</th>\n",
" <th>Typ</th>\n",
" <th>Bundesland</th>\n",
" <th>Stadt</th>\n",
" <th>LAT</th>\n",
" <th>LONG</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3053</td>\n",
" <td>1609539240458878979</td>\n",
" <td>21455</td>\n",
" <td>1845</td>\n",
" <td>3643</td>\n",
" <td>341</td>\n",
" <td>2023-01-05 14:44:34</td>\n",
" <td>False</td>\n",
" <td>Die Gewalt, die unsere Kolleginnen &amp;amp; Kollegen in der Silvesternacht erleben mussten, ist une...</td>\n",
" <td>2023-01-01 13:17:13</td>\n",
" <td>2397974054</td>\n",
" <td>polizeiberlin</td>\n",
" <td>Polizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1331</td>\n",
" <td>1355179228396879872</td>\n",
" <td>19186</td>\n",
" <td>3386</td>\n",
" <td>1203</td>\n",
" <td>628</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...</td>\n",
" <td>2021-01-29 15:41:20</td>\n",
" <td>2397974054</td>\n",
" <td>polizeiberlin</td>\n",
" <td>Polizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>91693</td>\n",
" <td>1505620459148173316</td>\n",
" <td>15708</td>\n",
" <td>7098</td>\n",
" <td>186</td>\n",
" <td>540</td>\n",
" <td>2022-03-24 20:15:08</td>\n",
" <td>False</td>\n",
" <td>WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...</td>\n",
" <td>2022-03-20 19:01:05</td>\n",
" <td>2389161066</td>\n",
" <td>polizei_nrw_bn</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Bonn</td>\n",
" <td>50.735851</td>\n",
" <td>7.10066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>91695</td>\n",
" <td>1505620666476896259</td>\n",
" <td>10337</td>\n",
" <td>1539</td>\n",
" <td>59</td>\n",
" <td>35</td>\n",
" <td>2022-03-24 20:15:08</td>\n",
" <td>False</td>\n",
" <td>Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...</td>\n",
" <td>2022-03-20 19:01:54</td>\n",
" <td>2389161066</td>\n",
" <td>polizei_nrw_bn</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Bonn</td>\n",
" <td>50.735851</td>\n",
" <td>7.10066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>122631</td>\n",
" <td>1359098196434292739</td>\n",
" <td>9471</td>\n",
" <td>642</td>\n",
" <td>128</td>\n",
" <td>102</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2</td>\n",
" <td>2021-02-09 11:13:55</td>\n",
" <td>4876039738</td>\n",
" <td>bpol_b</td>\n",
" <td>Bundespolizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151685</th>\n",
" <td>7569</td>\n",
" <td>1332625325654757377</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...</td>\n",
" <td>2020-11-28 10:00:11</td>\n",
" <td>223758384</td>\n",
" <td>polizeisachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei</td>\n",
" <td>Sachsen</td>\n",
" <td>Dresden</td>\n",
" <td>51.0493286</td>\n",
" <td>13.7381437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151686</th>\n",
" <td>7572</td>\n",
" <td>1332738525507186692</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...</td>\n",
" <td>2020-11-28 17:30:00</td>\n",
" <td>223758384</td>\n",
" <td>polizeisachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei</td>\n",
" <td>Sachsen</td>\n",
" <td>Dresden</td>\n",
" <td>51.0493286</td>\n",
" <td>13.7381437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151687</th>\n",
" <td>144702</td>\n",
" <td>1465679768494526467</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...</td>\n",
" <td>2021-11-30 13:51:02</td>\n",
" <td>4876085224</td>\n",
" <td>bpol_nord</td>\n",
" <td>Bundespolizei Nord</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151688</th>\n",
" <td>144701</td>\n",
" <td>1464124290605977600</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>@gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...</td>\n",
" <td>2021-11-26 06:50:07</td>\n",
" <td>4876085224</td>\n",
" <td>bpol_nord</td>\n",
" <td>Bundespolizei Nord</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151689</th>\n",
" <td>66854</td>\n",
" <td>1376453040283209728</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>#Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...</td>\n",
" <td>2021-03-29 08:35:52</td>\n",
" <td>2389263558</td>\n",
" <td>polizei_nrw_un</td>\n",
" <td>Polizei NRW UN</td>\n",
" <td>Polizei NRW UN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Unna</td>\n",
" <td>51.5348835</td>\n",
" <td>7.689014</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>151690 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
" index tweet_id like_count retweet_count reply_count \\\n", "0 Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une...\n",
"0 3053 1609539240458878979 21455 1845 3643 \n", "1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n",
"1 1331 1355179228396879872 19186 3386 1203 \n", "2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n",
"2 91693 1505620459148173316 15708 7098 186 \n", "3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n",
"3 91695 1505620666476896259 10337 1539 59 \n", "4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n",
"4 122631 1359098196434292739 9471 642 128 \n", " ... \n",
"... ... ... ... ... ... \n", "151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n",
"151685 7569 1332625325654757377 -99 -99 -99 \n", "151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n",
"151686 7572 1332738525507186692 -99 -99 -99 \n", "151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n",
"151687 144702 1465679768494526467 -99 -99 -99 \n", "151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n",
"151688 144701 1464124290605977600 -99 -99 -99 \n", "151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n",
"151689 66854 1376453040283209728 -99 -99 -99 \n", "Name: tweet_text, Length: 151690, dtype: object"
"\n",
" quote_count measured_at is_deleted \\\n",
"0 341 2023-01-05 14:44:34 False \n",
"1 628 NaT NaN \n",
"2 540 2022-03-24 20:15:08 False \n",
"3 35 2022-03-24 20:15:08 False \n",
"4 102 NaT NaN \n",
"... ... ... ... \n",
"151685 -99 NaT NaN \n",
"151686 -99 NaT NaN \n",
"151687 -99 NaT NaN \n",
"151688 -99 NaT NaN \n",
"151689 -99 NaT NaN \n",
"\n",
" tweet_text \\\n",
"0 Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une... \n",
"1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T... \n",
"2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a... \n",
"3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da... \n",
"4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2 \n",
"... ... \n",
"151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ... \n",
"151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ... \n",
"151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV... \n",
"151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt... \n",
"151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F... \n",
"\n",
" created_at user_id handle user_name \\\n",
"0 2023-01-01 13:17:13 2397974054 polizeiberlin Polizei Berlin \n",
"1 2021-01-29 15:41:20 2397974054 polizeiberlin Polizei Berlin \n",
"2 2022-03-20 19:01:05 2389161066 polizei_nrw_bn Polizei NRW BN \n",
"3 2022-03-20 19:01:54 2389161066 polizei_nrw_bn Polizei NRW BN \n",
"4 2021-02-09 11:13:55 4876039738 bpol_b Bundespolizei Berlin \n",
"... ... ... ... ... \n",
"151685 2020-11-28 10:00:11 223758384 polizeisachsen Polizei Sachsen \n",
"151686 2020-11-28 17:30:00 223758384 polizeisachsen Polizei Sachsen \n",
"151687 2021-11-30 13:51:02 4876085224 bpol_nord Bundespolizei Nord \n",
"151688 2021-11-26 06:50:07 4876085224 bpol_nord Bundespolizei Nord \n",
"151689 2021-03-29 08:35:52 2389263558 polizei_nrw_un Polizei NRW UN \n",
"\n",
" Name Typ Bundesland Stadt LAT \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
"3 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
"4 NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... \n",
"151685 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
"151686 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
"151687 NaN NaN NaN NaN NaN \n",
"151688 NaN NaN NaN NaN NaN \n",
"151689 Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 \n",
"\n",
" LONG \n",
"0 NaN \n",
"1 NaN \n",
"2 7.10066 \n",
"3 7.10066 \n",
"4 NaN \n",
"... ... \n",
"151685 13.7381437 \n",
"151686 13.7381437 \n",
"151687 NaN \n",
"151688 NaN \n",
"151689 7.689014 \n",
"\n",
"[151690 rows x 19 columns]"
] ]
}, },
"execution_count": 14, "execution_count": 125,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1184,14 +845,14 @@
" on = \"handle\",\n", " on = \"handle\",\n",
" how = \"left\")\n", " how = \"left\")\n",
"pd.options.display.max_colwidth = 100\n", "pd.options.display.max_colwidth = 100\n",
"tweets_attention.sort_values('like_count', ascending = False).reset_index()\n", "tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n",
"\n" "\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 42, "execution_count": 90,
"id": "621a3b74-e909-435c-8820-b38b63aa4893", "id": "97952234-7957-421e-bd2c-2c8261992c5a",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
@ -1311,12 +972,144 @@
"[11559 rows x 3 columns]" "[11559 rows x 3 columns]"
] ]
}, },
"execution_count": 42, "execution_count": 90,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [] "source": [
"old = pd.read_csv(\"data/user_old.tsv\",sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} )\n",
"new = pd.read_csv(\"data/tweets-1679742702794.csv\").rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"})\n",
"new"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "ed86b45e-9dd8-436d-9c96-15500ed93985",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>count</th>\n",
" </tr>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>223758384</th>\n",
" <th>Polizei Sachsen</th>\n",
" <td>5340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259607457</th>\n",
" <th>Polizei NRW K</th>\n",
" <td>2544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>424895827</th>\n",
" <th>Polizei Stuttgart</th>\n",
" <td>1913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>769128278</th>\n",
" <th>Polizei NRW DO</th>\n",
" <td>4895</td>\n",
" </tr>\n",
" <tr>\n",
" <th>775664780</th>\n",
" <th>Polizei Rostock</th>\n",
" <td>604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1169206134189830145</th>\n",
" <th>Polizei Stendal</th>\n",
" <td>842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1184022676488314880</th>\n",
" <th>Polizei Pforzheim</th>\n",
" <td>283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1184024283342950401</th>\n",
" <th>Polizei Ravensburg</th>\n",
" <td>460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1232548941889228808</th>\n",
" <th>Systemstratege:</th>\n",
" <td>168</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1295978598034284546</th>\n",
" <th>Polizei ZPD NI</th>\n",
" <td>133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>163 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" count\n",
"user_id user_name \n",
"223758384 Polizei Sachsen 5340\n",
"259607457 Polizei NRW K 2544\n",
"424895827 Polizei Stuttgart 1913\n",
"769128278 Polizei NRW DO 4895\n",
"775664780 Polizei Rostock 604\n",
"... ...\n",
"1169206134189830145 Polizei Stendal 842\n",
"1184022676488314880 Polizei Pforzheim 283\n",
"1184024283342950401 Polizei Ravensburg 460\n",
"1232548941889228808 Systemstratege: 168\n",
"1295978598034284546 Polizei ZPD NI 133\n",
"\n",
"[163 rows x 1 columns]"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n",
" )[\"user_id\"].aggregate(['count']\n",
" )"
]
} }
], ],
"metadata": { "metadata": {

Binary file not shown.

View file

@ -1 +1 @@
{"data":{"layout-restorer:data":{"main":{"dock":{"type":"split-area","orientation":"horizontal","sizes":[0.9093610698365527,0.09063893016344725],"children":[{"type":"tab-area","currentIndex":0,"widgets":["notebook:zusammenfassung.ipynb","editor:merge_police_tweets.py"]},{"type":"tab-area","currentIndex":0,"widgets":["inspector:inspector"]}]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.26146046543024176,0.7385395345697582,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}},"inspector:inspector":{"data":{}},"editor:merge_police_tweets.py":{"data":{"path":"merge_police_tweets.py","factory":"Editor"}}},"metadata":{"id":"default"}} {"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:zusammenfassung.ipynb"]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17943235504652827,0.8205676449534718,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}}

View file

@ -1,12 +0,0 @@
# Copbird Website Project
In diesem Repository sollen relevante Daten und Code für die Copbird-Webseite gesammelt werden. Alle Daten befinden sich im Verzeichnis `data/`. Im Verzeichnis `ergebnisse_hackathon_repo/` befindet sich ein Fork des Ergebnis-Repos vom Hackathon. Im Jupyter-Notebook `zusammenfassung.ipynb` sollten relevante Analyseschritte dokumentiert werden, bisher ist dort aber vornehmlich eine erneute Datenaufbereitung dokumentiert. Zur besseren Interoperabilität ist der Code zur Datenaufbereitung auch im Skript `merge_police_tweets.py` ausgeführt (identisch).
## Requirements
Um die Jupyter-Umgebung dieses Repositorys zu nutzen, empfiehlt es sich, auf die bereits initialisierte Nix-Flake zurückzugreifen. Installationsanweisungen sind auf der Webseite https://jupyenv.io/documentation/getting-started/ dokumentiert. Im Wesentlichen benötigt man:
- einen Nix daemon
- Nix-Flakes (zu aktivieren in der nix.conf)
und kann dann `nix run` ausführen.

File diff suppressed because it is too large Load diff

View file

@ -1,50 +0,0 @@
import numpy as np
import pandas as pd
# Merge the tables produced by the old (~2021) and the new (~2022) scraper
# into a single data frame `tweets_combined` (one row per tweet, enriched
# with its statistics and the author's latest handle / user name).

## cols: hashtag, url, mention (same for both)
tweets_meta = pd.concat([
    pd.read_csv("data/entity_old.tsv", sep="\t"),  # data from old scraper
    pd.read_csv("data/tweets.csv"),                # data from new scraper
])

# The old tweet table feeds both the text and the statistics frame below,
# so it is parsed only once.
tweet_old = pd.read_csv("data/tweet_old.tsv", sep="\t")

## cols: id, tweet_text, created_at, user_id; only a subset of the old table
tweets_text = pd.concat([
    tweet_old[["id", "tweet_text", "created_at", "user_id"]].rename(
        columns={"id": "tweet_id"}
    ),
    pd.read_csv("data/tweets-1679742698645.csv"),
])

## cols: id, like_count, retweet_count, reply_count, quote_count; only a
## subset of the old table
count_columns = ["like_count", "retweet_count", "reply_count", "quote_count"]
tweets_statistics = pd.concat([
    tweet_old[["id"] + count_columns].rename(columns={"id": "tweet_id"}),
    pd.read_csv("data/tweets-1679742620302.csv"),
])

## cols: user_id, handle, user_name; column names do not match between the
## old and the new data. Username and handle even seem to be swapped in the
## new data set, hence the "reversed" rename below.
## Info: only a small number of user_ids appear in both data sets, and for
## those the user name has occasionally changed, so the two tables cannot
## simply be concatenated.
users_old = pd.read_csv("data/user_old.tsv", sep="\t").rename(
    columns={"id": "user_id", "name": "user_name"}  # uniform names
)
users_new = pd.read_csv("data/tweets-1679742702794.csv").rename(
    columns={"username": "handle", "handle": "user_name"}  # swap col names
)
tweets_user = users_old.merge(
    users_new,
    on="user_id",                  # user_id as matching column
    how="outer",                   # keep all unique user_ids
    suffixes=("_2021", "_2022"),   # mark which scraper a column came from
)

## Some user names belonging to one user_id have changed over time. For
## easier handling only the latest handle / user name is kept: take the 2022
## value and fall back to the 2021 value where the 2022 one is missing.
tweets_user = tweets_user.assign(
    handle=tweets_user["handle_2022"].fillna(tweets_user["handle_2021"]),
    user_name=tweets_user["user_name_2022"].fillna(tweets_user["user_name_2021"]),
).drop(
    ["handle_2021", "handle_2022", "user_name_2021", "user_name_2022"],
    axis=1,  # no longer needed
)

## Additional information concerning the police stations
## cols: handle, name, typ, bundesland, stadt, lat, long
police_stations = pd.read_csv(
    "data/polizei_accounts_geo.csv", sep="\t"
).rename(columns={"Polizei Account": "handle"})

# Merge statistics, tweet text and user information into one data frame
tweets_combined = (
    pd.merge(tweets_statistics, tweets_text, on="tweet_id")
    .merge(tweets_user, on="user_id")
    .drop(["id"], axis=1)  # drop unnecessary id column (redundant to index)
)

# Convert data types to appropriate ones.
# Missing counts cannot be cast to int while they are NaN, so they are
# replaced by the sentinel -99 first (the value that shows up for tweets
# without statistics in the zusammenfassung.ipynb output).
tweets_combined[count_columns] = (
    tweets_combined[count_columns].fillna(-99).astype(int)
)
tweets_combined = tweets_combined.assign(
    measured_at=pd.to_datetime(tweets_combined["measured_at"]),  # to datetime
    created_at=pd.to_datetime(tweets_combined["created_at"]),
    handle=tweets_combined["handle"].str.lower(),                # lower-case handle
    is_deleted=tweets_combined["is_deleted"].astype("boolean"),  # nullable bool
)

# Export is intentionally disabled; uncomment to write the merged table.
# tweets_combined.to_csv("data/tweets_all_combined.csv")

View file

@ -12,7 +12,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2", "id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -37,7 +37,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 117,
"id": "fcc48831-7999-4d79-b722-736715b1ced6", "id": "fcc48831-7999-4d79-b722-736715b1ced6",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -46,50 +46,48 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"((479991, 3), (151690, 8), (151690, 4), (13327, 3), (163, 7))" "((479991, 3), (151690, 8), (151690, 4), (13327, 3))"
] ]
}, },
"execution_count": 45, "execution_count": 117,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Merging different table of old (~2021) and new (~2022) scraper\n",
"\n",
"## cols: hashtag, url, mention (same for both)\n",
"tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n", "tweets_meta = pd.concat([pd.read_csv(\"data/entity_old.tsv\", sep = \"\\t\"), # data from old scraper\n",
" pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n", " pd.read_csv(\"data/tweets.csv\")]) # data from new scraper\n",
"\n", "\n",
"## cols: id, tweet_text, created_at, user_id; only subset from old table (same tsv used in next step)\n", "tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
"tweets_text = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id','tweet_text', 'created_at', 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n", " 'tweet_text', \n",
" 'created_at', \n",
" 'user_id']].rename(columns = {\"id\":\"tweet_id\"}),\n",
" pd.read_csv(\"data/tweets-1679742698645.csv\")])\n", " pd.read_csv(\"data/tweets-1679742698645.csv\")])\n",
"\n", "\n",
"## cols: id, like_count, retweet_count, reply_count, quote_count; only subset from old table\n", "tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', \n",
"tweets_statistics = pd.concat([pd.read_csv(\"data/tweet_old.tsv\", sep = \"\\t\")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n", " 'like_count', \n",
" 'retweet_count', \n",
" 'reply_count', \n",
" 'quote_count']].rename(columns = {\"id\":\"tweet_id\"}),\n",
" pd.read_csv(\"data/tweets-1679742620302.csv\")])\n", " pd.read_csv(\"data/tweets-1679742620302.csv\")])\n",
"\n", "\n",
"## cols: user_id, handle, user_name; colnames do not match betweend old an new data. Even username and handle seem to be mixed up in new data set (inverse order)\n",
"## Info: Only a small amount of user_ids appear in both data sets, but if so username occasionaly have changed an therefore can not easily be merged\n",
"tweets_user = pd.read_csv(\"data/user_old.tsv\", \n", "tweets_user = pd.read_csv(\"data/user_old.tsv\", \n",
" sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} # uniform names\n", " sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"}\n",
" ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\" # merge with renamed new data\n", " ).merge(pd.read_csv(\"data/tweets-1679742702794.csv\"\n",
" ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}), # reverse col names\n", " ).rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"}),\n",
" on = \"user_id\", # user_id as matching column\n", " on = \"user_id\",\n",
" how = \"outer\", # keep all unique uer_ids\n", " how = \"outer\",\n",
" suffixes = [\"_2021\", \"_2022\"]) # identify column where username and label came from\n", " suffixes = [\"_2021\", \"_2022\"])\n",
"\n", "\n",
"## Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept.\n", "# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n",
"tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n", "tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n",
" user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n", " user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n",
" ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # no longer needed\n", " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n",
"\n", "\n",
"## addiditional information concerning the police stations\n", "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n",
"## cols: handle, name, typ, bundesland, stadt, lat, long\n",
"police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" \n",
" ).rename(columns = {\"Polizei Account\": \"handle\"})\n", " ).rename(columns = {\"Polizei Account\": \"handle\"})\n",
"\n", "\n",
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape, police_stations.shape" "tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape"
] ]
}, },
{ {
@ -102,14 +100,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 118,
"id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3", "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# Merge statistics, tweet text and user information in one data frame\n", "# Merge like statistics, tweet text and user information in one data frame\n",
"tweets_combined = pd.merge(tweets_statistics, \n", "tweets_combined = pd.merge(tweets_statistics, \n",
" tweets_text,\n", " tweets_text,\n",
" on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n",
@ -119,12 +117,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 119,
"id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d", "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
" output = repr(obj)\n",
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
" return method()\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
@ -169,8 +177,8 @@
" <td>2</td>\n", " <td>2</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a...</td>\n", " <td>@mahanna196 Da die Stadt keine Ausnahme für Ra...</td>\n",
" <td>2020-10-27 09:29:13</td>\n", " <td>2020-10-27 09:29:13</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -184,7 +192,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@mahanna196 Ja. *sr</td>\n", " <td>@mahanna196 Ja. *sr</td>\n",
" <td>2020-10-27 10:35:38</td>\n", " <td>2020-10-27 10:35:38</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
@ -199,8 +207,8 @@
" <td>3</td>\n", " <td>3</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F...</td>\n", " <td>#Aktuell Auf dem ehem. Bundeswehrkrankenhausge...</td>\n",
" <td>2020-10-27 12:36:26</td>\n", " <td>2020-10-27 12:36:26</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -214,8 +222,8 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr</td>\n", " <td>@Emma36166433 Bitte lesen Sie unseren Tweet 2/...</td>\n",
" <td>2020-10-27 12:59:06</td>\n", " <td>2020-10-27 12:59:06</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -229,8 +237,8 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>NaN</td>\n",
" <td>In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be...</td>\n", " <td>In der vergangenen Woche wurde die Wohnung des...</td>\n",
" <td>2020-10-27 13:57:32</td>\n", " <td>2020-10-27 13:57:32</td>\n",
" <td>778895426007203840</td>\n", " <td>778895426007203840</td>\n",
" <td>polizei_ol</td>\n", " <td>polizei_ol</td>\n",
@ -260,7 +268,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-19 13:40:36</td>\n", " <td>2023-02-19 13:40:36</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze...</td>\n", " <td>#Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...</td>\n",
" <td>2023-02-15 12:06:07</td>\n", " <td>2023-02-15 12:06:07</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -275,7 +283,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-25 13:14:49</td>\n", " <td>2023-02-25 13:14:49</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp;amp; Drogen + ...</td>\n", " <td>Unser Präventionsteam vom #A44 berät heute und...</td>\n",
" <td>2023-02-21 12:10:00</td>\n", " <td>2023-02-21 12:10:00</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -290,7 +298,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-25 13:14:49</td>\n", " <td>2023-02-25 13:14:49</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp;amp; Drogen im Straßenverkehr beraten...</td>\n", " <td>Auch unser #A52 war heute aktiv und hat zum Th...</td>\n",
" <td>2023-02-21 12:12:48</td>\n", " <td>2023-02-21 12:12:48</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -305,7 +313,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-26 13:15:05</td>\n", " <td>2023-02-26 13:15:05</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb...</td>\n", " <td>Gestern führte unser #A13 in einer Wohnsiedlun...</td>\n",
" <td>2023-02-22 11:15:58</td>\n", " <td>2023-02-22 11:15:58</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -320,7 +328,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>2023-02-27 12:17:33</td>\n", " <td>2023-02-27 12:17:33</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk...</td>\n", " <td>Auf dem Gelände der @BUFAStudios (Oberlandstr....</td>\n",
" <td>2023-02-23 10:53:07</td>\n", " <td>2023-02-23 10:53:07</td>\n",
" <td>1168873095614160896</td>\n", " <td>1168873095614160896</td>\n",
" <td>polizeiberlin_p</td>\n", " <td>polizeiberlin_p</td>\n",
@ -346,11 +354,11 @@
"151689 1628709531998998529 10 1 0 \n", "151689 1628709531998998529 10 1 0 \n",
"\n", "\n",
" quote_count measured_at is_deleted \\\n", " quote_count measured_at is_deleted \\\n",
"0 0 NaT <NA> \n", "0 0 NaT NaN \n",
"1 0 NaT <NA> \n", "1 0 NaT NaN \n",
"2 0 NaT <NA> \n", "2 0 NaT NaN \n",
"3 0 NaT <NA> \n", "3 0 NaT NaN \n",
"4 0 NaT <NA> \n", "4 0 NaT NaN \n",
"... ... ... ... \n", "... ... ... ... \n",
"151685 0 2023-02-19 13:40:36 False \n", "151685 0 2023-02-19 13:40:36 False \n",
"151686 0 2023-02-25 13:14:49 False \n", "151686 0 2023-02-25 13:14:49 False \n",
@ -358,31 +366,31 @@
"151688 0 2023-02-26 13:15:05 False \n", "151688 0 2023-02-26 13:15:05 False \n",
"151689 0 2023-02-27 12:17:33 False \n", "151689 0 2023-02-27 12:17:33 False \n",
"\n", "\n",
" tweet_text \\\n", " tweet_text created_at \\\n",
"0 @mahanna196 Da die Stadt keine Ausnahme für Radfahrer aufgeführt hat, gilt diese (Stand jetzt) a... \n", "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n",
"1 @mahanna196 Ja. *sr \n", "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n",
"2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausgelände in #Rostrup wurde ein Sprengsatz gefunden. F... \n", "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n",
"3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/2 *sr \n", "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n",
"4 In der vergangenen Woche wurde die Wohnung des Tatverdächtigen durchsucht. Dabei stellten die Be... \n", "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n",
"... ... \n", "... ... ... \n",
"151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 hat zu diesem Thema wieder einmal die Puppen tanze... \n", "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n",
"151686 Unser Präventionsteam vom #A44 berät heute und morgen tagsüber zum Thema Alkohol &amp; Drogen + ... \n", "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n",
"151687 Auch unser #A52 war heute aktiv und hat zum Thema Alkohol &amp; Drogen im Straßenverkehr beraten... \n", "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n",
"151688 Gestern führte unser #A13 in einer Wohnsiedlung einen Präventionseinsatz zum Thema „Wohnraumeinb... \n", "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n",
"151689 Auf dem Gelände der @BUFAStudios (Oberlandstr. 26-35) findet heute die #Seniorenmesse vom Bezirk... \n", "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n",
"\n", "\n",
" created_at user_id handle \\\n", " user_id handle \\\n",
"0 2020-10-27 09:29:13 778895426007203840 polizei_ol \n", "0 778895426007203840 polizei_ol \n",
"1 2020-10-27 10:35:38 778895426007203840 polizei_ol \n", "1 778895426007203840 polizei_ol \n",
"2 2020-10-27 12:36:26 778895426007203840 polizei_ol \n", "2 778895426007203840 polizei_ol \n",
"3 2020-10-27 12:59:06 778895426007203840 polizei_ol \n", "3 778895426007203840 polizei_ol \n",
"4 2020-10-27 13:57:32 778895426007203840 polizei_ol \n", "4 778895426007203840 polizei_ol \n",
"... ... ... ... \n", "... ... ... \n",
"151685 2023-02-15 12:06:07 1168873095614160896 polizeiberlin_p \n", "151685 1168873095614160896 polizeiberlin_p \n",
"151686 2023-02-21 12:10:00 1168873095614160896 polizeiberlin_p \n", "151686 1168873095614160896 polizeiberlin_p \n",
"151687 2023-02-21 12:12:48 1168873095614160896 polizeiberlin_p \n", "151687 1168873095614160896 polizeiberlin_p \n",
"151688 2023-02-22 11:15:58 1168873095614160896 polizeiberlin_p \n", "151688 1168873095614160896 polizeiberlin_p \n",
"151689 2023-02-23 10:53:07 1168873095614160896 polizeiberlin_p \n", "151689 1168873095614160896 polizeiberlin_p \n",
"\n", "\n",
" user_name \n", " user_name \n",
"0 Polizei Oldenburg-Stadt/Ammerland \n", "0 Polizei Oldenburg-Stadt/Ammerland \n",
@ -400,19 +408,19 @@
"[151690 rows x 12 columns]" "[151690 rows x 12 columns]"
] ]
}, },
"execution_count": 49, "execution_count": 119,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Convert datatypes to appropriate one\n", "# Convert Counts to integer values\n",
"tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(np.NAN).astype(int)\n", "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n",
"tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n",
" created_at = pd.to_datetime(tweets_combined['created_at']),\n", " created_at = pd.to_datetime(tweets_combined['created_at']),\n",
" handle = tweets_combined['handle'].str.lower(), # handle to lower case\n", " handle = tweets_combined['handle'].str.lower(),\n",
" is_deleted = tweets_combined['is_deleted'].astype('boolean')) # is deleted column as boolean variable\n", " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n",
"tweets_combined#.to_csv(\"data/tweets_all_combined.csv\")" "tweets_combined"
] ]
}, },
{ {
@ -464,7 +472,7 @@
"source": [ "source": [
"## Metadaten \n", "## Metadaten \n",
"\n", "\n",
"Welche Daten bilden die Grundlage?\n" "Welche Daten bilden die Grundlage?"
] ]
}, },
{ {
@ -529,7 +537,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 114,
"id": "4f1e8c6c-3610-436e-899e-4d0307259230", "id": "4f1e8c6c-3610-436e-899e-4d0307259230",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -544,7 +552,7 @@
} }
], ],
"source": [ "source": [
"print(\"Die Tweets wurden vom\", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n",
"# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag"
] ]
}, },
@ -560,7 +568,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 122,
"id": "9373552e-6baf-46df-ae16-c63603e20a83", "id": "9373552e-6baf-46df-ae16-c63603e20a83",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -673,7 +681,7 @@
"61 Hamburg 53.550341 10.000654 " "61 Hamburg 53.550341 10.000654 "
] ]
}, },
"execution_count": 11, "execution_count": 122,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -702,23 +710,31 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 123,
"id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f", "id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n",
" for col_name, dtype in df.dtypes.iteritems():\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
"\n", "\n",
"<div id=\"altair-viz-7b78525a62b243eca7b1f4044a328f47\"></div>\n", "<div id=\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\"></div>\n",
"<script type=\"text/javascript\">\n", "<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n", " var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n", " (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n", " let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-7b78525a62b243eca7b1f4044a328f47\") {\n", " if (outputDiv.id !== \"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\") {\n",
" outputDiv = document.getElementById(\"altair-viz-7b78525a62b243eca7b1f4044a328f47\");\n", " outputDiv = document.getElementById(\"altair-viz-c1c17c98428f4353a3eca9bd87ef6517\");\n",
" }\n", " }\n",
" const paths = {\n", " const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
@ -764,14 +780,14 @@
" .catch(showError)\n", " .catch(showError)\n",
" .then(() => displayChart(vegaEmbed));\n", " .then(() => displayChart(vegaEmbed));\n",
" }\n", " }\n",
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"ordinal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": \"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": 
\"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, \"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n", " })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-59538db49feb940cb722f8834432bfab\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"field\": \"count\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"handle\", \"sort\": \"-x\", \"type\": \"nominal\"}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-59538db49feb940cb722f8834432bfab\": [{\"handle\": 
\"polizei_ffm\", \"count\": 5512, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeisachsen\", \"count\": 5340, \"Name\": \"Polizei Sachsen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen\", \"Stadt\": \"Dresden\", \"LAT\": \"51.0493286\", \"LONG\": \"13.7381437\"}, {\"handle\": \"polizei_nrw_do\", \"count\": 4895, \"Name\": \"Polizei NRW DO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Dortmund\", \"LAT\": \"51.5142273\", \"LONG\": \"7.4652789\"}, {\"handle\": \"polizeibb\", \"count\": 4323, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizeihamburg\", \"count\": 4042, \"Name\": \"Polizei Hamburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Hamburg\", \"Stadt\": \"Hamburg\", \"LAT\": \"53.550341\", \"LONG\": \"10.000654\"}, {\"handle\": \"polizeimuenchen\", \"count\": 3951, \"Name\": \"Polizei M\\u00fcnchen\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"M\\u00fcnchen\", \"LAT\": \"48.135125\", \"LONG\": \"11.581981\"}, {\"handle\": \"polizeimfr\", \"count\": 3317, \"Name\": \"Polizei Mittelfranken\", \"Typ\": \"Polizei\", \"Bundesland\": \"Bayern\", \"Stadt\": \"N\\u00fcrnberg\", \"LAT\": \"49.453872\", \"LONG\": \"11.077298\"}, {\"handle\": \"polizeimannheim\", \"count\": 3300, \"Name\": \"Polizei Mannheim\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Mannheim\", \"LAT\": \"49.4892913\", \"LONG\": \"8.4673098\"}, {\"handle\": \"bremenpolizei\", \"count\": 2664, \"Name\": null, \"Typ\": null, \"Bundesland\": null, \"Stadt\": null, \"LAT\": null, \"LONG\": null}, {\"handle\": \"polizei_ka\", \"count\": 2568, \"Name\": \"Polizei Karlsruhe\", \"Typ\": \"Polizei\", \"Bundesland\": \"Baden-W\\u00fcrttemberg\", \"Stadt\": \"Karlsruhe\", \"LAT\": \"49.0068705\", \"LONG\": \"8.4034195\"}, {\"handle\": \"polizei_nrw_k\", \"count\": 2544, 
\"Name\": \"Polizei NRW K\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"K\\u00f6ln\", \"LAT\": \"50.938361\", \"LONG\": \"6.959974\"}, {\"handle\": \"polizei_nrw_bo\", \"count\": 2367, \"Name\": \"Polizei NRW BO\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bochum\", \"LAT\": \"51.4818111\", \"LONG\": \"7.2196635\"}, {\"handle\": \"polizei_md\", \"count\": 2319, \"Name\": \"Polizei Magdeburg\", \"Typ\": \"Polizei\", \"Bundesland\": \"Sachsen-Anhalt\", \"Stadt\": \"Magdeburg\", \"LAT\": \"52.1315889\", \"LONG\": \"11.6399609\"}, {\"handle\": \"polizei_h\", \"count\": 2302, \"Name\": \"Polizei Hannover\", \"Typ\": \"Polizei\", \"Bundesland\": \"Niedersachsen\", \"Stadt\": \"Hannover\", \"LAT\": \"52.3744779\", \"LONG\": \"9.7385532\"}, {\"handle\": \"polizei_nrw_bi\", \"count\": 2299, \"Name\": \"Polizei NRW BI\", \"Typ\": \"Polizei\", \"Bundesland\": \"Nordrhein-Westfalen\", \"Stadt\": \"Bielefeld\", \"LAT\": \"52.0191005\", \"LONG\": \"8.531007\"}]}}, {\"mode\": \"vega-lite\"});\n",
"</script>" "</script>"
], ],
"text/plain": [ "text/plain": [
"alt.Chart(...)" "alt.Chart(...)"
] ]
}, },
"execution_count": 13, "execution_count": 123,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -779,7 +795,7 @@
"source": [ "source": [
"barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n", "barchart = alt.Chart(activy_police_vis[0:15]).mark_bar().encode(\n",
" x = 'count:Q',\n", " x = 'count:Q',\n",
" y = alt.Y('handle:O', sort = '-x'),\n", " y = alt.Y('handle:N', sort = '-x'),\n",
")\n", ")\n",
"barchart " "barchart "
] ]
@ -796,7 +812,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 125,
"id": "d0549250-b11f-4762-8500-1134c53303b4", "id": "d0549250-b11f-4762-8500-1134c53303b4",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -804,377 +820,22 @@
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>tweet_id</th>\n",
" <th>like_count</th>\n",
" <th>retweet_count</th>\n",
" <th>reply_count</th>\n",
" <th>quote_count</th>\n",
" <th>measured_at</th>\n",
" <th>is_deleted</th>\n",
" <th>tweet_text</th>\n",
" <th>created_at</th>\n",
" <th>user_id</th>\n",
" <th>handle</th>\n",
" <th>user_name</th>\n",
" <th>Name</th>\n",
" <th>Typ</th>\n",
" <th>Bundesland</th>\n",
" <th>Stadt</th>\n",
" <th>LAT</th>\n",
" <th>LONG</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3053</td>\n",
" <td>1609539240458878979</td>\n",
" <td>21455</td>\n",
" <td>1845</td>\n",
" <td>3643</td>\n",
" <td>341</td>\n",
" <td>2023-01-05 14:44:34</td>\n",
" <td>False</td>\n",
" <td>Die Gewalt, die unsere Kolleginnen &amp;amp; Kollegen in der Silvesternacht erleben mussten, ist une...</td>\n",
" <td>2023-01-01 13:17:13</td>\n",
" <td>2397974054</td>\n",
" <td>polizeiberlin</td>\n",
" <td>Polizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1331</td>\n",
" <td>1355179228396879872</td>\n",
" <td>19186</td>\n",
" <td>3386</td>\n",
" <td>1203</td>\n",
" <td>628</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...</td>\n",
" <td>2021-01-29 15:41:20</td>\n",
" <td>2397974054</td>\n",
" <td>polizeiberlin</td>\n",
" <td>Polizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>91693</td>\n",
" <td>1505620459148173316</td>\n",
" <td>15708</td>\n",
" <td>7098</td>\n",
" <td>186</td>\n",
" <td>540</td>\n",
" <td>2022-03-24 20:15:08</td>\n",
" <td>False</td>\n",
" <td>WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...</td>\n",
" <td>2022-03-20 19:01:05</td>\n",
" <td>2389161066</td>\n",
" <td>polizei_nrw_bn</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Bonn</td>\n",
" <td>50.735851</td>\n",
" <td>7.10066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>91695</td>\n",
" <td>1505620666476896259</td>\n",
" <td>10337</td>\n",
" <td>1539</td>\n",
" <td>59</td>\n",
" <td>35</td>\n",
" <td>2022-03-24 20:15:08</td>\n",
" <td>False</td>\n",
" <td>Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...</td>\n",
" <td>2022-03-20 19:01:54</td>\n",
" <td>2389161066</td>\n",
" <td>polizei_nrw_bn</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei NRW BN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Bonn</td>\n",
" <td>50.735851</td>\n",
" <td>7.10066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>122631</td>\n",
" <td>1359098196434292739</td>\n",
" <td>9471</td>\n",
" <td>642</td>\n",
" <td>128</td>\n",
" <td>102</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2</td>\n",
" <td>2021-02-09 11:13:55</td>\n",
" <td>4876039738</td>\n",
" <td>bpol_b</td>\n",
" <td>Bundespolizei Berlin</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151685</th>\n",
" <td>7569</td>\n",
" <td>1332625325654757377</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...</td>\n",
" <td>2020-11-28 10:00:11</td>\n",
" <td>223758384</td>\n",
" <td>polizeisachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei</td>\n",
" <td>Sachsen</td>\n",
" <td>Dresden</td>\n",
" <td>51.0493286</td>\n",
" <td>13.7381437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151686</th>\n",
" <td>7572</td>\n",
" <td>1332738525507186692</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...</td>\n",
" <td>2020-11-28 17:30:00</td>\n",
" <td>223758384</td>\n",
" <td>polizeisachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei Sachsen</td>\n",
" <td>Polizei</td>\n",
" <td>Sachsen</td>\n",
" <td>Dresden</td>\n",
" <td>51.0493286</td>\n",
" <td>13.7381437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151687</th>\n",
" <td>144702</td>\n",
" <td>1465679768494526467</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...</td>\n",
" <td>2021-11-30 13:51:02</td>\n",
" <td>4876085224</td>\n",
" <td>bpol_nord</td>\n",
" <td>Bundespolizei Nord</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151688</th>\n",
" <td>144701</td>\n",
" <td>1464124290605977600</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>@gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...</td>\n",
" <td>2021-11-26 06:50:07</td>\n",
" <td>4876085224</td>\n",
" <td>bpol_nord</td>\n",
" <td>Bundespolizei Nord</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151689</th>\n",
" <td>66854</td>\n",
" <td>1376453040283209728</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>-99</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>#Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...</td>\n",
" <td>2021-03-29 08:35:52</td>\n",
" <td>2389263558</td>\n",
" <td>polizei_nrw_un</td>\n",
" <td>Polizei NRW UN</td>\n",
" <td>Polizei NRW UN</td>\n",
" <td>Polizei</td>\n",
" <td>Nordrhein-Westfalen</td>\n",
" <td>Unna</td>\n",
" <td>51.5348835</td>\n",
" <td>7.689014</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>151690 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
" index tweet_id like_count retweet_count reply_count \\\n", "0 Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une...\n",
"0 3053 1609539240458878979 21455 1845 3643 \n", "1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n",
"1 1331 1355179228396879872 19186 3386 1203 \n", "2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n",
"2 91693 1505620459148173316 15708 7098 186 \n", "3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n",
"3 91695 1505620666476896259 10337 1539 59 \n", "4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n",
"4 122631 1359098196434292739 9471 642 128 \n", " ... \n",
"... ... ... ... ... ... \n", "151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n",
"151685 7569 1332625325654757377 -99 -99 -99 \n", "151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n",
"151686 7572 1332738525507186692 -99 -99 -99 \n", "151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n",
"151687 144702 1465679768494526467 -99 -99 -99 \n", "151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n",
"151688 144701 1464124290605977600 -99 -99 -99 \n", "151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n",
"151689 66854 1376453040283209728 -99 -99 -99 \n", "Name: tweet_text, Length: 151690, dtype: object"
"\n",
" quote_count measured_at is_deleted \\\n",
"0 341 2023-01-05 14:44:34 False \n",
"1 628 NaT NaN \n",
"2 540 2022-03-24 20:15:08 False \n",
"3 35 2022-03-24 20:15:08 False \n",
"4 102 NaT NaN \n",
"... ... ... ... \n",
"151685 -99 NaT NaN \n",
"151686 -99 NaT NaN \n",
"151687 -99 NaT NaN \n",
"151688 -99 NaT NaN \n",
"151689 -99 NaT NaN \n",
"\n",
" tweet_text \\\n",
"0 Die Gewalt, die unsere Kolleginnen &amp; Kollegen in der Silvesternacht erleben mussten, ist une... \n",
"1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T... \n",
"2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a... \n",
"3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da... \n",
"4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2 \n",
"... ... \n",
"151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ... \n",
"151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ... \n",
"151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV... \n",
"151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt... \n",
"151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F... \n",
"\n",
" created_at user_id handle user_name \\\n",
"0 2023-01-01 13:17:13 2397974054 polizeiberlin Polizei Berlin \n",
"1 2021-01-29 15:41:20 2397974054 polizeiberlin Polizei Berlin \n",
"2 2022-03-20 19:01:05 2389161066 polizei_nrw_bn Polizei NRW BN \n",
"3 2022-03-20 19:01:54 2389161066 polizei_nrw_bn Polizei NRW BN \n",
"4 2021-02-09 11:13:55 4876039738 bpol_b Bundespolizei Berlin \n",
"... ... ... ... ... \n",
"151685 2020-11-28 10:00:11 223758384 polizeisachsen Polizei Sachsen \n",
"151686 2020-11-28 17:30:00 223758384 polizeisachsen Polizei Sachsen \n",
"151687 2021-11-30 13:51:02 4876085224 bpol_nord Bundespolizei Nord \n",
"151688 2021-11-26 06:50:07 4876085224 bpol_nord Bundespolizei Nord \n",
"151689 2021-03-29 08:35:52 2389263558 polizei_nrw_un Polizei NRW UN \n",
"\n",
" Name Typ Bundesland Stadt LAT \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
"3 Polizei NRW BN Polizei Nordrhein-Westfalen Bonn 50.735851 \n",
"4 NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... \n",
"151685 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
"151686 Polizei Sachsen Polizei Sachsen Dresden 51.0493286 \n",
"151687 NaN NaN NaN NaN NaN \n",
"151688 NaN NaN NaN NaN NaN \n",
"151689 Polizei NRW UN Polizei Nordrhein-Westfalen Unna 51.5348835 \n",
"\n",
" LONG \n",
"0 NaN \n",
"1 NaN \n",
"2 7.10066 \n",
"3 7.10066 \n",
"4 NaN \n",
"... ... \n",
"151685 13.7381437 \n",
"151686 13.7381437 \n",
"151687 NaN \n",
"151688 NaN \n",
"151689 7.689014 \n",
"\n",
"[151690 rows x 19 columns]"
] ]
}, },
"execution_count": 14, "execution_count": 125,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1184,14 +845,14 @@
" on = \"handle\",\n", " on = \"handle\",\n",
" how = \"left\")\n", " how = \"left\")\n",
"pd.options.display.max_colwidth = 100\n", "pd.options.display.max_colwidth = 100\n",
"tweets_attention.sort_values('like_count', ascending = False).reset_index()\n", "tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n",
"\n" "\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 42, "execution_count": 90,
"id": "621a3b74-e909-435c-8820-b38b63aa4893", "id": "97952234-7957-421e-bd2c-2c8261992c5a",
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
@ -1311,12 +972,144 @@
"[11559 rows x 3 columns]" "[11559 rows x 3 columns]"
] ]
}, },
"execution_count": 42, "execution_count": 90,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [] "source": [
"old = pd.read_csv(\"data/user_old.tsv\",sep = \"\\t\").rename(columns = {\"id\":\"user_id\",\"name\": \"user_name\"} )\n",
"new = pd.read_csv(\"data/tweets-1679742702794.csv\").rename(columns = {\"username\":\"handle\", \"handle\": \"user_name\"})\n",
"new"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "ed86b45e-9dd8-436d-9c96-15500ed93985",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>count</th>\n",
" </tr>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>223758384</th>\n",
" <th>Polizei Sachsen</th>\n",
" <td>5340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259607457</th>\n",
" <th>Polizei NRW K</th>\n",
" <td>2544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>424895827</th>\n",
" <th>Polizei Stuttgart</th>\n",
" <td>1913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>769128278</th>\n",
" <th>Polizei NRW DO</th>\n",
" <td>4895</td>\n",
" </tr>\n",
" <tr>\n",
" <th>775664780</th>\n",
" <th>Polizei Rostock</th>\n",
" <td>604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1169206134189830145</th>\n",
" <th>Polizei Stendal</th>\n",
" <td>842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1184022676488314880</th>\n",
" <th>Polizei Pforzheim</th>\n",
" <td>283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1184024283342950401</th>\n",
" <th>Polizei Ravensburg</th>\n",
" <td>460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1232548941889228808</th>\n",
" <th>Systemstratege:</th>\n",
" <td>168</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1295978598034284546</th>\n",
" <th>Polizei ZPD NI</th>\n",
" <td>133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>163 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" count\n",
"user_id user_name \n",
"223758384 Polizei Sachsen 5340\n",
"259607457 Polizei NRW K 2544\n",
"424895827 Polizei Stuttgart 1913\n",
"769128278 Polizei NRW DO 4895\n",
"775664780 Polizei Rostock 604\n",
"... ...\n",
"1169206134189830145 Polizei Stendal 842\n",
"1184022676488314880 Polizei Pforzheim 283\n",
"1184024283342950401 Polizei Ravensburg 460\n",
"1232548941889228808 Systemstratege: 168\n",
"1295978598034284546 Polizei ZPD NI 133\n",
"\n",
"[163 rows x 1 columns]"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n",
" )[\"user_id\"].aggregate(['count']\n",
" )"
]
} }
], ],
"metadata": { "metadata": {