From abe05ce248d72f9d7027dba1972c8b163a771630 Mon Sep 17 00:00:00 2001 From: Peter Kannewitz Date: Mon, 27 Mar 2023 21:30:05 +0200 Subject: [PATCH] unifying handles and usernames --- .gitignore | 1 + .../zusammenfassung-checkpoint.ipynb | 682 ++++++++++-------- .../default-37a8.jupyterlab-workspace | 2 +- zusammenfassung.ipynb | 682 ++++++++++-------- 4 files changed, 790 insertions(+), 577 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f5c53959 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.jupyter/ diff --git a/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb b/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb index 3c116a4b..65a91217 100644 --- a/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb +++ b/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2", "metadata": { "tags": [] @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 117, "id": "fcc48831-7999-4d79-b722-736715b1ced6", "metadata": { "tags": [] @@ -46,10 +46,10 @@ { "data": { "text/plain": [ - "((479991, 3), (151690, 8), (151690, 4), (13327, 5))" + "((479991, 3), (151690, 8), (151690, 4), (13327, 3))" ] }, - "execution_count": 119, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -79,6 +79,14 @@ " how = \"outer\",\n", " suffixes = [\"_2021\", \"_2022\"])\n", "\n", + "# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n", + "tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n", + " user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n", + " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n", + "\n", + "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", + " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", + "\n", "tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape" ] }, @@ -92,56 +100,39 @@ }, { "cell_type": "code", - "execution_count": 150, - "id": "cf409591-74a0-48dc-8f9e-66f7229f58cd", + "execution_count": 118, + "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Merge like statistics, tweet text and user information in one data frame\n", + "tweets_combined = pd.merge(tweets_statistics, \n", + " tweets_text,\n", + " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", + " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "text/plain": [ - "tweet_id int64\n", - "like_count int64\n", - "retweet_count int64\n", - "reply_count int64\n", - "quote_count int64\n", - "measured_at object\n", - "is_deleted float64\n", - "tweet_text object\n", - "created_at object\n", - "user_id int64\n", - "user_name_2021 object\n", - "handle_2021 object\n", - "handle_2022 object\n", - "user_name_2022 object\n", - "dtype: object" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tweets_combined = pd.merge(tweets_statistics, \n", - " tweets_text,\n", - " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", - " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", - " \n", - "# Convert Counts to integer values\n", - "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", - "tweets_combined.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "e312a975-3921-44ee-a7c5-37736678bc3f", - "metadata": { - "tags": [] - }, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " output = repr(obj)\n", + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " return method()\n" + ] + }, { "data": { "text/html": [ @@ -163,124 +154,273 @@ " \n", " \n", " \n", + " tweet_id\n", + " like_count\n", + " retweet_count\n", + " reply_count\n", + " quote_count\n", + " measured_at\n", + " is_deleted\n", + " tweet_text\n", + " created_at\n", " user_id\n", " handle\n", - " username\n", + " user_name\n", " \n", " \n", " \n", " \n", " 0\n", - " 1000004686156652545\n", - " 6jannik9\n", - " Systemstratege:\n", + " 1321021123463663616\n", + " 2\n", + " 1\n", + " 2\n", + " 0\n", + " NaT\n", + " NaN\n", + " @mahanna196 Da die Stadt keine Ausnahme für Ra...\n", + " 2020-10-27 09:29:13\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 1\n", - " 1000043230870867969\n", - " lsollik\n", - " Physiolucy\n", + " 1321037834246066181\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " @mahanna196 Ja. *sr\n", + " 2020-10-27 10:35:38\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 2\n", - " 1000405847460151296\n", - " achim1949hans\n", - " Systemstratege:\n", + " 1321068234955776000\n", + " 19\n", + " 3\n", + " 3\n", + " 0\n", + " NaT\n", + " NaN\n", + " #Aktuell Auf dem ehem. Bundeswehrkrankenhausge...\n", + " 2020-10-27 12:36:26\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 3\n", - " 1000460805719121921\n", - " wahrew\n", - " WahreWorte\n", + " 1321073940199100416\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " @Emma36166433 Bitte lesen Sie unseren Tweet 2/...\n", + " 2020-10-27 12:59:06\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 4\n", - " 1000744009638252544\n", - " derd1ck3\n", - " Ⓓ①ⓒⓚ①③ (🏡)\n", + " 1321088646506754049\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " In der vergangenen Woche wurde die Wohnung des...\n", + " 2020-10-27 13:57:32\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 11554\n", - " 99931264\n", - " havok1975\n", - " Systemstratege:\n", + " 151685\n", + " 1625828803804004354\n", + " 5\n", + " 1\n", + " 1\n", + " 0\n", + " 2023-02-19 13:40:36\n", + " False\n", + " #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...\n", + " 2023-02-15 12:06:07\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11555\n", - " 999542638226403328\n", - " madame_de_saxe\n", - " Systemstratege:\n", + " 151686\n", + " 1628004105623900167\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-25 13:14:49\n", + " False\n", + " Unser Präventionsteam vom #A44 berät heute und...\n", + " 2023-02-21 12:10:00\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11556\n", - " 999901133282754560\n", - " tungstendie74\n", - " Systemstratege:\n", + " 151687\n", + " 1628004810183016448\n", + " 6\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-25 13:14:49\n", + " False\n", + " Auch unser #A52 war heute aktiv und hat zum Th...\n", + " 2023-02-21 12:12:48\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11557\n", - " 999904275080794112\n", - " _danielheim\n", - " Systemstratege:\n", + " 151688\n", + " 1628352896352878593\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-26 13:15:05\n", + " False\n", + " Gestern führte unser #A13 in einer Wohnsiedlun...\n", + " 2023-02-22 11:15:58\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11558\n", - " 999955376454930432\n", - " amyman6010\n", - " Systemstratege:\n", + " 151689\n", + " 1628709531998998529\n", + " 10\n", + " 1\n", + " 0\n", + " 0\n", + " 2023-02-27 12:17:33\n", + " False\n", + " Auf dem Gelände der @BUFAStudios (Oberlandstr....\n", + " 2023-02-23 10:53:07\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", "\n", - "

11559 rows × 3 columns

\n", + "

151690 rows × 12 columns

\n", "" ], "text/plain": [ - " user_id handle username\n", - "0 1000004686156652545 6jannik9 Systemstratege: \n", - "1 1000043230870867969 lsollik Physiolucy\n", - "2 1000405847460151296 achim1949hans Systemstratege: \n", - "3 1000460805719121921 wahrew WahreWorte\n", - "4 1000744009638252544 derd1ck3 Ⓓ①ⓒⓚ①③ (🏡)\n", - "... ... ... ...\n", - "11554 99931264 havok1975 Systemstratege: \n", - "11555 999542638226403328 madame_de_saxe Systemstratege: \n", - "11556 999901133282754560 tungstendie74 Systemstratege: \n", - "11557 999904275080794112 _danielheim Systemstratege: \n", - "11558 999955376454930432 amyman6010 Systemstratege: \n", + " tweet_id like_count retweet_count reply_count \\\n", + "0 1321021123463663616 2 1 2 \n", + "1 1321037834246066181 2 0 0 \n", + "2 1321068234955776000 19 3 3 \n", + "3 1321073940199100416 0 0 0 \n", + "4 1321088646506754049 2 0 0 \n", + "... ... ... ... ... \n", + "151685 1625828803804004354 5 1 1 \n", + "151686 1628004105623900167 2 0 0 \n", + "151687 1628004810183016448 6 0 0 \n", + "151688 1628352896352878593 2 0 0 \n", + "151689 1628709531998998529 10 1 0 \n", "\n", - "[11559 rows x 3 columns]" + " quote_count measured_at is_deleted \\\n", + "0 0 NaT NaN \n", + "1 0 NaT NaN \n", + "2 0 NaT NaN \n", + "3 0 NaT NaN \n", + "4 0 NaT NaN \n", + "... ... ... ... \n", + "151685 0 2023-02-19 13:40:36 False \n", + "151686 0 2023-02-25 13:14:49 False \n", + "151687 0 2023-02-25 13:14:49 False \n", + "151688 0 2023-02-26 13:15:05 False \n", + "151689 0 2023-02-27 12:17:33 False \n", + "\n", + " tweet_text created_at \\\n", + "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n", + "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n", + "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n", + "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n", + "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n", + "... ... ... \n", + "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n", + "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n", + "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n", + "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n", + "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n", + "\n", + " user_id handle \\\n", + "0 778895426007203840 polizei_ol \n", + "1 778895426007203840 polizei_ol \n", + "2 778895426007203840 polizei_ol \n", + "3 778895426007203840 polizei_ol \n", + "4 778895426007203840 polizei_ol \n", + "... ... ... \n", + "151685 1168873095614160896 polizeiberlin_p \n", + "151686 1168873095614160896 polizeiberlin_p \n", + "151687 1168873095614160896 polizeiberlin_p \n", + "151688 1168873095614160896 polizeiberlin_p \n", + "151689 1168873095614160896 polizeiberlin_p \n", + "\n", + " user_name \n", + "0 Polizei Oldenburg-Stadt/Ammerland \n", + "1 Polizei Oldenburg-Stadt/Ammerland \n", + "2 Polizei Oldenburg-Stadt/Ammerland \n", + "3 Polizei Oldenburg-Stadt/Ammerland \n", + "4 Polizei Oldenburg-Stadt/Ammerland \n", + "... ... \n", + "151685 Polizei Berlin Prävention \n", + "151686 Polizei Berlin Prävention \n", + "151687 Polizei Berlin Prävention \n", + "151688 Polizei Berlin Prävention \n", + "151689 Polizei Berlin Prävention \n", + "\n", + "[151690 rows x 12 columns]" ] }, - "execution_count": 44, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_meta = pd.read_csv(\"data/tweets.csv\")\n", - "tweets_time = pd.read_csv(\"data/tweets-1679742620302.csv\")\n", - "tweets_text = pd.read_csv(\"data/tweets-1679742698645.csv\")\n", - "tweets_user = pd.read_csv(\"data/tweets-1679742702794.csv\"\n", - " ).rename(columns = {\"username\":\"handle\", # rename columns\n", - " \"handle\": \"username\"})\n", - "tweets_user = tweets_user.assign(handle = tweets_user['handle'].str.lower()) # convert handles to lower case\n", - "tweets_combined = pd.merge(tweets_time, # merge the two tweet related data frames\n", - " tweets_text, \n", - " how = 'inner', \n", - " on = 'tweet_id'\n", - " ).drop(['id'], # drop unascessary id column (redundant to index)\n", - " axis = 1)\n", + "# Convert Counts to integer values\n", + "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", - " created_at = pd.to_datetime(tweets_combined['created_at']))\n", - "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", - " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", - "tweets_user" + " created_at = pd.to_datetime(tweets_combined['created_at']),\n", + " handle = tweets_combined['handle'].str.lower(),\n", + " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n", + "tweets_combined" ] }, { @@ -337,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 112, "id": "0e5eb455-6b12-4572-8f5e-f328a94bd797", "metadata": { "tags": [] @@ -346,13 +486,13 @@ { "data": { "text/plain": [ - "hashtag 157145\n", - "url 88322\n", - "mention 36815\n", + "hashtag 267255\n", + "url 141594\n", + "mention 71142\n", "Name: entity_type, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -369,12 +509,12 @@ "tags": [] }, "source": [ - "Insgesamt haben wir 84794 einzigartige Tweets:" + "Insgesamt haben wir 151690 einzigartige Tweets:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 113, "id": "5a438e7f-8735-40bb-b450-2ce168f0f67a", "metadata": { "tags": [] @@ -383,10 +523,10 @@ { "data": { "text/plain": [ - "84794" + "151690" ] }, - "execution_count": 8, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 114, "id": "4f1e8c6c-3610-436e-899e-4d0307259230", "metadata": { "tags": [] @@ -407,12 +547,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Die Tweets wurden vom 2022-02-24 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 384 Tage.\n" + "Die Tweets wurden vom 2020-10-27 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 870 Tage. (Mit kleinen Unterbrechungen)\n" ] } ], "source": [ - "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage.\")\n", + "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" ] }, @@ -428,9 +568,11 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 122, "id": "9373552e-6baf-46df-ae16-c63603e20a83", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -467,7 +609,7 @@ " \n", " 11\n", " polizei_ffm\n", - " 2993\n", + " 5512\n", " NaN\n", " NaN\n", " NaN\n", @@ -476,20 +618,9 @@ " NaN\n", " \n", " \n", - " 3\n", - " polizei_nrw_do\n", - " 2860\n", - " Polizei NRW DO\n", - " Polizei\n", - " Nordrhein-Westfalen\n", - " Dortmund\n", - " 51.5142273\n", - " 7.4652789\n", - " \n", - " \n", " 0\n", " polizeisachsen\n", - " 2700\n", + " 5340\n", " Polizei Sachsen\n", " Polizei\n", " Sachsen\n", @@ -498,9 +629,20 @@ " 13.7381437\n", " \n", " \n", - " 91\n", + " 3\n", + " polizei_nrw_do\n", + " 4895\n", + " Polizei NRW DO\n", + " Polizei\n", + " Nordrhein-Westfalen\n", + " Dortmund\n", + " 51.5142273\n", + " 7.4652789\n", + " \n", + " \n", + " 92\n", " polizeibb\n", - " 2310\n", + " 4323\n", " NaN\n", " NaN\n", " NaN\n", @@ -511,7 +653,7 @@ " \n", " 61\n", " polizeihamburg\n", - " 2093\n", + " 4042\n", " Polizei Hamburg\n", " Polizei\n", " Hamburg\n", @@ -525,35 +667,32 @@ ], "text/plain": [ " handle count Name Typ Bundesland \\\n", - "11 polizei_ffm 2993 NaN NaN NaN \n", - "3 polizei_nrw_do 2860 Polizei NRW DO Polizei Nordrhein-Westfalen \n", - "0 polizeisachsen 2700 Polizei Sachsen Polizei Sachsen \n", - "91 polizeibb 2310 NaN NaN NaN \n", - "61 polizeihamburg 2093 Polizei Hamburg Polizei Hamburg \n", + "11 polizei_ffm 5512 NaN NaN NaN \n", + "0 polizeisachsen 5340 Polizei Sachsen Polizei Sachsen \n", + "3 polizei_nrw_do 4895 Polizei NRW DO Polizei Nordrhein-Westfalen \n", + "92 polizeibb 4323 NaN NaN NaN \n", + "61 polizeihamburg 4042 Polizei Hamburg Polizei Hamburg \n", "\n", " Stadt LAT LONG \n", "11 NaN NaN NaN \n", - "3 Dortmund 51.5142273 7.4652789 \n", "0 Dresden 51.0493286 13.7381437 \n", - "91 NaN NaN NaN \n", + "3 Dortmund 51.5142273 7.4652789 \n", + "92 NaN NaN NaN \n", "61 Hamburg 53.550341 10.000654 " ] }, - "execution_count": 43, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_agg = tweets_combined.merge(tweets_user,\n", - " on = \"user_id\"\n", - " ).groupby(by = [\"user_id\", \"handle\", \"username\"]\n", - " )[\"user_id\"].aggregate(['count']\n", - " ).merge(police_stations, \n", - " on = \"handle\",\n", - " how = \"left\"\n", - " ).sort_values(['count'], \n", - " ascending=False)\n", + "tweets_agg = tweets_combined.groupby(by = [\"user_id\", \"user_name\", \"handle\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " ).merge(police_stations,\n", + " on = \"handle\",\n", + " how = \"left\"\n", + " ).sort_values(['count'], ascending=False)\n", "tweets_agg.shape\n", "activy_police_vis = tweets_agg[0:50]\n", "activy_police_vis.head()" @@ -571,23 +710,31 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 123, "id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f", "metadata": { "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", + " for col_name, dtype in df.dtypes.iteritems():\n" + ] + }, { "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 47, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } @@ -665,7 +812,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 125, "id": "d0549250-b11f-4762-8500-1134c53303b4", "metadata": { "tags": [] @@ -674,32 +821,29 @@ { "data": { "text/plain": [ - "0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n", - "1 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n", - "2 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n", - "3 Auf unserem #A45 in #lichterfelde) befindet sich gerade diese Fundhündin. Sie wurde am Hindenbur...\n", - "4 @nexta_tv Wir haben das Video gesichert und leiten den Sachverhalt an die zuständigen Kolleginne...\n", - " ... \n", - "84789 #Polizeimeldungen #Tagesticker\\n \\nAnhalt-Bitterfeld\\nhttps://t.co/tNLEzztL1o\\n \\nDessau-Roßlau\\...\n", - "84790 Am Mittwoch erhielten wir mehrere Anrufe über einen auffälligen Pkw-Fahrer (Reifen quietschen un...\n", - "84791 @Jonas5Luisa Kleiner Pro-Tipp von uns: Einfach mal auf den link klicken! ;)*cl\n", - "84792 Vermisstensuche nach 27-Jährigem aus Bendorf-Mühlhofen: Wer hat Tobias Wißmann gesehen? Ein Foto...\n", - "84793 #PolizeiNRW #Köln #Leverkusen : XXX - Infos unter https://t.co/SeWShP2tZE https://t.co/Kopy7w8W3B\n", - "Name: tweet_text, Length: 84794, dtype: object" + "0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n", + "1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n", + "2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n", + "3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n", + "4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n", + " ... \n", + "151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n", + "151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n", + "151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n", + "151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n", + "151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n", + "Name: tweet_text, Length: 151690, dtype: object" ] }, - "execution_count": 90, + "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_attention = tweets_combined.merge(tweets_user,\n", - " on = \"user_id\",\n", - " how = \"left\"\n", - " ).merge(police_stations,\n", - " on = \"handle\",\n", - " how = \"left\")\n", + "tweets_attention = tweets_combined.merge(police_stations,\n", + " on = \"handle\",\n", + " how = \"left\")\n", "pd.options.display.max_colwidth = 100\n", "tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n", "\n" @@ -841,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 121, "id": "ed86b45e-9dd8-436d-9c96-15500ed93985", "metadata": { "tags": [] @@ -868,142 +1012,104 @@ " \n", " \n", " \n", - " like_count\n", - " retweet_count\n", - " reply_count\n", - " quote_count\n", + " \n", + " count\n", + " \n", + " \n", + " user_id\n", + " user_name\n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 2\n", - " 1\n", - " 2\n", - " 0\n", + " 223758384\n", + " Polizei Sachsen\n", + " 5340\n", " \n", " \n", - " 1\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 259607457\n", + " Polizei NRW K\n", + " 2544\n", " \n", " \n", - " 2\n", - " 19\n", - " 3\n", - " 3\n", - " 0\n", + " 424895827\n", + " Polizei Stuttgart\n", + " 1913\n", " \n", " \n", - " 3\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 769128278\n", + " Polizei NRW DO\n", + " 4895\n", " \n", " \n", - " 4\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 775664780\n", + " Polizei Rostock\n", + " 604\n", " \n", " \n", " ...\n", - " ...\n", - " ...\n", - " ...\n", + " ...\n", " ...\n", " \n", " \n", - " 151685\n", - " 5\n", - " 1\n", - " 1\n", - " 0\n", + " 1169206134189830145\n", + " Polizei Stendal\n", + " 842\n", " \n", " \n", - " 151686\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 1184022676488314880\n", + " Polizei Pforzheim\n", + " 283\n", " \n", " \n", - " 151687\n", - " 6\n", - " 0\n", - " 0\n", - " 0\n", + " 1184024283342950401\n", + " Polizei Ravensburg\n", + " 460\n", " \n", " \n", - " 151688\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 1232548941889228808\n", + " Systemstratege:\n", + " 168\n", " \n", " \n", - " 151689\n", - " 10\n", - " 1\n", - " 0\n", - " 0\n", + " 1295978598034284546\n", + " Polizei ZPD NI\n", + " 133\n", " \n", " \n", "\n", - "

151690 rows × 4 columns

\n", + "

163 rows × 1 columns

\n", "" ], "text/plain": [ - " like_count retweet_count reply_count quote_count\n", - "0 2 1 2 0\n", - "1 2 0 0 0\n", - "2 19 3 3 0\n", - "3 0 0 0 0\n", - "4 2 0 0 0\n", - "... ... ... ... ...\n", - "151685 5 1 1 0\n", - "151686 2 0 0 0\n", - "151687 6 0 0 0\n", - "151688 2 0 0 0\n", - "151689 10 1 0 0\n", + " count\n", + "user_id user_name \n", + "223758384 Polizei Sachsen 5340\n", + "259607457 Polizei NRW K 2544\n", + "424895827 Polizei Stuttgart 1913\n", + "769128278 Polizei NRW DO 4895\n", + "775664780 Polizei Rostock 604\n", + "... ...\n", + "1169206134189830145 Polizei Stendal 842\n", + "1184022676488314880 Polizei Pforzheim 283\n", + "1184024283342950401 Polizei Ravensburg 460\n", + "1232548941889228808 Systemstratege: 168\n", + "1295978598034284546 Polizei ZPD NI 133\n", "\n", - "[151690 rows x 4 columns]" + "[163 rows x 1 columns]" ] }, - "execution_count": 148, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "dac4e5fc-22ca-466d-bc3c-586e68696d03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "like_count\n", - "False 147573\n", - "True 4117\n", - "dtype: int64" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] + "source": [ + "tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " )" + ] } ], "metadata": { diff --git a/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace b/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace index 156c50d7..f1f80b52 100644 --- a/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace +++ b/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace @@ -1 +1 @@ -{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":0,"widgets":[]}},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.2676740420939018,0.7323259579060982,0]},"file-browser-filebrowser:cwd":{"path":"data"}},"metadata":{"id":"default"}} \ No newline at end of file +{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:zusammenfassung.ipynb"]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17943235504652827,0.8205676449534718,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}} \ No newline at end of file diff --git a/zusammenfassung.ipynb b/zusammenfassung.ipynb index 3c116a4b..65a91217 100644 --- a/zusammenfassung.ipynb +++ b/zusammenfassung.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2", "metadata": { "tags": [] @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 117, "id": "fcc48831-7999-4d79-b722-736715b1ced6", "metadata": { "tags": [] @@ -46,10 +46,10 @@ { "data": { "text/plain": [ - "((479991, 3), (151690, 8), (151690, 4), (13327, 5))" + "((479991, 3), (151690, 8), (151690, 4), (13327, 3))" ] }, - "execution_count": 119, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -79,6 +79,14 @@ " how = \"outer\",\n", " suffixes = [\"_2021\", \"_2022\"])\n", "\n", + "# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n", + "tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n", + " user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n", + " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n", + "\n", + "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", + " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", + "\n", "tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape" ] }, @@ -92,56 +100,39 @@ }, { "cell_type": "code", - "execution_count": 150, - "id": "cf409591-74a0-48dc-8f9e-66f7229f58cd", + "execution_count": 118, + "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Merge like statistics, tweet text and user information in one data frame\n", + "tweets_combined = pd.merge(tweets_statistics, \n", + " tweets_text,\n", + " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", + " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "text/plain": [ - "tweet_id int64\n", - "like_count int64\n", - "retweet_count int64\n", - "reply_count int64\n", - "quote_count int64\n", - "measured_at object\n", - "is_deleted float64\n", - "tweet_text object\n", - "created_at object\n", - "user_id int64\n", - "user_name_2021 object\n", - "handle_2021 object\n", - "handle_2022 object\n", - "user_name_2022 object\n", - "dtype: object" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tweets_combined = pd.merge(tweets_statistics, \n", - " tweets_text,\n", - " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", - " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", - " \n", - "# Convert Counts to integer values\n", - "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", - "tweets_combined.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "e312a975-3921-44ee-a7c5-37736678bc3f", - "metadata": { - "tags": [] - }, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " output = repr(obj)\n", + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " return method()\n" + ] + }, { "data": { "text/html": [ @@ -163,124 +154,273 @@ " \n", " \n", " \n", + " tweet_id\n", + " like_count\n", + " retweet_count\n", + " reply_count\n", + " quote_count\n", + " measured_at\n", + " is_deleted\n", + " tweet_text\n", + " created_at\n", " user_id\n", " handle\n", - " username\n", + " user_name\n", " \n", " \n", " \n", " \n", " 0\n", - " 1000004686156652545\n", - " 6jannik9\n", - " Systemstratege:\n", + " 1321021123463663616\n", + " 2\n", + " 1\n", + " 2\n", + " 0\n", + " NaT\n", + " NaN\n", + " @mahanna196 Da die Stadt keine Ausnahme für Ra...\n", + " 2020-10-27 09:29:13\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 1\n", - " 1000043230870867969\n", - " lsollik\n", - " Physiolucy\n", + " 1321037834246066181\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " @mahanna196 Ja. *sr\n", + " 2020-10-27 10:35:38\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 2\n", - " 1000405847460151296\n", - " achim1949hans\n", - " Systemstratege:\n", + " 1321068234955776000\n", + " 19\n", + " 3\n", + " 3\n", + " 0\n", + " NaT\n", + " NaN\n", + " #Aktuell Auf dem ehem. Bundeswehrkrankenhausge...\n", + " 2020-10-27 12:36:26\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 3\n", - " 1000460805719121921\n", - " wahrew\n", - " WahreWorte\n", + " 1321073940199100416\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " @Emma36166433 Bitte lesen Sie unseren Tweet 2/...\n", + " 2020-10-27 12:59:06\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " 4\n", - " 1000744009638252544\n", - " derd1ck3\n", - " Ⓓ①ⓒⓚ①③ (🏡)\n", + " 1321088646506754049\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " NaT\n", + " NaN\n", + " In der vergangenen Woche wurde die Wohnung des...\n", + " 2020-10-27 13:57:32\n", + " 778895426007203840\n", + " polizei_ol\n", + " Polizei Oldenburg-Stadt/Ammerland\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 11554\n", - " 99931264\n", - " havok1975\n", - " Systemstratege:\n", + " 151685\n", + " 1625828803804004354\n", + " 5\n", + " 1\n", + " 1\n", + " 0\n", + " 2023-02-19 13:40:36\n", + " False\n", + " #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ...\n", + " 2023-02-15 12:06:07\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11555\n", - " 999542638226403328\n", - " madame_de_saxe\n", - " Systemstratege:\n", + " 151686\n", + " 1628004105623900167\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-25 13:14:49\n", + " False\n", + " Unser Präventionsteam vom #A44 berät heute und...\n", + " 2023-02-21 12:10:00\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11556\n", - " 999901133282754560\n", - " tungstendie74\n", - " Systemstratege:\n", + " 151687\n", + " 1628004810183016448\n", + " 6\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-25 13:14:49\n", + " False\n", + " Auch unser #A52 war heute aktiv und hat zum Th...\n", + " 2023-02-21 12:12:48\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11557\n", - " 999904275080794112\n", - " _danielheim\n", - " Systemstratege:\n", + " 151688\n", + " 1628352896352878593\n", + " 2\n", + " 0\n", + " 0\n", + " 0\n", + " 2023-02-26 13:15:05\n", + " False\n", + " Gestern führte unser #A13 in einer Wohnsiedlun...\n", + " 2023-02-22 11:15:58\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", - " 11558\n", - " 999955376454930432\n", - " amyman6010\n", - " Systemstratege:\n", + " 151689\n", + " 1628709531998998529\n", + " 10\n", + " 1\n", + " 0\n", + " 0\n", + " 2023-02-27 12:17:33\n", + " False\n", + " Auf dem Gelände der @BUFAStudios (Oberlandstr....\n", + " 2023-02-23 10:53:07\n", + " 1168873095614160896\n", + " polizeiberlin_p\n", + " Polizei Berlin Prävention\n", " \n", " \n", "\n", - "

11559 rows × 3 columns

\n", + "

151690 rows × 12 columns

\n", "" ], "text/plain": [ - " user_id handle username\n", - "0 1000004686156652545 6jannik9 Systemstratege: \n", - "1 1000043230870867969 lsollik Physiolucy\n", - "2 1000405847460151296 achim1949hans Systemstratege: \n", - "3 1000460805719121921 wahrew WahreWorte\n", - "4 1000744009638252544 derd1ck3 Ⓓ①ⓒⓚ①③ (🏡)\n", - "... ... ... ...\n", - "11554 99931264 havok1975 Systemstratege: \n", - "11555 999542638226403328 madame_de_saxe Systemstratege: \n", - "11556 999901133282754560 tungstendie74 Systemstratege: \n", - "11557 999904275080794112 _danielheim Systemstratege: \n", - "11558 999955376454930432 amyman6010 Systemstratege: \n", + " tweet_id like_count retweet_count reply_count \\\n", + "0 1321021123463663616 2 1 2 \n", + "1 1321037834246066181 2 0 0 \n", + "2 1321068234955776000 19 3 3 \n", + "3 1321073940199100416 0 0 0 \n", + "4 1321088646506754049 2 0 0 \n", + "... ... ... ... ... \n", + "151685 1625828803804004354 5 1 1 \n", + "151686 1628004105623900167 2 0 0 \n", + "151687 1628004810183016448 6 0 0 \n", + "151688 1628352896352878593 2 0 0 \n", + "151689 1628709531998998529 10 1 0 \n", "\n", - "[11559 rows x 3 columns]" + " quote_count measured_at is_deleted \\\n", + "0 0 NaT NaN \n", + "1 0 NaT NaN \n", + "2 0 NaT NaN \n", + "3 0 NaT NaN \n", + "4 0 NaT NaN \n", + "... ... ... ... \n", + "151685 0 2023-02-19 13:40:36 False \n", + "151686 0 2023-02-25 13:14:49 False \n", + "151687 0 2023-02-25 13:14:49 False \n", + "151688 0 2023-02-26 13:15:05 False \n", + "151689 0 2023-02-27 12:17:33 False \n", + "\n", + " tweet_text created_at \\\n", + "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n", + "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n", + "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n", + "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n", + "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n", + "... ... ... \n", + "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n", + "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n", + "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n", + "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n", + "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n", + "\n", + " user_id handle \\\n", + "0 778895426007203840 polizei_ol \n", + "1 778895426007203840 polizei_ol \n", + "2 778895426007203840 polizei_ol \n", + "3 778895426007203840 polizei_ol \n", + "4 778895426007203840 polizei_ol \n", + "... ... ... \n", + "151685 1168873095614160896 polizeiberlin_p \n", + "151686 1168873095614160896 polizeiberlin_p \n", + "151687 1168873095614160896 polizeiberlin_p \n", + "151688 1168873095614160896 polizeiberlin_p \n", + "151689 1168873095614160896 polizeiberlin_p \n", + "\n", + " user_name \n", + "0 Polizei Oldenburg-Stadt/Ammerland \n", + "1 Polizei Oldenburg-Stadt/Ammerland \n", + "2 Polizei Oldenburg-Stadt/Ammerland \n", + "3 Polizei Oldenburg-Stadt/Ammerland \n", + "4 Polizei Oldenburg-Stadt/Ammerland \n", + "... ... \n", + "151685 Polizei Berlin Prävention \n", + "151686 Polizei Berlin Prävention \n", + "151687 Polizei Berlin Prävention \n", + "151688 Polizei Berlin Prävention \n", + "151689 Polizei Berlin Prävention \n", + "\n", + "[151690 rows x 12 columns]" ] }, - "execution_count": 44, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_meta = pd.read_csv(\"data/tweets.csv\")\n", - "tweets_time = pd.read_csv(\"data/tweets-1679742620302.csv\")\n", - "tweets_text = pd.read_csv(\"data/tweets-1679742698645.csv\")\n", - "tweets_user = pd.read_csv(\"data/tweets-1679742702794.csv\"\n", - " ).rename(columns = {\"username\":\"handle\", # rename columns\n", - " \"handle\": \"username\"})\n", - "tweets_user = tweets_user.assign(handle = tweets_user['handle'].str.lower()) # convert handles to lower case\n", - "tweets_combined = pd.merge(tweets_time, # merge the two tweet related data frames\n", - " tweets_text, \n", - " how = 'inner', \n", - " on = 'tweet_id'\n", - " ).drop(['id'], # drop unascessary id column (redundant to index)\n", - " axis = 1)\n", + "# Convert Counts to integer values\n", + "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", - " created_at = pd.to_datetime(tweets_combined['created_at']))\n", - "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", - " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", - "tweets_user" + " created_at = pd.to_datetime(tweets_combined['created_at']),\n", + " handle = tweets_combined['handle'].str.lower(),\n", + " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n", + "tweets_combined" ] }, { @@ -337,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 112, "id": "0e5eb455-6b12-4572-8f5e-f328a94bd797", "metadata": { "tags": [] @@ -346,13 +486,13 @@ { "data": { "text/plain": [ - "hashtag 157145\n", - "url 88322\n", - "mention 36815\n", + "hashtag 267255\n", + "url 141594\n", + "mention 71142\n", "Name: entity_type, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -369,12 +509,12 @@ "tags": [] }, "source": [ - "Insgesamt haben wir 84794 einzigartige Tweets:" + "Insgesamt haben wir 151690 einzigartige Tweets:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 113, "id": "5a438e7f-8735-40bb-b450-2ce168f0f67a", "metadata": { "tags": [] @@ -383,10 +523,10 @@ { "data": { "text/plain": [ - "84794" + "151690" ] }, - "execution_count": 8, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 114, "id": "4f1e8c6c-3610-436e-899e-4d0307259230", "metadata": { "tags": [] @@ -407,12 +547,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Die Tweets wurden vom 2022-02-24 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 384 Tage.\n" + "Die Tweets wurden vom 2020-10-27 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 870 Tage. (Mit kleinen Unterbrechungen)\n" ] } ], "source": [ - "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage.\")\n", + "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" ] }, @@ -428,9 +568,11 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 122, "id": "9373552e-6baf-46df-ae16-c63603e20a83", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -467,7 +609,7 @@ " \n", " 11\n", " polizei_ffm\n", - " 2993\n", + " 5512\n", " NaN\n", " NaN\n", " NaN\n", @@ -476,20 +618,9 @@ " NaN\n", " \n", " \n", - " 3\n", - " polizei_nrw_do\n", - " 2860\n", - " Polizei NRW DO\n", - " Polizei\n", - " Nordrhein-Westfalen\n", - " Dortmund\n", - " 51.5142273\n", - " 7.4652789\n", - " \n", - " \n", " 0\n", " polizeisachsen\n", - " 2700\n", + " 5340\n", " Polizei Sachsen\n", " Polizei\n", " Sachsen\n", @@ -498,9 +629,20 @@ " 13.7381437\n", " \n", " \n", - " 91\n", + " 3\n", + " polizei_nrw_do\n", + " 4895\n", + " Polizei NRW DO\n", + " Polizei\n", + " Nordrhein-Westfalen\n", + " Dortmund\n", + " 51.5142273\n", + " 7.4652789\n", + " \n", + " \n", + " 92\n", " polizeibb\n", - " 2310\n", + " 4323\n", " NaN\n", " NaN\n", " NaN\n", @@ -511,7 +653,7 @@ " \n", " 61\n", " polizeihamburg\n", - " 2093\n", + " 4042\n", " Polizei Hamburg\n", " Polizei\n", " Hamburg\n", @@ -525,35 +667,32 @@ ], "text/plain": [ " handle count Name Typ Bundesland \\\n", - "11 polizei_ffm 2993 NaN NaN NaN \n", - "3 polizei_nrw_do 2860 Polizei NRW DO Polizei Nordrhein-Westfalen \n", - "0 polizeisachsen 2700 Polizei Sachsen Polizei Sachsen \n", - "91 polizeibb 2310 NaN NaN NaN \n", - "61 polizeihamburg 2093 Polizei Hamburg Polizei Hamburg \n", + "11 polizei_ffm 5512 NaN NaN NaN \n", + "0 polizeisachsen 5340 Polizei Sachsen Polizei Sachsen \n", + "3 polizei_nrw_do 4895 Polizei NRW DO Polizei Nordrhein-Westfalen \n", + "92 polizeibb 4323 NaN NaN NaN \n", + "61 polizeihamburg 4042 Polizei Hamburg Polizei Hamburg \n", "\n", " Stadt LAT LONG \n", "11 NaN NaN NaN \n", - "3 Dortmund 51.5142273 7.4652789 \n", "0 Dresden 51.0493286 13.7381437 \n", - "91 NaN NaN NaN \n", + "3 Dortmund 51.5142273 7.4652789 \n", + "92 NaN NaN NaN \n", "61 Hamburg 53.550341 10.000654 " ] }, - "execution_count": 43, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_agg = tweets_combined.merge(tweets_user,\n", - " on = \"user_id\"\n", - " ).groupby(by = [\"user_id\", \"handle\", \"username\"]\n", - " )[\"user_id\"].aggregate(['count']\n", - " ).merge(police_stations, \n", - " on = \"handle\",\n", - " how = \"left\"\n", - " ).sort_values(['count'], \n", - " ascending=False)\n", + "tweets_agg = tweets_combined.groupby(by = [\"user_id\", \"user_name\", \"handle\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " ).merge(police_stations,\n", + " on = \"handle\",\n", + " how = \"left\"\n", + " ).sort_values(['count'], ascending=False)\n", "tweets_agg.shape\n", "activy_police_vis = tweets_agg[0:50]\n", "activy_police_vis.head()" @@ -571,23 +710,31 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 123, "id": "b1c39196-d1cc-4f82-8e01-7529e7b3046f", "metadata": { "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", + " for col_name, dtype in df.dtypes.iteritems():\n" + ] + }, { "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 47, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } @@ -665,7 +812,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 125, "id": "d0549250-b11f-4762-8500-1134c53303b4", "metadata": { "tags": [] @@ -674,32 +821,29 @@ { "data": { "text/plain": [ - "0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n", - "1 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n", - "2 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n", - "3 Auf unserem #A45 in #lichterfelde) befindet sich gerade diese Fundhündin. Sie wurde am Hindenbur...\n", - "4 @nexta_tv Wir haben das Video gesichert und leiten den Sachverhalt an die zuständigen Kolleginne...\n", - " ... \n", - "84789 #Polizeimeldungen #Tagesticker\\n \\nAnhalt-Bitterfeld\\nhttps://t.co/tNLEzztL1o\\n \\nDessau-Roßlau\\...\n", - "84790 Am Mittwoch erhielten wir mehrere Anrufe über einen auffälligen Pkw-Fahrer (Reifen quietschen un...\n", - "84791 @Jonas5Luisa Kleiner Pro-Tipp von uns: Einfach mal auf den link klicken! ;)*cl\n", - "84792 Vermisstensuche nach 27-Jährigem aus Bendorf-Mühlhofen: Wer hat Tobias Wißmann gesehen? Ein Foto...\n", - "84793 #PolizeiNRW #Köln #Leverkusen : XXX - Infos unter https://t.co/SeWShP2tZE https://t.co/Kopy7w8W3B\n", - "Name: tweet_text, Length: 84794, dtype: object" + "0 Die Gewalt, die unsere Kolleginnen & Kollegen in der Silvesternacht erleben mussten, ist une...\n", + "1 An diejenigen, die vergangene Nacht in eine Schule in #Gesundbrunnen eingebrochen sind und 242 T...\n", + "2 WICHTIGE Info:\\nÜber das Internet wird derzeit ein Video verbreitet, in dem von einem Überfall a...\n", + "3 Die Experten gehen derzeit davon aus, dass es sich um ein absichtliches \"Fake-Video\" handelt, da...\n", + "4 Weil wir dich schieben! @BVG_Kampagne 😉 https://t.co/N8kdlCxhz2\n", + " ... \n", + "151685 Sinken die Temperaturen ❄, steigt zeitgleich das Risiko für Verkehrsteilnehmer. Höchste Zeit zu ...\n", + "151686 📺Am Sonntag, um 19:50 Uhr, geht es bei #KripoLive im \\n@mdrde\\n auch um die Fahndung nach einem ...\n", + "151687 Musik verbindet!\\nUnser #Adventskalender der #Bundespolizei startet morgen ➡ https://t.co/V6CaTV...\n", + "151688 @gretchen_hann Hallo, diese Frage kann die Bundespolizei Spezialkräfte besser beantworten. Richt...\n", + "151689 #Bönen #Holzwickede - Verstöße gegen Coronaschutzverordnung: Polizei löst Gaststättenabend und F...\n", + "Name: tweet_text, Length: 151690, dtype: object" ] }, - "execution_count": 90, + "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_attention = tweets_combined.merge(tweets_user,\n", - " on = \"user_id\",\n", - " how = \"left\"\n", - " ).merge(police_stations,\n", - " on = \"handle\",\n", - " how = \"left\")\n", + "tweets_attention = tweets_combined.merge(police_stations,\n", + " on = \"handle\",\n", + " how = \"left\")\n", "pd.options.display.max_colwidth = 100\n", "tweets_attention.sort_values('like_count', ascending = False).reset_index()['tweet_text']\n", "\n" @@ -841,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 121, "id": "ed86b45e-9dd8-436d-9c96-15500ed93985", "metadata": { "tags": [] @@ -868,142 +1012,104 @@ " \n", " \n", " \n", - " like_count\n", - " retweet_count\n", - " reply_count\n", - " quote_count\n", + " \n", + " count\n", + " \n", + " \n", + " user_id\n", + " user_name\n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 2\n", - " 1\n", - " 2\n", - " 0\n", + " 223758384\n", + " Polizei Sachsen\n", + " 5340\n", " \n", " \n", - " 1\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 259607457\n", + " Polizei NRW K\n", + " 2544\n", " \n", " \n", - " 2\n", - " 19\n", - " 3\n", - " 3\n", - " 0\n", + " 424895827\n", + " Polizei Stuttgart\n", + " 1913\n", " \n", " \n", - " 3\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 769128278\n", + " Polizei NRW DO\n", + " 4895\n", " \n", " \n", - " 4\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 775664780\n", + " Polizei Rostock\n", + " 604\n", " \n", " \n", " ...\n", - " ...\n", - " ...\n", - " ...\n", + " ...\n", " ...\n", " \n", " \n", - " 151685\n", - " 5\n", - " 1\n", - " 1\n", - " 0\n", + " 1169206134189830145\n", + " Polizei Stendal\n", + " 842\n", " \n", " \n", - " 151686\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 1184022676488314880\n", + " Polizei Pforzheim\n", + " 283\n", " \n", " \n", - " 151687\n", - " 6\n", - " 0\n", - " 0\n", - " 0\n", + " 1184024283342950401\n", + " Polizei Ravensburg\n", + " 460\n", " \n", " \n", - " 151688\n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", + " 1232548941889228808\n", + " Systemstratege:\n", + " 168\n", " \n", " \n", - " 151689\n", - " 10\n", - " 1\n", - " 0\n", - " 0\n", + " 1295978598034284546\n", + " Polizei ZPD NI\n", + " 133\n", " \n", " \n", "\n", - "

151690 rows × 4 columns

\n", + "

163 rows × 1 columns

\n", "" ], "text/plain": [ - " like_count retweet_count reply_count quote_count\n", - "0 2 1 2 0\n", - "1 2 0 0 0\n", - "2 19 3 3 0\n", - "3 0 0 0 0\n", - "4 2 0 0 0\n", - "... ... ... ... ...\n", - "151685 5 1 1 0\n", - "151686 2 0 0 0\n", - "151687 6 0 0 0\n", - "151688 2 0 0 0\n", - "151689 10 1 0 0\n", + " count\n", + "user_id user_name \n", + "223758384 Polizei Sachsen 5340\n", + "259607457 Polizei NRW K 2544\n", + "424895827 Polizei Stuttgart 1913\n", + "769128278 Polizei NRW DO 4895\n", + "775664780 Polizei Rostock 604\n", + "... ...\n", + "1169206134189830145 Polizei Stendal 842\n", + "1184022676488314880 Polizei Pforzheim 283\n", + "1184024283342950401 Polizei Ravensburg 460\n", + "1232548941889228808 Systemstratege: 168\n", + "1295978598034284546 Polizei ZPD NI 133\n", "\n", - "[151690 rows x 4 columns]" + "[163 rows x 1 columns]" ] }, - "execution_count": 148, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "dac4e5fc-22ca-466d-bc3c-586e68696d03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "like_count\n", - "False 147573\n", - "True 4117\n", - "dtype: int64" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] + "source": [ + "tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " )" + ] } ], "metadata": {