diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f5c53959
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.jupyter/
diff --git a/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb b/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb
index 3c116a4b..65a91217 100644
--- a/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb
+++ b/.ipynb_checkpoints/zusammenfassung-checkpoint.ipynb
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2",
"metadata": {
"tags": []
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 117,
"id": "fcc48831-7999-4d79-b722-736715b1ced6",
"metadata": {
"tags": []
@@ -46,10 +46,10 @@
{
"data": {
"text/plain": [
- "((479991, 3), (151690, 8), (151690, 4), (13327, 5))"
+ "((479991, 3), (151690, 8), (151690, 4), (13327, 3))"
]
},
- "execution_count": 119,
+ "execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
@@ -79,6 +79,14 @@
" how = \"outer\",\n",
" suffixes = [\"_2021\", \"_2022\"])\n",
"\n",
+ "# The handle and user name of a user_id can change over time. For easier handling only the latest of each is kept: the 2022 value, falling back to the 2021 value where the 2022 one is missing\n",
+ "tweets_user = tweets_user.assign(handle = tweets_user['handle_2022'].fillna(tweets_user['handle_2021']), # vectorized coalesce instead of a row-wise apply\n",
+ " user_name = tweets_user['user_name_2022'].fillna(tweets_user['user_name_2021'])\n",
+ " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1) # the year-suffixed source columns are no longer needed\n",
+ "\n",
+ "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # additional information on police stations (tab-separated file)\n",
+ " ).rename(columns = {\"Polizei Account\": \"handle\"}) # name the account column 'handle', as in tweets_user\n",
+ "\n",
"tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape"
]
},
@@ -92,56 +100,39 @@
},
{
"cell_type": "code",
- "execution_count": 150,
- "id": "cf409591-74a0-48dc-8f9e-66f7229f58cd",
+ "execution_count": 118,
+ "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Combine the tweet statistics, the tweet texts and the user information into a single data frame\n",
+ "tweets_combined = (tweets_statistics\n",
+ " .merge(tweets_text, on = 'tweet_id') # one row per tweet: statistics + text\n",
+ " .merge(tweets_user, on = 'user_id') # attach handle and user name of the author\n",
+ " .drop(columns = ['id'])) # drop the unnecessary id column (redundant to the index)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d",
"metadata": {
"tags": []
},
"outputs": [
{
- "data": {
- "text/plain": [
- "tweet_id int64\n",
- "like_count int64\n",
- "retweet_count int64\n",
- "reply_count int64\n",
- "quote_count int64\n",
- "measured_at object\n",
- "is_deleted float64\n",
- "tweet_text object\n",
- "created_at object\n",
- "user_id int64\n",
- "user_name_2021 object\n",
- "handle_2021 object\n",
- "handle_2022 object\n",
- "user_name_2022 object\n",
- "dtype: object"
- ]
- },
- "execution_count": 150,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tweets_combined = pd.merge(tweets_statistics, \n",
- " tweets_text,\n",
- " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n",
- " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n",
- " \n",
- "# Convert Counts to integer values\n",
- "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n",
- "tweets_combined.dtypes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "e312a975-3921-44ee-a7c5-37736678bc3f",
- "metadata": {
- "tags": []
- },
- "outputs": [
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
+ " output = repr(obj)\n",
+ "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n",
+ " return method()\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -163,124 +154,273 @@
" \n",
" \n",
" \n",
" \n",
" \n",
+ " tweet_id \n",
+ " like_count \n",
+ " retweet_count \n",
+ " reply_count \n",
+ " quote_count \n",
+ " measured_at \n",
+ " is_deleted \n",
+ " tweet_text \n",
+ " created_at \n",
" user_id \n",
" handle \n",
- " username \n",
+ " user_name \n",
"
11559 rows × 3 columns
\n", + "151690 rows × 12 columns
\n", "" ], "text/plain": [ - " user_id handle username\n", - "0 1000004686156652545 6jannik9 Systemstratege: \n", - "1 1000043230870867969 lsollik Physiolucy\n", - "2 1000405847460151296 achim1949hans Systemstratege: \n", - "3 1000460805719121921 wahrew WahreWorte\n", - "4 1000744009638252544 derd1ck3 Ⓓ①ⓒⓚ①③ (🏡)\n", - "... ... ... ...\n", - "11554 99931264 havok1975 Systemstratege: \n", - "11555 999542638226403328 madame_de_saxe Systemstratege: \n", - "11556 999901133282754560 tungstendie74 Systemstratege: \n", - "11557 999904275080794112 _danielheim Systemstratege: \n", - "11558 999955376454930432 amyman6010 Systemstratege: \n", + " tweet_id like_count retweet_count reply_count \\\n", + "0 1321021123463663616 2 1 2 \n", + "1 1321037834246066181 2 0 0 \n", + "2 1321068234955776000 19 3 3 \n", + "3 1321073940199100416 0 0 0 \n", + "4 1321088646506754049 2 0 0 \n", + "... ... ... ... ... \n", + "151685 1625828803804004354 5 1 1 \n", + "151686 1628004105623900167 2 0 0 \n", + "151687 1628004810183016448 6 0 0 \n", + "151688 1628352896352878593 2 0 0 \n", + "151689 1628709531998998529 10 1 0 \n", "\n", - "[11559 rows x 3 columns]" + " quote_count measured_at is_deleted \\\n", + "0 0 NaT NaN \n", + "1 0 NaT NaN \n", + "2 0 NaT NaN \n", + "3 0 NaT NaN \n", + "4 0 NaT NaN \n", + "... ... ... ... \n", + "151685 0 2023-02-19 13:40:36 False \n", + "151686 0 2023-02-25 13:14:49 False \n", + "151687 0 2023-02-25 13:14:49 False \n", + "151688 0 2023-02-26 13:15:05 False \n", + "151689 0 2023-02-27 12:17:33 False \n", + "\n", + " tweet_text created_at \\\n", + "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n", + "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n", + "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n", + "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n", + "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n", + "... ... ... 
\n", + "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n", + "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n", + "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n", + "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n", + "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n", + "\n", + " user_id handle \\\n", + "0 778895426007203840 polizei_ol \n", + "1 778895426007203840 polizei_ol \n", + "2 778895426007203840 polizei_ol \n", + "3 778895426007203840 polizei_ol \n", + "4 778895426007203840 polizei_ol \n", + "... ... ... \n", + "151685 1168873095614160896 polizeiberlin_p \n", + "151686 1168873095614160896 polizeiberlin_p \n", + "151687 1168873095614160896 polizeiberlin_p \n", + "151688 1168873095614160896 polizeiberlin_p \n", + "151689 1168873095614160896 polizeiberlin_p \n", + "\n", + " user_name \n", + "0 Polizei Oldenburg-Stadt/Ammerland \n", + "1 Polizei Oldenburg-Stadt/Ammerland \n", + "2 Polizei Oldenburg-Stadt/Ammerland \n", + "3 Polizei Oldenburg-Stadt/Ammerland \n", + "4 Polizei Oldenburg-Stadt/Ammerland \n", + "... ... 
\n", + "151685 Polizei Berlin Prävention \n", + "151686 Polizei Berlin Prävention \n", + "151687 Polizei Berlin Prävention \n", + "151688 Polizei Berlin Prävention \n", + "151689 Polizei Berlin Prävention \n", + "\n", + "[151690 rows x 12 columns]" ] }, - "execution_count": 44, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_meta = pd.read_csv(\"data/tweets.csv\")\n", - "tweets_time = pd.read_csv(\"data/tweets-1679742620302.csv\")\n", - "tweets_text = pd.read_csv(\"data/tweets-1679742698645.csv\")\n", - "tweets_user = pd.read_csv(\"data/tweets-1679742702794.csv\"\n", - " ).rename(columns = {\"username\":\"handle\", # rename columns\n", - " \"handle\": \"username\"})\n", - "tweets_user = tweets_user.assign(handle = tweets_user['handle'].str.lower()) # convert handles to lower case\n", - "tweets_combined = pd.merge(tweets_time, # merge the two tweet related data frames\n", - " tweets_text, \n", - " how = 'inner', \n", - " on = 'tweet_id'\n", - " ).drop(['id'], # drop unascessary id column (redundant to index)\n", - " axis = 1)\n", + "# Convert Counts to integer values\n", + "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", - " created_at = pd.to_datetime(tweets_combined['created_at']))\n", - "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", - " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", - "tweets_user" + " created_at = pd.to_datetime(tweets_combined['created_at']),\n", + " handle = tweets_combined['handle'].str.lower(),\n", + " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n", + "tweets_combined" 
] }, { @@ -337,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 112, "id": "0e5eb455-6b12-4572-8f5e-f328a94bd797", "metadata": { "tags": [] @@ -346,13 +486,13 @@ { "data": { "text/plain": [ - "hashtag 157145\n", - "url 88322\n", - "mention 36815\n", + "hashtag 267255\n", + "url 141594\n", + "mention 71142\n", "Name: entity_type, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -369,12 +509,12 @@ "tags": [] }, "source": [ - "Insgesamt haben wir 84794 einzigartige Tweets:" + "Insgesamt haben wir 151690 einzigartige Tweets:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 113, "id": "5a438e7f-8735-40bb-b450-2ce168f0f67a", "metadata": { "tags": [] @@ -383,10 +523,10 @@ { "data": { "text/plain": [ - "84794" + "151690" ] }, - "execution_count": 8, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 114, "id": "4f1e8c6c-3610-436e-899e-4d0307259230", "metadata": { "tags": [] @@ -407,12 +547,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Die Tweets wurden vom 2022-02-24 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 384 Tage.\n" + "Die Tweets wurden vom 2020-10-27 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 870 Tage. 
(Mit kleinen Unterbrechungen)\n" ] } ], "source": [ - "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage.\")\n", + "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" ] }, @@ -428,9 +568,11 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 122, "id": "9373552e-6baf-46df-ae16-c63603e20a83", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -467,7 +609,7 @@ "151690 rows × 4 columns
\n", + "163 rows × 1 columns
\n", "" ], "text/plain": [ - " like_count retweet_count reply_count quote_count\n", - "0 2 1 2 0\n", - "1 2 0 0 0\n", - "2 19 3 3 0\n", - "3 0 0 0 0\n", - "4 2 0 0 0\n", - "... ... ... ... ...\n", - "151685 5 1 1 0\n", - "151686 2 0 0 0\n", - "151687 6 0 0 0\n", - "151688 2 0 0 0\n", - "151689 10 1 0 0\n", + " count\n", + "user_id user_name \n", + "223758384 Polizei Sachsen 5340\n", + "259607457 Polizei NRW K 2544\n", + "424895827 Polizei Stuttgart 1913\n", + "769128278 Polizei NRW DO 4895\n", + "775664780 Polizei Rostock 604\n", + "... ...\n", + "1169206134189830145 Polizei Stendal 842\n", + "1184022676488314880 Polizei Pforzheim 283\n", + "1184024283342950401 Polizei Ravensburg 460\n", + "1232548941889228808 Systemstratege: 168\n", + "1295978598034284546 Polizei ZPD NI 133\n", "\n", - "[151690 rows x 4 columns]" + "[163 rows x 1 columns]" ] }, - "execution_count": 148, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "dac4e5fc-22ca-466d-bc3c-586e68696d03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "like_count\n", - "False 147573\n", - "True 4117\n", - "dtype: int64" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] + "source": [ + "tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " )" + ] } ], "metadata": { diff --git a/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace b/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace index 156c50d7..f1f80b52 100644 --- a/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace +++ b/.jupyter/lab/workspaces/default-37a8.jupyterlab-workspace @@ -1 +1 @@ 
-{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":0,"widgets":[]}},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.2676740420939018,0.7323259579060982,0]},"file-browser-filebrowser:cwd":{"path":"data"}},"metadata":{"id":"default"}} \ No newline at end of file +{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:zusammenfassung.ipynb"]},"current":"notebook:zusammenfassung.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17943235504652827,0.8205676449534718,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:zusammenfassung.ipynb":{"data":{"path":"zusammenfassung.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}} \ No newline at end of file diff --git a/zusammenfassung.ipynb b/zusammenfassung.ipynb index 3c116a4b..65a91217 100644 --- a/zusammenfassung.ipynb +++ b/zusammenfassung.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9bd1686f-9bbc-4c05-a5f5-e0c4ce653fb2", "metadata": { "tags": [] @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 117, "id": "fcc48831-7999-4d79-b722-736715b1ced6", "metadata": { "tags": [] @@ -46,10 +46,10 @@ { "data": { "text/plain": [ - "((479991, 3), (151690, 8), (151690, 4), (13327, 5))" + "((479991, 3), (151690, 8), (151690, 4), (13327, 3))" ] }, - "execution_count": 119, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -79,6 +79,14 @@ " how = 
\"outer\",\n", " suffixes = [\"_2021\", \"_2022\"])\n", "\n", + "# Some usernames corresponding to one user_id have changed overtime. For easier handling only the latest username and handle is kept\n", + "tweets_user = tweets_user.assign(handle = tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),\n", + " user_name = tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)\n", + " ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis =1)\n", + "\n", + "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", + " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", + "\n", "tweets_meta.shape, tweets_statistics.shape, tweets_text.shape, tweets_user.shape" ] }, @@ -92,56 +100,39 @@ }, { "cell_type": "code", - "execution_count": 150, - "id": "cf409591-74a0-48dc-8f9e-66f7229f58cd", + "execution_count": 118, + "id": "f30c2799-02c6-4e6a-ae36-9e039545b6b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Merge like statistics, tweet text and user information in one data frame\n", + "tweets_combined = pd.merge(tweets_statistics, \n", + " tweets_text,\n", + " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", + " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "bd407aba-eec1-41ed-bff9-4c5fcdf6cb9d", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "text/plain": [ - "tweet_id int64\n", - "like_count int64\n", - "retweet_count int64\n", - "reply_count int64\n", - "quote_count int64\n", - "measured_at object\n", - "is_deleted float64\n", - "tweet_text object\n", - "created_at object\n", - "user_id int64\n", - "user_name_2021 object\n", - "handle_2021 object\n", - "handle_2022 object\n", - "user_name_2022 object\n", - "dtype: 
object" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tweets_combined = pd.merge(tweets_statistics, \n", - " tweets_text,\n", - " on = 'tweet_id').merge(tweets_user, on = 'user_id'\n", - " ).drop(['id'], axis = 1) # drop unascessary id column (redundant to index)\n", - " \n", - "# Convert Counts to integer values\n", - "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", - "tweets_combined.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "e312a975-3921-44ee-a7c5-37736678bc3f", - "metadata": { - "tags": [] - }, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/lib/pretty.py:778: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " output = repr(obj)\n", + "/nix/store/4105l1v2llsjz4j7qaqsz0fljc9z0z2r-python3-3.10.9-env/lib/python3.10/site-packages/IPython/core/formatters.py:342: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.\n", + " return method()\n" + ] + }, { "data": { "text/html": [ @@ -163,124 +154,273 @@ " \n", "11559 rows × 3 columns
\n", + "151690 rows × 12 columns
\n", "" ], "text/plain": [ - " user_id handle username\n", - "0 1000004686156652545 6jannik9 Systemstratege: \n", - "1 1000043230870867969 lsollik Physiolucy\n", - "2 1000405847460151296 achim1949hans Systemstratege: \n", - "3 1000460805719121921 wahrew WahreWorte\n", - "4 1000744009638252544 derd1ck3 Ⓓ①ⓒⓚ①③ (🏡)\n", - "... ... ... ...\n", - "11554 99931264 havok1975 Systemstratege: \n", - "11555 999542638226403328 madame_de_saxe Systemstratege: \n", - "11556 999901133282754560 tungstendie74 Systemstratege: \n", - "11557 999904275080794112 _danielheim Systemstratege: \n", - "11558 999955376454930432 amyman6010 Systemstratege: \n", + " tweet_id like_count retweet_count reply_count \\\n", + "0 1321021123463663616 2 1 2 \n", + "1 1321037834246066181 2 0 0 \n", + "2 1321068234955776000 19 3 3 \n", + "3 1321073940199100416 0 0 0 \n", + "4 1321088646506754049 2 0 0 \n", + "... ... ... ... ... \n", + "151685 1625828803804004354 5 1 1 \n", + "151686 1628004105623900167 2 0 0 \n", + "151687 1628004810183016448 6 0 0 \n", + "151688 1628352896352878593 2 0 0 \n", + "151689 1628709531998998529 10 1 0 \n", "\n", - "[11559 rows x 3 columns]" + " quote_count measured_at is_deleted \\\n", + "0 0 NaT NaN \n", + "1 0 NaT NaN \n", + "2 0 NaT NaN \n", + "3 0 NaT NaN \n", + "4 0 NaT NaN \n", + "... ... ... ... \n", + "151685 0 2023-02-19 13:40:36 False \n", + "151686 0 2023-02-25 13:14:49 False \n", + "151687 0 2023-02-25 13:14:49 False \n", + "151688 0 2023-02-26 13:15:05 False \n", + "151689 0 2023-02-27 12:17:33 False \n", + "\n", + " tweet_text created_at \\\n", + "0 @mahanna196 Da die Stadt keine Ausnahme für Ra... 2020-10-27 09:29:13 \n", + "1 @mahanna196 Ja. *sr 2020-10-27 10:35:38 \n", + "2 #Aktuell Auf dem ehem. Bundeswehrkrankenhausge... 2020-10-27 12:36:26 \n", + "3 @Emma36166433 Bitte lesen Sie unseren Tweet 2/... 2020-10-27 12:59:06 \n", + "4 In der vergangenen Woche wurde die Wohnung des... 2020-10-27 13:57:32 \n", + "... ... ... 
\n", + "151685 #Sicherheit durch #Sichtbarkeit\\nUnsere #Dir3 ... 2023-02-15 12:06:07 \n", + "151686 Unser Präventionsteam vom #A44 berät heute und... 2023-02-21 12:10:00 \n", + "151687 Auch unser #A52 war heute aktiv und hat zum Th... 2023-02-21 12:12:48 \n", + "151688 Gestern führte unser #A13 in einer Wohnsiedlun... 2023-02-22 11:15:58 \n", + "151689 Auf dem Gelände der @BUFAStudios (Oberlandstr.... 2023-02-23 10:53:07 \n", + "\n", + " user_id handle \\\n", + "0 778895426007203840 polizei_ol \n", + "1 778895426007203840 polizei_ol \n", + "2 778895426007203840 polizei_ol \n", + "3 778895426007203840 polizei_ol \n", + "4 778895426007203840 polizei_ol \n", + "... ... ... \n", + "151685 1168873095614160896 polizeiberlin_p \n", + "151686 1168873095614160896 polizeiberlin_p \n", + "151687 1168873095614160896 polizeiberlin_p \n", + "151688 1168873095614160896 polizeiberlin_p \n", + "151689 1168873095614160896 polizeiberlin_p \n", + "\n", + " user_name \n", + "0 Polizei Oldenburg-Stadt/Ammerland \n", + "1 Polizei Oldenburg-Stadt/Ammerland \n", + "2 Polizei Oldenburg-Stadt/Ammerland \n", + "3 Polizei Oldenburg-Stadt/Ammerland \n", + "4 Polizei Oldenburg-Stadt/Ammerland \n", + "... ... 
\n", + "151685 Polizei Berlin Prävention \n", + "151686 Polizei Berlin Prävention \n", + "151687 Polizei Berlin Prävention \n", + "151688 Polizei Berlin Prävention \n", + "151689 Polizei Berlin Prävention \n", + "\n", + "[151690 rows x 12 columns]" ] }, - "execution_count": 44, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tweets_meta = pd.read_csv(\"data/tweets.csv\")\n", - "tweets_time = pd.read_csv(\"data/tweets-1679742620302.csv\")\n", - "tweets_text = pd.read_csv(\"data/tweets-1679742698645.csv\")\n", - "tweets_user = pd.read_csv(\"data/tweets-1679742702794.csv\"\n", - " ).rename(columns = {\"username\":\"handle\", # rename columns\n", - " \"handle\": \"username\"})\n", - "tweets_user = tweets_user.assign(handle = tweets_user['handle'].str.lower()) # convert handles to lower case\n", - "tweets_combined = pd.merge(tweets_time, # merge the two tweet related data frames\n", - " tweets_text, \n", - " how = 'inner', \n", - " on = 'tweet_id'\n", - " ).drop(['id'], # drop unascessary id column (redundant to index)\n", - " axis = 1)\n", + "# Convert Counts to integer values\n", + "tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(-99).astype(int)\n", "tweets_combined = tweets_combined.assign(measured_at = pd.to_datetime(tweets_combined['measured_at']), # change date to date format\n", - " created_at = pd.to_datetime(tweets_combined['created_at']))\n", - "police_stations = pd.read_csv(\"data/polizei_accounts_geo.csv\", sep = \"\\t\" # addiditional on police stations\n", - " ).rename(columns = {\"Polizei Account\": \"handle\"})\n", - "tweets_user" + " created_at = pd.to_datetime(tweets_combined['created_at']),\n", + " handle = tweets_combined['handle'].str.lower(),\n", + " is_deleted = tweets_combined['is_deleted'].map(lambda x: False if x == 0.0 else ( True if x == 1.0 else np.nan)))\n", + "tweets_combined" 
] }, { @@ -337,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 112, "id": "0e5eb455-6b12-4572-8f5e-f328a94bd797", "metadata": { "tags": [] @@ -346,13 +486,13 @@ { "data": { "text/plain": [ - "hashtag 157145\n", - "url 88322\n", - "mention 36815\n", + "hashtag 267255\n", + "url 141594\n", + "mention 71142\n", "Name: entity_type, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -369,12 +509,12 @@ "tags": [] }, "source": [ - "Insgesamt haben wir 84794 einzigartige Tweets:" + "Insgesamt haben wir 151690 einzigartige Tweets:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 113, "id": "5a438e7f-8735-40bb-b450-2ce168f0f67a", "metadata": { "tags": [] @@ -383,10 +523,10 @@ { "data": { "text/plain": [ - "84794" + "151690" ] }, - "execution_count": 8, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 114, "id": "4f1e8c6c-3610-436e-899e-4d0307259230", "metadata": { "tags": [] @@ -407,12 +547,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Die Tweets wurden vom 2022-02-24 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 384 Tage.\n" + "Die Tweets wurden vom 2020-10-27 bis zum: 2023-03-16 gesammelt. Also genau insgesamt: 870 Tage. 
(Mit kleinen Unterbrechungen)\n" ] } ], "source": [ - "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage.\")\n", + "print(\"Die Tweets wurden vom \", tweets_combined['created_at'].min().date(), \"bis zum:\", tweets_combined['created_at'].max().date(), \"gesammelt.\", \"Also genau insgesamt:\", (tweets_combined['created_at'].max() - tweets_combined['created_at'].min()).days, \"Tage. (Mit kleinen Unterbrechungen)\")\n", "# tweets_combined[tweets_combined['created_at'] == tweets_combined['created_at'].max()] # Tweets vom letzten Tag" ] }, @@ -428,9 +568,11 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 122, "id": "9373552e-6baf-46df-ae16-c63603e20a83", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -467,7 +609,7 @@ "151690 rows × 4 columns
\n", + "163 rows × 1 columns
\n", "" ], "text/plain": [ - " like_count retweet_count reply_count quote_count\n", - "0 2 1 2 0\n", - "1 2 0 0 0\n", - "2 19 3 3 0\n", - "3 0 0 0 0\n", - "4 2 0 0 0\n", - "... ... ... ... ...\n", - "151685 5 1 1 0\n", - "151686 2 0 0 0\n", - "151687 6 0 0 0\n", - "151688 2 0 0 0\n", - "151689 10 1 0 0\n", + " count\n", + "user_id user_name \n", + "223758384 Polizei Sachsen 5340\n", + "259607457 Polizei NRW K 2544\n", + "424895827 Polizei Stuttgart 1913\n", + "769128278 Polizei NRW DO 4895\n", + "775664780 Polizei Rostock 604\n", + "... ...\n", + "1169206134189830145 Polizei Stendal 842\n", + "1184022676488314880 Polizei Pforzheim 283\n", + "1184024283342950401 Polizei Ravensburg 460\n", + "1232548941889228808 Systemstratege: 168\n", + "1295978598034284546 Polizei ZPD NI 133\n", "\n", - "[151690 rows x 4 columns]" + "[163 rows x 1 columns]" ] }, - "execution_count": 148, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "dac4e5fc-22ca-466d-bc3c-586e68696d03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "like_count\n", - "False 147573\n", - "True 4117\n", - "dtype: int64" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] + "source": [ + "tweets_combined.groupby(by = [\"user_id\", \"user_name\"]\n", + " )[\"user_id\"].aggregate(['count']\n", + " )" + ] } ], "metadata": {