Refactors repo structure
This commit is contained in:
parent
9450ec940b
commit
5ccecf9a90
31208 changed files with 55 additions and 4559521 deletions
54 src/merge_police_tweets.py Normal file
@@ -0,0 +1,54 @@
import numpy as np
import pandas as pd


# Merge the tables produced by the old (~2021) and the new (~2022) scraper

# cols: hashtag, url, mention (identical in both)
tweets_meta = pd.concat([pd.read_csv("data/entity_old.tsv", sep="\t"),  # data from the old scraper
                         pd.read_csv("data/tweets.csv")])  # data from the new scraper

# cols: id, tweet_text, created_at, user_id; only a subset of the old table (the same TSV is used in the next step)
tweets_text = pd.concat([pd.read_csv("data/tweet_old.tsv", sep="\t")[['id', 'tweet_text', 'created_at', 'user_id']].rename(columns={"id": "tweet_id"}),
                         pd.read_csv("data/tweets-1679742698645.csv")])

# cols: id, like_count, retweet_count, reply_count, quote_count; only a subset of the old table
tweets_statistics = pd.concat([pd.read_csv("data/tweet_old.tsv", sep="\t")[['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']].rename(columns={"id": "tweet_id"}),
                               pd.read_csv("data/tweets-1679742620302.csv")])

# cols: user_id, handle, user_name; the column names do not match between the old and the new data,
# and "username" and "handle" appear to be swapped in the new data set (inverse order).
# Note: only a small number of user_ids appear in both data sets, and where they do the username has
# occasionally changed over time, so the two tables cannot simply be merged on it.
tweets_user = pd.read_csv("data/user_old.tsv", sep="\t"
                          ).rename(columns={"id": "user_id", "name": "user_name"}  # uniform names
                                   ).merge(pd.read_csv("data/tweets-1679742702794.csv"  # merge with the renamed new data
                                                       ).rename(columns={"username": "handle", "handle": "user_name"}),  # swap the column names back
                                           on="user_id",  # user_id as the matching column
                                           how="outer",  # keep all unique user_ids
                                           suffixes=["_2021", "_2022"])  # mark which data set a username/handle came from

# Some usernames belonging to one user_id have changed over time. For easier handling, only the latest
# username and handle are kept (the 2022 value where available, otherwise the 2021 value).
tweets_user = tweets_user.assign(handle=tweets_user.apply(lambda row: row['handle_2021'] if pd.isna(row['handle_2022']) else row['handle_2022'], axis=1),
                                 user_name=tweets_user.apply(lambda row: row['user_name_2021'] if pd.isna(row['user_name_2022']) else row['user_name_2022'], axis=1)
                                 ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis=1)  # no longer needed
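# A vectorised alternative to the row-wise apply above (a sketch, not part of the original script):
# the 2022 value wins and pandas' combine_first falls back to the 2021 value where it is missing.
# tweets_user = tweets_user.assign(handle=tweets_user['handle_2022'].combine_first(tweets_user['handle_2021']),
#                                  user_name=tweets_user['user_name_2022'].combine_first(tweets_user['user_name_2021'])
#                                  ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'], axis=1)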

# Additional information about the police stations
# cols: handle, name, typ, bundesland, stadt, lat, long
police_stations = pd.read_csv("data/polizei_accounts_geo.csv", sep="\t"
                              ).rename(columns={"Polizei Account": "handle"})

# Merge statistics, tweet text and user information into one data frame
tweets_combined = pd.merge(tweets_statistics,
                           tweets_text,
                           on='tweet_id').merge(tweets_user, on='user_id'
                                                ).drop(['id'], axis=1)  # drop the unnecessary id column (redundant to the index)

# Convert data types to appropriate ones; missing counts are filled with 0 so the cast to int succeeds
tweets_combined[['like_count', 'retweet_count', 'reply_count', 'quote_count']] = tweets_combined[[
    'like_count', 'retweet_count', 'reply_count', 'quote_count']].fillna(0).astype(int)
tweets_combined = tweets_combined.assign(measured_at=pd.to_datetime(tweets_combined['measured_at']),  # parse dates
                                         created_at=pd.to_datetime(tweets_combined['created_at']),
                                         handle=tweets_combined['handle'].str.lower(),  # handle to lower case
                                         is_deleted=tweets_combined['is_deleted'].astype('boolean'))  # is_deleted as a boolean column
tweets_combined.to_csv("data/tweets_all_combined.csv")
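# Example of how a downstream script might load the combined file again (a sketch; index_col and
# parse_dates are assumptions, not part of the original pipeline):
# tweets_all = pd.read_csv("data/tweets_all_combined.csv", index_col=0,
#                          parse_dates=['created_at', 'measured_at'])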