# copbird_aufarbeitung/merge_police_tweets.py
# (retrieved from repository web view, 2023-03-28 13:35:03 +00:00)
import numpy as np
import pandas as pd

# Merge the tables produced by the old (~2021) and the new (~2022) scraper
# into one combined tweet data frame and write it to disk.

## cols: hashtag, url, mention (same layout in both scrapers)
tweets_meta = pd.concat([pd.read_csv("data/entity_old.tsv", sep="\t"),  # data from old scraper
                         pd.read_csv("data/tweets.csv")])               # data from new scraper

## cols: tweet_id, tweet_text, created_at, user_id; only a subset of the old table
## (the same tsv is read again in the next step for the statistics columns)
tweets_text = pd.concat([pd.read_csv("data/tweet_old.tsv", sep="\t")
                           [['id', 'tweet_text', 'created_at', 'user_id']]
                           .rename(columns={"id": "tweet_id"}),  # uniform key name
                         pd.read_csv("data/tweets-1679742698645.csv")])

## cols: tweet_id, like_count, retweet_count, reply_count, quote_count; subset of the old table
tweets_statistics = pd.concat([pd.read_csv("data/tweet_old.tsv", sep="\t")
                                 [['id', 'like_count', 'retweet_count', 'reply_count', 'quote_count']]
                                 .rename(columns={"id": "tweet_id"}),
                               pd.read_csv("data/tweets-1679742620302.csv")])

## cols: user_id, handle, user_name; column names do not match between old and new data.
## Even username and handle seem to be mixed up in the new data set (inverse order).
## NOTE: only a small number of user_ids appear in both data sets, and where they do,
## usernames occasionally changed over time and therefore cannot simply be merged.
tweets_user = (pd.read_csv("data/user_old.tsv", sep="\t")
                 .rename(columns={"id": "user_id", "name": "user_name"})  # uniform names
                 .merge(pd.read_csv("data/tweets-1679742702794.csv")
                          .rename(columns={"username": "handle",
                                           "handle": "user_name"}),  # swap mixed-up columns
                        on="user_id",                # user_id as matching column
                        how="outer",                 # keep all unique user_ids
                        suffixes=["_2021", "_2022"]))  # mark which scrape each column came from

## Some usernames belonging to one user_id changed over time. For easier handling only the
## latest (2022) username/handle is kept, falling back to the 2021 value where 2022 is missing.
## Vectorized fillna replaces the original row-wise apply — identical result, much faster.
tweets_user = tweets_user.assign(
        handle=tweets_user['handle_2022'].fillna(tweets_user['handle_2021']),
        user_name=tweets_user['user_name_2022'].fillna(tweets_user['user_name_2021'])
    ).drop(['handle_2021', 'handle_2022', 'user_name_2021', 'user_name_2022'],
           axis=1)  # no longer needed

## additional information concerning the police stations
## cols: handle, name, typ, bundesland, stadt, lat, long
police_stations = (pd.read_csv("data/polizei_accounts_geo.csv", sep="\t")
                     .rename(columns={"Polizei Account": "handle"}))

# Merge statistics, tweet text and user information into one data frame
tweets_combined = (pd.merge(tweets_statistics, tweets_text, on='tweet_id')
                     .merge(tweets_user, on='user_id')
                     .drop(['id'], axis=1))  # drop unnecessary id column (redundant to index)

# Convert data types to appropriate ones.
# BUG FIX: the original chained .fillna(np.NAN).astype(int) — filling NaN with NaN is a
# no-op and astype(int) raises on missing values; treat missing counts as 0 before casting.
count_cols = ['like_count', 'retweet_count', 'reply_count', 'quote_count']
tweets_combined[count_cols] = tweets_combined[count_cols].fillna(0).astype(int)

tweets_combined = tweets_combined.assign(
        measured_at=pd.to_datetime(tweets_combined['measured_at']),  # parse timestamps
        created_at=pd.to_datetime(tweets_combined['created_at']),
        handle=tweets_combined['handle'].str.lower(),                # handle to lower case
        is_deleted=tweets_combined['is_deleted'].astype('boolean'))  # nullable boolean dtype

# BUG FIX: the original last line was truncated to "tweets_combined.#to_csv(...)" — a
# syntax error; restore the intended write of the combined frame.
tweets_combined.to_csv("data/tweets_all_combined.csv")