|
|
|
@ -0,0 +1,55 @@
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
# Collects every summary statistic computed by this script.
general_analysis_dict = dict()

# Timestamp layout used to parse `created_at` strings and to format dates.
date_format_str = "%Y-%m-%d %H:%M:%S"

# Main tweet table; later statements read its tweet_id, user_id, created_at
# and tweet_text columns.
tweets_all_combined = pd.read_csv("data/tweets_all_combined.csv")

# Metadata table: rows of data/entity_old.tsv (tab-separated) followed by the
# rows of data/tweets.csv. Original row indices are kept as-is by concat.
_entity_old_frame = pd.read_csv("data/entity_old.tsv", sep="\t")
_tweets_frame = pd.read_csv("data/tweets.csv")
tweets_meta = pd.concat([_entity_old_frame, _tweets_frame])
|
|
|
|
|
|
|
|
|
|
# Frequency of each distinct value in the `entity_type` column.
meta_infos = tweets_meta["entity_type"].value_counts()

# Look the counts up with .get(..., 0): an entity type that never occurs in
# the data is reported as zero instead of aborting the script with KeyError.
hashtags = meta_infos.get("hashtag", 0)
mention = meta_infos.get("mention", 0)
url = meta_infos.get("url", 0)
|
|
|
|
|
|
|
|
|
|
# Number of distinct tweet ids and distinct user ids. nunique() counts the
# distinct non-null values directly, without first building the full
# frequency table that value_counts() would produce.
count_tweets = tweets_all_combined["tweet_id"].nunique()
count_police_accounts = tweets_all_combined["user_id"].nunique()

# strptime() needs a str, so `created_at` is expected to still hold text at
# this point; min()/max() then compare the strings themselves.
# NOTE(review): string order equals chronological order only if every value
# follows the zero-padded date_format_str layout - confirm against the data.
date_first_tweet = datetime.strptime(
    tweets_all_combined["created_at"].min(), date_format_str
)
date_last_tweet = datetime.strptime(
    tweets_all_combined["created_at"].max(), date_format_str
)

# Whole-day component of the span between the first and the last tweet.
day_diff = (date_last_tweet - date_first_tweet).days
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def hr_func(ts):
    """Return the hour component of a timestamp.

    Args:
        ts: Any object exposing an ``hour`` attribute, such as a
            ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns:
        The value of ``ts.hour``.

    Raises:
        AttributeError: If ``ts`` has no ``hour`` attribute.
    """
    return ts.hour
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Replace the `created_at` column with its pandas datetime representation.
_created_at_parsed = pd.to_datetime(tweets_all_combined["created_at"])
tweets_all_combined["created_at"] = _created_at_parsed

# Derive a `created_hour` column by applying hr_func to every timestamp.
_created_hour = tweets_all_combined["created_at"].apply(hr_func)
tweets_all_combined["created_hour"] = _created_hour

# Arithmetic mean of the hour values.
# NOTE(review): hours wrap around at midnight, so a plain mean of e.g. 23 and
# 1 is 12 - confirm that this is the intended statistic.
avg_post_hour = tweets_all_combined["created_hour"].mean()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Total number of whitespace-separated tokens over all tweet texts.
# .str.len() is used instead of .apply(len): a row with missing text yields
# NaN after .str.split(), and len(NaN) would raise TypeError, whereas
# .str.len() propagates NaN and sum() then skips it.
_tokens_per_tweet = tweets_all_combined["tweet_text"].str.split().str.len()
# sum() returns a float when NaN rows were present; normalise to a plain int.
num_text_tokens = int(_tokens_per_tweet.sum())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Store every computed statistic in the result dict. Numbers are converted to
# built-in int/float and dates are formatted as strings (presumably so that
# the values can be serialised by json - verify if new entries are added).
general_analysis_dict.update({
    "hashtags": int(hashtags),
    "mention": int(mention),
    "url": int(url),
    "tweet_count": int(count_tweets),
    "num_police_accounts": int(count_police_accounts),
    "date_first_tweet": date_first_tweet.strftime(date_format_str),
    "date_last_tweet": date_last_tweet.strftime(date_format_str),
    "day_diff": int(day_diff),
    "avg_post_hour": float(avg_post_hour),
    "num_text_tokens": int(num_text_tokens),
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Write the collected statistics to data/general_analysis_results.json.
print("Writing general analysis json dump")
# The encoding is stated explicitly so the file does not depend on the
# platform's default text encoding; the `with` block closes the file even if
# json.dump raises.
with open("data/general_analysis_results.json", "w", encoding="utf-8") as f:
    json.dump(general_analysis_dict, f)
|