diff --git a/.gitignore b/.gitignore index 398e2c16..f5c53959 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ .jupyter/ -env/ diff --git a/data/general_analysis_results.json b/data/general_analysis_results.json deleted file mode 100644 index 1b6ca61d..00000000 --- a/data/general_analysis_results.json +++ /dev/null @@ -1 +0,0 @@ -{"hashtags": 267255, "mention": 71142, "url": 141594, "tweet_count": 151690, "num_police_accounts": 163, "date_first_tweet": "2020-10-27 09:29:13", "date_last_tweet": "2023-03-16 11:42:58", "day_diff": 870, "avg_post_hour": 11.156780275562001, "num_text_tokens": 3764759} \ No newline at end of file diff --git a/src/general_analysis.py b/src/general_analysis.py deleted file mode 100644 index 2e5d2807..00000000 --- a/src/general_analysis.py +++ /dev/null @@ -1,55 +0,0 @@ -import pandas as pd -from datetime import datetime -import json - -general_analysis_dict = {} - -date_format_str = "%Y-%m-%d %H:%M:%S" - -tweets_all_combined = pd.read_csv("data/tweets_all_combined.csv") - -tweets_meta = pd.concat([pd.read_csv("data/entity_old.tsv", sep="\t"), - pd.read_csv("data/tweets.csv")]) - -meta_infos = tweets_meta["entity_type"].value_counts() -hashtags = meta_infos["hashtag"] -mention = meta_infos["mention"] -url = meta_infos["url"] - -count_tweets = tweets_all_combined["tweet_id"].value_counts().shape[0] -count_police_accounts = tweets_all_combined["user_id"].value_counts().shape[0] -date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), date_format_str) -date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str) -day_diff = (date_last_tweet-date_first_tweet).days - - -def hr_func(ts): - return ts.hour - - -tweets_all_combined["created_at"] = pd.to_datetime( - tweets_all_combined["created_at"]) - -tweets_all_combined["created_hour"] = tweets_all_combined["created_at"].apply(hr_func) - -avg_post_hour = tweets_all_combined["created_hour"].mean() - - -num_text_tokens = tweets_all_combined["tweet_text"].str.split().apply(len).sum() - - -general_analysis_dict["hashtags"] = int(hashtags) -general_analysis_dict["mention"] = int(mention) -general_analysis_dict["url"] = int(url) -general_analysis_dict["tweet_count"] = int(count_tweets) -general_analysis_dict["num_police_accounts"] = int(count_police_accounts) -general_analysis_dict["date_first_tweet"] = date_first_tweet.strftime(date_format_str) -general_analysis_dict["date_last_tweet"] = date_last_tweet.strftime(date_format_str) -general_analysis_dict["day_diff"] = int(day_diff) -general_analysis_dict["avg_post_hour"] = float(avg_post_hour) -general_analysis_dict["num_text_tokens"] = int(num_text_tokens) - - -print("Writing general analysis json dump") -with open("data/general_analysis_results.json", "w") as f: - json.dump(general_analysis_dict, f)