Adds script for general dataset analysis

Adds env/ to gitignore
2023-06-17 16:54:56 +02:00 · 2023-06-17 12:21:03 +02:00
3 changed files with 57 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
 .jupyter/
 env/
--- a/data/general_analysis_results.json
+++ b/data/general_analysis_results.json
@ -0,0 +1 @@
 {"hashtags": 267255, "mention": 71142, "url": 141594, "tweet_count": 151690, "num_police_accounts": 163, "date_first_tweet": "2020-10-27 09:29:13", "date_last_tweet": "2023-03-16 11:42:58", "day_diff": 870, "avg_post_hour": 11.156780275562001, "num_text_tokens": 3764759}
--- a/src/general_analysis.py
+++ b/src/general_analysis.py
@ -0,0 +1,55 @@
 import pandas as pd
 from datetime import datetime
 import json
 # Container that collects every statistic computed by this script.
 general_analysis_dict = {}
 # Layout of the created_at timestamps, used for both parsing and formatting.
 date_format_str = "%Y-%m-%d %H:%M:%S"
 # Load the combined tweet table first, then stack the two entity tables.
 tweets_all_combined = pd.read_csv("data/tweets_all_combined.csv")
 tweets_meta = pd.concat(
     [
         pd.read_csv("data/entity_old.tsv", sep="\t"),
         pd.read_csv("data/tweets.csv"),
     ]
 )
 # Number of rows per entity_type; a type that never occurs raises KeyError.
 meta_infos = tweets_meta["entity_type"].value_counts()
 hashtags, mention, url = (
     meta_infos[entity] for entity in ("hashtag", "mention", "url")
 )
 # Distinct (non-missing) tweet and user identifiers.
 count_tweets = tweets_all_combined["tweet_id"].nunique()
 count_police_accounts = tweets_all_combined["user_id"].nunique()
 # created_at is passed to strptime as text here, so min()/max() compare
 # strings. NOTE(review): that equals chronological order only if every value
 # follows date_format_str -- confirm against the data.
 date_first_tweet = datetime.strptime(
     tweets_all_combined["created_at"].min(), date_format_str
 )
 date_last_tweet = datetime.strptime(
     tweets_all_combined["created_at"].max(), date_format_str
 )
 # Whole days between the earliest and the latest tweet.
 day_diff = (date_last_tweet - date_first_tweet).days
 def hr_func(ts):
     """Return the ``hour`` attribute of the given timestamp-like object."""
     hour_of_day = ts.hour
     return hour_of_day
 # Parse created_at into datetimes so the posting hour can be extracted.
 tweets_all_combined["created_at"] = pd.to_datetime(
     tweets_all_combined["created_at"]
 )
 tweets_all_combined["created_hour"] = tweets_all_combined["created_at"].apply(hr_func)
 # Arithmetic mean of the per-tweet posting hour.
 avg_post_hour = tweets_all_combined["created_hour"].mean()
 # Total number of whitespace-separated tokens over all tweets. .str.len() is
 # used instead of .apply(len): a missing tweet_text stays NaN after
 # .str.split(), and len() of that float raises TypeError, whereas .str.len()
 # keeps the NaN and sum() skips it.
 num_text_tokens = tweets_all_combined["tweet_text"].str.split().str.len().sum()
 # Cast everything to built-in int/float/str before dumping: the pandas/NumPy
 # integer scalars produced above are not JSON-serializable as they are.
 general_analysis_dict.update(
     {
         "hashtags": int(hashtags),
         "mention": int(mention),
         "url": int(url),
         "tweet_count": int(count_tweets),
         "num_police_accounts": int(count_police_accounts),
         "date_first_tweet": date_first_tweet.strftime(date_format_str),
         "date_last_tweet": date_last_tweet.strftime(date_format_str),
         "day_diff": int(day_diff),
         "avg_post_hour": float(avg_post_hour),
         "num_text_tokens": int(num_text_tokens),
     }
 )
 print("Writing general analysis json dump")
 # Explicit encoding keeps the output independent of the platform default.
 with open("data/general_analysis_results.json", "w", encoding="utf-8") as f:
     json.dump(general_analysis_dict, f)
Author	SHA1	Message	Date
procrastimax	2c50cc9b72	Adds script for general dataset analysis	2023-06-17 16:54:56 +02:00
procrastimax	8647f32ca8	Adds env/ to gitignore	2023-06-17 12:21:03 +02:00
		`@ -0,0 +1 @@`
							`{"hashtags": 267255, "mention": 71142, "url": 141594, "tweet_count": 151690, "num_police_accounts": 163, "date_first_tweet": "2020-10-27 09:29:13", "date_last_tweet": "2023-03-16 11:42:58", "day_diff": 870, "avg_post_hour": 11.156780275562001, "num_text_tokens": 3764759}`