Applies gitignore changes

Adds pickled tfidf matrix to gitignore
Adds pickling of tfidf and saving of relevance scores
2023-06-27 23:52:20 +02:00 · 2023-06-27 23:51:43 +02:00 · 2023-06-27 23:50:26 +02:00 · 2023-06-23 19:19:19 +02:00
4 changed files with 151724 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .jupyter/
 env/
+data/tfidf_matrix.pckl
--- a/data/tweet_relevance.json
+++ b/data/tweet_relevance.json
--- a/src/general_analysis.py
+++ b/src/general_analysis.py
@@ -22,6 +22,8 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
 date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
 day_diff = (date_last_tweet - date_first_tweet).days

+print(date_last_tweet)
+

 def hr_func(ts):
    return ts.hour
--- a/src/search_index.py
+++ b/src/search_index.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pickle
+
+# Script: build a TF-IDF matrix over all tweets, pickle it, and store a per-tweet relevance score.
+tweet_path = "data/tweets_all_combined.csv"
+tfidf_pickle_path = "data/tfidf_matrix.pckl"
+relevancy_score_path = "data/tweet_relevance.json"  # NOTE: the content is written as CSV despite the .json suffix
+
+print("Creating TFIDF Matrix")
+tweets = pd.read_csv(tweet_path)
+vectorizer = TfidfVectorizer()  # lowercases the text itself (lowercase=True is the default)
+# TODO: we could stem or lemmatize the words as preprocessing, but maybe this is not needed?
+# Missing tweet texts are read by pandas as NaN (a float); index them as empty documents instead of crashing on .lower()
+model = vectorizer.fit_transform(tweets["tweet_text"].fillna(""))
+
+print("Saving TFIDF Matrix")
+with open(tfidf_pickle_path, "wb") as f:
+    pickle.dump(model, f, protocol=5)
+
+print("Calculating relevance score and saving new csv")
+like_count_weight = 1.0
+retweet_count_weight = 1.0
+reply_count_weight = 1.0
+quote_count_weight = 1.0
+# Metrics like like_count can be -1, so the log argument can be <= 0 and yield NaN or -inf -> store both as 1.0
+tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight)).where(np.isfinite, 1.0)
+print("Saving relevance_scores as csv")
+tweets[["tweet_id", "relevance_score"]].to_csv(relevancy_score_path, header=True, index=False, na_rep="1.0")
Author	SHA1	Message	Date
procrastimax	b2b903f45a	Applies gitignore changes	2023-06-27 23:52:20 +02:00
procrastimax	8fcbb40b2a	Adds pickled tfidf matrix to gitignore	2023-06-27 23:51:43 +02:00
procrastimax	d1348391f9	Adds pickling of tfidf and saving of relevance scores	2023-06-27 23:50:26 +02:00
procrastimax	12cfbc5222	Adds basic code for searchable tf-idf matrix	2023-06-23 19:19:19 +02:00