Adds pickling of the TF-IDF matrix and saving of relevance scores

2023-06-27 23:50:26 +02:00 · 2023-06-27 23:50:26 +02:00 · d1348391f9
commit d1348391f9
parent 12cfbc5222
3 changed files with 151705 additions and 26 deletions
--- a/data/tfidf_matrix.pckl
+++ b/data/tfidf_matrix.pckl
--- a/data/tweet_relevance.json
+++ b/data/tweet_relevance.json
--- a/src/search_index.py
+++ b/src/search_index.py
@ -1,42 +1,30 @@
 import pandas as pd
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
+import pickle

 tweet_path = "data/tweets_all_combined.csv"
+tfidf_pickle_path = "data/tfidf_matrix.pckl"
+relevancy_score_path = "data/tweet_relevance.json"

+print("Creating TFIDF Matrix")
 tweets = pd.read_csv(tweet_path)
-
-print(tweets.head())
-print(f"shape: {tweets.shape}")
-print(f"keys: {tweets.keys()}")
 vectorizer = TfidfVectorizer()
+# TODO: we could stem or lemmatize the words as a preprocessing step, but maybe this is not needed?
 model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])

-query = "@dima973"
-
-query_vec = vectorizer.transform([query])
-similarity = cosine_similarity(query_vec, model).flatten()
-
-
-# only return stuff if there is acatually a good match for it
-match_idx = np.where(similarity != 0)[0]
-indices = np.argsort(-similarity[match_idx])
-correct_indices = match_idx[indices]
-result = tweets.iloc[correct_indices]
-
+print("Saving TFIDF Matrix")
+with open(tfidf_pickle_path, "wb") as f:
+    pickle.dump(model, f, protocol=5)

+print("Calculating relevance score and saving new csv")
 like_count_weight = 1.0
 retweet_count_weight = 1.0
 reply_count_weight = 1.0
 quote_count_weight = 1.0
+tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))

-# TODO: maybe come up with a more intelligent relevancy_score calculation
-tweets["relevancy_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))
-
-result = tweets.iloc[correct_indices]
-overall = result["relevancy_score"] * similarity[correct_indices]
-print(result.loc[overall.sort_values(ascending=False).index][["tweet_text", "like_count", "retweet_count", "reply_count", "quote_count", "relevancy_score"]].head())
-
-# TODO: save trained TfidfVectorizer matrix as pickle
-# TODO: save csv with relevancy score
+print("Saving relevance_scores as csv")
+with open(relevancy_score_path, "w", newline="", encoding="utf-8") as f:
+    # Some metrics (e.g. like_count) can be -1, which can make the relevance score NaN; NaN is stored as '1.0'. Write through the already-open handle (newline="" as pandas requires for text handles) instead of reopening the path.
+    tweets[["tweet_id", "relevance_score"]].to_csv(f, header=True, index=False, na_rep="1.0")