From 12cfbc5222a8849c4a22e633a539d7f1d99c2b84 Mon Sep 17 00:00:00 2001
From: procrastimax
Date: Fri, 23 Jun 2023 19:19:19 +0200
Subject: [PATCH] Adds basic code for searchable tf-idf matrix

---
 src/general_analysis.py |  2 ++
 src/search_index.py     | 42 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 src/search_index.py

diff --git a/src/general_analysis.py b/src/general_analysis.py
index 6f4266a1..ea76197a 100644
--- a/src/general_analysis.py
+++ b/src/general_analysis.py
@@ -22,6 +22,8 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
 date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
 day_diff = (date_last_tweet - date_first_tweet).days
 
+print(date_last_tweet)
+
 
 def hr_func(ts):
     return ts.hour
diff --git a/src/search_index.py b/src/search_index.py
new file mode 100644
index 00000000..c2a578bb
--- /dev/null
+++ b/src/search_index.py
@@ -0,0 +1,42 @@
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+tweet_path = "data/tweets_all_combined.csv"
+
+tweets = pd.read_csv(tweet_path)
+
+print(tweets.head())
+print(f"shape: {tweets.shape}")
+print(f"keys: {tweets.keys()}")
+vectorizer = TfidfVectorizer()
+model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
+
+query = "@dima973"
+
+query_vec = vectorizer.transform([query])
+similarity = cosine_similarity(query_vec, model).flatten()
+
+
+# only return stuff if there is actually a good match for it
+match_idx = np.where(similarity != 0)[0]
+indices = np.argsort(-similarity[match_idx])
+correct_indices = match_idx[indices]
+result = tweets.iloc[correct_indices]
+
+
+like_count_weight = 1.0
+retweet_count_weight = 1.0
+reply_count_weight = 1.0
+quote_count_weight = 1.0
+
+# TODO: maybe come up with a more intelligent relevancy_score calculation
+tweets["relevancy_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))
+
+result = tweets.iloc[correct_indices]
+overall = result["relevancy_score"] * similarity[correct_indices]
+print(result.loc[overall.sort_values(ascending=False).index][["tweet_text", "like_count", "retweet_count", "reply_count", "quote_count", "relevancy_score"]].head())
+
+# TODO: save trained TfidfVectorizer matrix as pickle
+# TODO: save csv with relevancy score