From 12cfbc5222a8849c4a22e633a539d7f1d99c2b84 Mon Sep 17 00:00:00 2001
From: procrastimax
Date: Fri, 23 Jun 2023 19:19:19 +0200
Subject: [PATCH] Adds basic code for searchable tf-idf matrix

---
 src/general_analysis.py |  2 ++
 src/search_index.py     | 42 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 src/search_index.py

diff --git a/src/general_analysis.py b/src/general_analysis.py
index 6f4266a1..ea76197a 100644
--- a/src/general_analysis.py
+++ b/src/general_analysis.py
@@ -22,6 +22,8 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
 date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
 day_diff = (date_last_tweet - date_first_tweet).days
 
+print(date_last_tweet)
+
 
 def hr_func(ts):
     return ts.hour
diff --git a/src/search_index.py b/src/search_index.py
new file mode 100644
index 00000000..c2a578bb
--- /dev/null
+++ b/src/search_index.py
@@ -0,0 +1,42 @@
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+tweet_path = "data/tweets_all_combined.csv"
+
+tweets = pd.read_csv(tweet_path)
+
+print(tweets.head())
+print(f"shape: {tweets.shape}")
+print(f"keys: {tweets.keys()}")
+vectorizer = TfidfVectorizer()
+model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
+
+query = "@dima973"
+
+query_vec = vectorizer.transform([query])
+similarity = cosine_similarity(query_vec, model).flatten()
+
+
+# only return stuff if there is actually a good match for it
+match_idx = np.where(similarity != 0)[0]
+indices = np.argsort(-similarity[match_idx])
+correct_indices = match_idx[indices]
+result = tweets.iloc[correct_indices]
+
+
+like_count_weight = 1.0
+retweet_count_weight = 1.0
+reply_count_weight = 1.0
+quote_count_weight = 1.0
+
+# TODO: maybe come up with a more intelligent relevancy_score calculation
+tweets["relevancy_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))
+
+result = tweets.iloc[correct_indices]
+overall = result["relevancy_score"] * similarity[correct_indices]
+print(result.loc[overall.sort_values(ascending=False).index][["tweet_text", "like_count", "retweet_count", "reply_count", "quote_count", "relevancy_score"]].head())
+
+# TODO: save trained TfidfVectorizer matrix as pickle
+# TODO: save csv with relevancy score