Adds basic code for searchable tf-idf matrix
This commit is contained in:
parent
2684b4b8c7
commit
12cfbc5222
2 changed files with 44 additions and 0 deletions
|
@ -22,6 +22,8 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
|
||||||
date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
|
date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
|
||||||
day_diff = (date_last_tweet - date_first_tweet).days
|
day_diff = (date_last_tweet - date_first_tweet).days
|
||||||
|
|
||||||
|
print(date_last_tweet)
|
||||||
|
|
||||||
|
|
||||||
def hr_func(ts):
|
def hr_func(ts):
|
||||||
return ts.hour
|
return ts.hour
|
||||||
|
|
42
src/search_index.py
Normal file
42
src/search_index.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
|
# Relative path to the combined tweet CSV that the index is built from.
tweet_path = "data/tweets_all_combined.csv"
tweets = pd.read_csv(tweet_path)

# Sanity check: preview the first rows, then report the frame's shape and
# its column labels.
for report in (
    tweets.head(),
    f"shape: {tweets.shape}",
    f"keys: {tweets.keys()}",
):
    print(report)
|
||||||
|
# TfidfVectorizer lowercases every document itself (lowercase=True is the
# default), so no manual .lower() pass over the texts is needed.
vectorizer = TfidfVectorizer()

# Rows without text are read by pandas as NaN (a float), which has no
# .lower() and used to abort the whole build. They are replaced with an empty
# document instead of being dropped, so that row i of `model` still
# corresponds to row i of `tweets`.
model = vectorizer.fit_transform(tweets["tweet_text"].fillna(""))
|
||||||
|
|
||||||
|
# Hard-coded sample query.
# NOTE(review): with the vectorizer's default token pattern the leading "@"
# is presumably not part of any token, so this matches the bare word as
# well as mentions -- confirm that is the intended behaviour.
query = "@dima973"

# Project the query into the fitted TF-IDF vocabulary.
query_vec = vectorizer.transform([query])

# cosine_similarity returns a 2-D array (one row per query); flatten it to a
# 1-D array holding one score per row of `model`.
pairwise_scores = cosine_similarity(query_vec, model)
similarity = pairwise_scores.flatten()
|
||||||
|
|
||||||
|
|
||||||
|
# Only return something if there is actually a match for it: keep the
# positions whose similarity to the query is non-zero.
match_idx = np.where(similarity != 0)[0]

# Order the surviving positions from most to least similar. The argsort runs
# on the filtered scores, so its output indexes into `match_idx`, not into
# `similarity` directly.
indices = np.argsort(-similarity[match_idx])

# Translate back to row positions in `similarity`, best match first.
correct_indices = match_idx[indices]
|
||||||
|
|
||||||
|
|
||||||
|
# Per-signal weights for the engagement score; every signal currently
# contributes equally.
like_count_weight = 1.0
retweet_count_weight = 1.0
reply_count_weight = 1.0
quote_count_weight = 1.0

# TODO: maybe come up with a more intelligent relevancy_score calculation
# Weighted sum of the four engagement counters of each tweet.
weighted_engagement = (
    tweets["like_count"] * like_count_weight
    + tweets["retweet_count"] * retweet_count_weight
    + tweets["reply_count"] * reply_count_weight
    + tweets["quote_count"] * quote_count_weight
)

# log1p(x) computes log(1 + x): a tweet with zero engagement scores 0 and the
# score grows logarithmically with the weighted total.
tweets["relevancy_score"] = np.log1p(weighted_engagement)
|
||||||
|
|
||||||
|
# Select the matching rows, best textual match first.
result = tweets.iloc[correct_indices]

# Combined score: engagement-based relevancy scaled by the textual
# similarity of the same row (the multiplication is positional).
overall = result["relevancy_score"] * similarity[correct_indices]

# Show the top rows by combined score, restricted to the text and the
# engagement columns.
ranked_labels = overall.sort_values(ascending=False).index
shown_columns = [
    "tweet_text",
    "like_count",
    "retweet_count",
    "reply_count",
    "quote_count",
    "relevancy_score",
]
print(result.loc[ranked_labels][shown_columns].head())
|
||||||
|
|
||||||
|
# TODO: save trained TfidfVectorizer matrix as pickle
|
||||||
|
# TODO: save csv with relevancy score
|
Loading…
Reference in a new issue