Compare commits

...

4 Commits

4 changed files with 151724 additions and 0 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
.jupyter/
env/
data/tfidf_matrix.pckl

151691
data/tweet_relevance.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -22,6 +22,8 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
# Parse the largest `created_at` value using `date_format_str`.
# NOTE(review): `.max()` is applied to the raw column before parsing, so if the
# column holds strings this is a lexicographic maximum — confirm the timestamp
# format sorts chronologically.
date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
# Whole days between the first and last tweet (timedelta.days drops the sub-day remainder).
day_diff = (date_last_tweet - date_first_tweet).days
print(date_last_tweet)
def hr_func(ts):
    """Return the hour component of a timestamp.

    Args:
        ts: Any object exposing an ``hour`` attribute (e.g. ``datetime`` or
            ``pandas.Timestamp``).

    Returns:
        The value of ``ts.hour``.
    """
    return ts.hour

30
src/search_index.py Normal file
View File

@ -0,0 +1,30 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
# Build the search artefacts for the tweet corpus:
#   1. a TF-IDF document-term matrix over the tweet texts (pickled), and
#   2. a per-tweet relevance score derived from the engagement counts.

tweet_path = "data/tweets_all_combined.csv"
tfidf_pickle_path = "data/tfidf_matrix.pckl"
# NOTE: despite the ".json" suffix, the content written below is CSV. The path
# is kept unchanged because other code may already read from it.
relevancy_score_path = "data/tweet_relevance.json"

print("Creating TFIDF Matrix")
tweets = pd.read_csv(tweet_path)
vectorizer = TfidfVectorizer()
# TODO: we could stem or lemma the words as preprocessing, but maybe this is not needed?
# Missing texts are loaded as NaN (a float), which has no .lower(); map them to
# an empty document so every row of `tweets` keeps a matching matrix row.
model = vectorizer.fit_transform(
    tweets["tweet_text"].fillna("").astype(str).str.lower()
)

print("Saving TFIDF Matrix")
# NOTE(review): only the matrix is stored, not the fitted vectorizer — confirm
# that consumers do not need the vocabulary to transform queries.
with open(tfidf_pickle_path, "wb") as f:
    pickle.dump(model, f, protocol=5)

print("Calculating relevance score and saving new csv")
like_count_weight = 1.0
retweet_count_weight = 1.0
reply_count_weight = 1.0
quote_count_weight = 1.0

# Weighted engagement per tweet; log(1 + engagement) dampens viral outliers.
engagement = (
    tweets["like_count"] * like_count_weight
    + tweets["retweet_count"] * retweet_count_weight
    + tweets["reply_count"] * reply_count_weight
    + tweets["quote_count"] * quote_count_weight
)
# Some metrics can be -1, so 1 + engagement may be <= 0, where the logarithm
# is undefined (NaN) or -inf. Mask those rows (and rows with missing counts)
# before taking the log and give them the neutral score 1.0, so the output
# never contains NaN or -inf.
has_valid_engagement = engagement > -1
tweets["relevance_score"] = np.log(
    1 + engagement.where(has_valid_engagement)
).fillna(1.0)

print("Saving relevance_scores as csv")
# to_csv opens and closes the target itself; no separate open() is needed.
tweets[["tweet_id", "relevance_score"]].to_csv(
    relevancy_score_path, header=True, index=False
)