Compare commits
No commits in common. "b2b903f45a9c4c753c7ee95152a926cbc8d6ceb4" and "2684b4b8c7deba112966966c6ae5ff1234c398e8" have entirely different histories.
b2b903f45a
...
2684b4b8c7
4 changed files with 0 additions and 151724 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,3 +1,2 @@
|
||||||
.jupyter/
|
.jupyter/
|
||||||
env/
|
env/
|
||||||
data/tfidf_matrix.pckl
|
|
||||||
|
|
151691
data/tweet_relevance.json
151691
data/tweet_relevance.json
File diff suppressed because it is too large
Load diff
|
@ -22,8 +22,6 @@ date_first_tweet = datetime.strptime(tweets_all_combined['created_at'].min(), da
|
||||||
date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
|
date_last_tweet = datetime.strptime(tweets_all_combined['created_at'].max(), date_format_str)
|
||||||
day_diff = (date_last_tweet - date_first_tweet).days
|
day_diff = (date_last_tweet - date_first_tweet).days
|
||||||
|
|
||||||
print(date_last_tweet)
|
|
||||||
|
|
||||||
|
|
||||||
def hr_func(ts):
    """Return the hour component (0-23) of the given timestamp-like object."""
    hour_of_day = ts.hour
    return hour_of_day
"""Build a TF-IDF matrix over combined tweet text and compute a per-tweet
engagement-based relevance score, persisting both to disk.

Outputs:
  - data/tfidf_matrix.pckl: pickled sparse TF-IDF matrix
  - data/tweet_relevance.json: CSV-formatted (tweet_id, relevance_score) rows
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

tweet_path = "data/tweets_all_combined.csv"
tfidf_pickle_path = "data/tfidf_matrix.pckl"
relevancy_score_path = "data/tweet_relevance.json"

print("Creating TFIDF Matrix")
tweets = pd.read_csv(tweet_path)
vectorizer = TfidfVectorizer()
# TODO: we could stem or lemma the words as preprocessing, but maybe this is not needed?
model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])

print("Saving TFIDF Matrix")
with open(tfidf_pickle_path, "wb") as f:
    # Protocol 5 (Python 3.8+) is efficient for large binary buffers.
    pickle.dump(model, f, protocol=5)

print("Calculating relevance score and saving new csv")
# All engagement metrics weighted equally; log(1 + x) damps viral outliers.
like_count_weight = 1.0
retweet_count_weight = 1.0
reply_count_weight = 1.0
quote_count_weight = 1.0
tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))

print("Saving relevance_scores as csv")
with open(relevancy_score_path, "w") as f:
    # we have the case that some metrics like like_count can be -1, the relevancy score therefore is NaN -> so we store it as '1.0'
    # Fix: write through the already-open handle instead of passing the path
    # (the original opened/truncated the file and never used `f`, so to_csv
    # reopened the same file a second time), and pass na_rep as the string
    # pandas expects rather than a float.
    tweets[["tweet_id", "relevance_score"]].to_csv(f, header=True, index=False, na_rep="1.0")
Loading…
Reference in a new issue