Adds pickling of tfidf vectorizer

This commit is contained in:
procrastimax 2023-06-28 01:47:56 +02:00
parent 353b9f85cf
commit 4f4af74259
2 changed files with 8 additions and 2 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.jupyter/
env/
data/tfidf_matrix.pckl
data/tfidf_vectorizer.pckl

View File

@@ -4,7 +4,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
tweet_path = "data/tweets_all_combined.csv"
tfidf_pickle_path = "data/tfidf_matrix.pckl"
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
relevancy_score_path = "data/tweet_relevance.json"
print("Creating TFIDF Matrix")
@@ -14,9 +15,13 @@ vectorizer = TfidfVectorizer()
model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
print("Saving TFIDF Matrix")
with open(tfidf_pickle_path, "wb") as f:
with open(tfidf_matrix_path, "wb") as f:
pickle.dump(model, f, protocol=5)
print("Saving TFIDF vectorizer")
with open(tfidf_vectorizer_path, "wb") as f:
pickle.dump(vectorizer, f, protocol=5)
print("Calculating relevance score and saving new csv")
like_count_weight = 1.0
retweet_count_weight = 1.0