From 4f4af742593bc306ddcdaa1b293edc66dc2eb182 Mon Sep 17 00:00:00 2001
From: procrastimax
Date: Wed, 28 Jun 2023 01:47:56 +0200
Subject: [PATCH] Adds pickling of tfidf vectorizer

---
 .gitignore          | 1 +
 src/search_index.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 823df4df..2de63a7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .jupyter/
 env/
 data/tfidf_matrix.pckl
+data/tfidf_vectorizer.pckl
diff --git a/src/search_index.py b/src/search_index.py
index 49a68cbf..e18ffb0e 100644
--- a/src/search_index.py
+++ b/src/search_index.py
@@ -4,7 +4,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 import pickle
 
 tweet_path = "data/tweets_all_combined.csv"
-tfidf_pickle_path = "data/tfidf_matrix.pckl"
+tfidf_matrix_path = "data/tfidf_matrix.pckl"
+tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
 relevancy_score_path = "data/tweet_relevance.json"
 
 print("Creating TFIDF Matrix")
@@ -14,9 +15,13 @@ vectorizer = TfidfVectorizer()
 model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
 
 print("Saving TFIDF Matrix")
-with open(tfidf_pickle_path, "wb") as f:
+with open(tfidf_matrix_path, "wb") as f:
     pickle.dump(model, f, protocol=5)
 
+print("Saving TFIDF vectorizer")
+with open(tfidf_vectorizer_path, "wb") as f:
+    pickle.dump(vectorizer, f, protocol=5)
+
 print("Calculating relevance score and saving new csv")
 like_count_weight = 1.0
 retweet_count_weight = 1.0