From 4f4af742593bc306ddcdaa1b293edc66dc2eb182 Mon Sep 17 00:00:00 2001
From: procrastimax <mmeheykeroth@protonmail.com>
Date: Wed, 28 Jun 2023 01:47:56 +0200
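Subject: [PATCH] Add pickling of the TF-IDF vectorizer

Save the fitted TfidfVectorizer to data/tfidf_vectorizer.pckl in addition
to the TF-IDF matrix. Rename tfidf_pickle_path to tfidf_matrix_path to
match the new tfidf_vectorizer_path, and add the new pickle file to
.gitignore.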

---
 .gitignore          | 1 +
 src/search_index.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 823df4df..2de63a7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .jupyter/
 env/
 data/tfidf_matrix.pckl
+data/tfidf_vectorizer.pckl
diff --git a/src/search_index.py b/src/search_index.py
index 49a68cbf..e18ffb0e 100644
--- a/src/search_index.py
+++ b/src/search_index.py
@@ -4,7 +4,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 import pickle
 
 tweet_path = "data/tweets_all_combined.csv"
-tfidf_pickle_path = "data/tfidf_matrix.pckl"
+tfidf_matrix_path = "data/tfidf_matrix.pckl"
+tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
 relevancy_score_path = "data/tweet_relevance.json"
 
 print("Creating TFIDF Matrix")
@@ -14,9 +15,13 @@ vectorizer = TfidfVectorizer()
 model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
 
 print("Saving TFIDF Matrix")
-with open(tfidf_pickle_path, "wb") as f:
+with open(tfidf_matrix_path, "wb") as f:
     pickle.dump(model, f, protocol=5)
 
+print("Saving TFIDF vectorizer")
+with open(tfidf_vectorizer_path, "wb") as f:
+    pickle.dump(vectorizer, f, protocol=5)
+
 print("Calculating relevance score and saving new csv")
 like_count_weight = 1.0
 retweet_count_weight = 1.0