Adds stopwords and ngram-range to tfidf-vectorizer

This commit is contained in:
procrastimax 2023-06-28 12:34:22 +02:00
parent 4f4af74259
commit 804f966a00
3 changed files with 642 additions and 168 deletions

View file

@@ -2,16 +2,30 @@ import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import sys
# Input/output locations: tweet corpus, serialized TF-IDF artifacts,
# relevance scores, and the German stopword list.
tweet_path = "data/tweets_all_combined.csv"
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
relevancy_score_path = "data/tweet_relevance.json"
stopwords_path = "data/stopwords-de.txt"

print("Reading in stopwords")
with open(stopwords_path, "r", encoding="utf-8") as f:
    # One stopword per line; splitlines() avoids a dangling "" from a
    # trailing newline, and the filter drops any blank lines.
    stopword_list = [word for word in f.read().splitlines() if word]

print("Creating TFIDF Matrix")
tweets = pd.read_csv(tweet_path)
# TODO: we could stem or lemma the words as preprocessing, but maybe this is not needed?
vectorizer = TfidfVectorizer(lowercase=True,
                             analyzer="word",
                             stop_words=stopword_list,
                             ngram_range=(1, 3),
                             max_df=0.8)
# lowercase=True already lowercases each document during tokenization,
# so no manual per-tweet .lower() pass is needed here.
model = vectorizer.fit_transform(tweets["tweet_text"])
print("Saving TFIDF Matrix")
@@ -27,9 +41,11 @@ like_count_weight = 1.0
# Per-metric weights for the engagement-based relevance score; tune these
# to favour particular engagement types (all equal by default).
retweet_count_weight = 1.0
reply_count_weight = 1.0
quote_count_weight = 1.0

# log(1 + weighted engagement sum) dampens the influence of viral outliers.
# Computed exactly once (the file previously assigned this twice).
tweets["relevance_score"] = np.log(
    1
    + (tweets["like_count"] * like_count_weight)
    + (tweets["retweet_count"] * retweet_count_weight)
    + (tweets["reply_count"] * reply_count_weight)
    + (tweets["quote_count"] * quote_count_weight)
)

print("Saving relevance_scores as csv")
with open(relevancy_score_path, "w") as f:
    # Write through the already-open handle instead of passing the path,
    # which would silently reopen the file and leave `f` unused.
    # Some metrics like like_count can be -1, making the relevancy score
    # NaN -> store those rows as '1.0'.
    tweets[["tweet_id", "relevance_score"]].to_csv(
        f, header=True, index=False, na_rep=1.0)