Adds stopwords and ngram range to the TF-IDF vectorizer
This commit is contained in:
parent
4f4af74259
commit
804f966a00
3 changed files with 642 additions and 168 deletions
|
|
@@ -2,16 +2,30 @@ import pandas as pd
|
|||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
tweet_path = "data/tweets_all_combined.csv"
|
||||
tfidf_matrix_path = "data/tfidf_matrix.pckl"
|
||||
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
|
||||
relevancy_score_path = "data/tweet_relevance.json"
|
||||
stopwords_path = "data/stopwords-de.txt"
|
||||
|
||||
print("Reading in stopwords")
|
||||
with open(stopwords_path, "r") as f:
|
||||
stopwords = f.read()
|
||||
|
||||
stopword_list = stopwords.split("\n")
|
||||
stopword_list = list(filter(lambda x: len(x) > 0, stopword_list))
|
||||
|
||||
|
||||
print("Creating TFIDF Matrix")
|
||||
tweets = pd.read_csv(tweet_path)
|
||||
vectorizer = TfidfVectorizer()
|
||||
# TODO: we could stem or lemma the words as preprocessing, but maybe this is not needed?
|
||||
vectorizer = TfidfVectorizer(lowercase=True,
|
||||
analyzer="word",
|
||||
stop_words=stopword_list,
|
||||
ngram_range=(1, 3),
|
||||
max_df=0.8)
|
||||
|
||||
model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]])
|
||||
|
||||
print("Saving TFIDF Matrix")
|
||||
|
|
@@ -27,9 +41,11 @@ like_count_weight = 1.0
|
|||
retweet_count_weight = 1.0
|
||||
reply_count_weight = 1.0
|
||||
quote_count_weight = 1.0
|
||||
tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))
|
||||
tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] *
|
||||
retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight))
|
||||
|
||||
print("Saving relevance_scores as csv")
|
||||
with open(relevancy_score_path, "w") as f:
|
||||
# we have the case that some metrics like like_count can be -1, the relevancy score therefore is NaN -> so we store it as '1.0'
|
||||
tweets[["tweet_id", "relevance_score"]].to_csv(relevancy_score_path, header=True, index=False, na_rep=1.0)
|
||||
tweets[["tweet_id", "relevance_score"]].to_csv(
|
||||
relevancy_score_path, header=True, index=False, na_rep=1.0)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue