Adds stopwords and ngram-range to tfidf-vectorizer
This commit is contained in:
		
							parent
							
								
									4f4af74259
								
							
						
					
					
						commit
						804f966a00
					
				
					 3 changed files with 642 additions and 168 deletions
				
			
		|  | @ -2,16 +2,30 @@ import pandas as pd | |||
| import numpy as np | ||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||
| import pickle | ||||
| import sys | ||||
| 
 | ||||
| tweet_path = "data/tweets_all_combined.csv" | ||||
| tfidf_matrix_path = "data/tfidf_matrix.pckl" | ||||
| tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl" | ||||
| relevancy_score_path = "data/tweet_relevance.json" | ||||
| stopwords_path = "data/stopwords-de.txt" | ||||
| 
 | ||||
| print("Reading in stopwords") | ||||
| with open(stopwords_path, "r") as f: | ||||
|     stopwords = f.read() | ||||
| 
 | ||||
| stopword_list = stopwords.split("\n") | ||||
| stopword_list = list(filter(lambda x: len(x) > 0, stopword_list)) | ||||
| 
 | ||||
| 
 | ||||
| print("Creating TFIDF Matrix") | ||||
| tweets = pd.read_csv(tweet_path) | ||||
| vectorizer = TfidfVectorizer() | ||||
| # TODO: we could stem or lemmatize the words as preprocessing, but maybe this is not needed? | ||||
| vectorizer = TfidfVectorizer(lowercase=True, | ||||
|                              analyzer="word", | ||||
|                              stop_words=stopword_list, | ||||
|                              ngram_range=(1, 3), | ||||
|                              max_df=0.8) | ||||
| 
 | ||||
| model = vectorizer.fit_transform([x.lower() for x in tweets["tweet_text"]]) | ||||
| 
 | ||||
| print("Saving TFIDF Matrix") | ||||
|  | @ -27,9 +41,11 @@ like_count_weight = 1.0 | |||
| retweet_count_weight = 1.0 | ||||
| reply_count_weight = 1.0 | ||||
| quote_count_weight = 1.0 | ||||
| tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight)) | ||||
| tweets["relevance_score"] = np.log(1 + (tweets["like_count"] * like_count_weight) + (tweets["retweet_count"] * | ||||
|                                    retweet_count_weight) + (tweets["reply_count"] * reply_count_weight) + (tweets["quote_count"] * quote_count_weight)) | ||||
| 
 | ||||
| print("Saving relevance_scores as csv") | ||||
| with open(relevancy_score_path, "w") as f: | ||||
|     # Some metrics (e.g. like_count) can be -1, which can make the log argument non-positive and the relevance score NaN; NaN scores are therefore written out as 1.0 (see na_rep). | ||||
|     tweets[["tweet_id", "relevance_score"]].to_csv(relevancy_score_path, header=True, index=False, na_rep=1.0) | ||||
|     tweets[["tweet_id", "relevance_score"]].to_csv( | ||||
|         relevancy_score_path, header=True, index=False, na_rep=1.0) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue