WIP: add site for displaying topics
This commit is contained in:
		
							parent
							
								
									a7453d11d5
								
							
						
					
					
						commit
						7a8e01f9d1
					
				
					 4 changed files with 87 additions and 26 deletions
				
			
		
							
								
								
									
										11
									
								
								app.py
									
										
									
									
									
								
							
							
						
						
									
										11
									
								
								app.py
									
										
									
									
									
								
							|  | @ -2,9 +2,13 @@ from pathlib import Path | |||
| from typing import List | ||||
| from shiny import App, ui, Inputs, Outputs, Session | ||||
| from shiny.types import NavSetArg | ||||
| from src import mod_welcome, mod_searchable | ||||
| from src import mod_welcome, mod_searchable, mod_topics | ||||
| from src.util import load_html_str_from_file | ||||
| 
 | ||||
| 
 | ||||
| # by importing this module, the tweets are loaded into the tweet_store variable at program start | ||||
| import src.data_loader | ||||
| 
 | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
|  | @ -15,7 +19,9 @@ def nav_controls() -> List[NavSetArg]: | |||
|     return [ | ||||
|         ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"), | ||||
|         ui.nav(ui.h5("Analyse"), "Analyse"), | ||||
|         ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"), | ||||
|         ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui( | ||||
|             "search_engine"), value="search_engine"), | ||||
|         ui.nav(ui.h5("Topics"), mod_topics.topics_ui("topics"), value="topics"), | ||||
|         ui.nav_control( | ||||
|             ui.a( | ||||
|                 ui.h5("AG-Link"), | ||||
|  | @ -60,6 +66,7 @@ app_ui = ui.page_navbar( | |||
| def server(input: Inputs, output: Outputs, session: Session): | ||||
|     mod_welcome.welcome_server("intro") | ||||
|     mod_searchable.searchable_server("search_engine") | ||||
|     mod_topics.topics_server("topics") | ||||
| 
 | ||||
| 
 | ||||
| static_dir = Path(__file__).parent / "www" | ||||
|  |  | |||
							
								
								
									
										26
									
								
								src/data_loader.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								src/data_loader.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,26 @@ | |||
| import pandas as pd | ||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||
| import pickle | ||||
| 
 | ||||
| 
 | ||||
class TweetStore:
    """Central store for the tweet dataframe and pre-computed TF-IDF artifacts.

    All data is loaded once in ``__init__``; the module-level ``tweet_store``
    instance below makes it available to the rest of the app at import time.
    """

    # Paths are relative to the working directory the app is started from.
    tweets_path: str = "data/tweets_all_combined.csv"
    tfidf_matrix_path: str = "data/tfidf_matrix.pckl"
    tfidf_vectorizer_path: str = "data/tfidf_vectorizer.pckl"

    def __init__(self) -> None:
        """Read the tweet CSV and unpickle the TF-IDF matrix and vectorizer.

        Raises:
            FileNotFoundError: if any of the data files is missing.
        """
        print("Loading tweets from dataframe")
        self.tweets = pd.read_csv(self.tweets_path)

        print("Loading tfidf from file")
        # NOTE(review): pickle.load executes arbitrary code on load — only
        # ever load files produced by this project, never untrusted input.
        with open(self.tfidf_matrix_path, "rb") as f:
            self.tfidf_matrix = pickle.load(f)

        with open(self.tfidf_vectorizer_path, "rb") as f:
            self.tfidf_vectorizer: TfidfVectorizer = pickle.load(f)


# By importing this module, the data is loaded exactly once at program start.
tweet_store = TweetStore()
|  | @ -1,15 +1,12 @@ | |||
| from shiny import module, ui, render, Inputs, Outputs, Session | ||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||
| from sklearn.metrics.pairwise import cosine_similarity | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import pickle | ||||
| import re | ||||
| 
 | ||||
| tfidf_matrix_path = "data/tfidf_matrix.pckl" | ||||
| tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl" | ||||
| from src.data_loader import tweet_store | ||||
| 
 | ||||
| relevance_score_path = "data/tweet_relevance.json" | ||||
| tweets_path = "data/tweets_all_combined.csv" | ||||
| 
 | ||||
| reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>' | ||||
| retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>' | ||||
|  | @ -31,31 +28,21 @@ def replace_hastag(match): | |||
|     return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>' | ||||
| 
 | ||||
| 
 | ||||
| print("Loading data from storage") | ||||
| tweets = pd.read_csv(tweets_path) | ||||
| relevance_score = pd.read_csv(relevance_score_path) | ||||
| 
 | ||||
| tfidf_matrix = None | ||||
| with open(tfidf_matrix_path, "rb") as f: | ||||
|     tfidf_matrix = pickle.load(f) | ||||
| 
 | ||||
| tfidf_vectorizer: TfidfVectorizer = None | ||||
| with open(tfidf_vectorizer_path, "rb") as f: | ||||
|     tfidf_vectorizer = pickle.load(f) | ||||
| 
 | ||||
| 
 | ||||
| tweets["relevance_score"] = relevance_score["relevance_score"] | ||||
| tweets = tweets.drop(["user_id", "measured_at"], axis=1) | ||||
| tweet_store.tweets["relevance_score"] = relevance_score["relevance_score"] | ||||
| tweet_store.tweets = tweet_store.tweets.drop(["user_id", "measured_at"], axis=1) | ||||
| 
 | ||||
| 
 | ||||
| def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int): | ||||
|     query_vec = tfidf_vectorizer.transform([query]) | ||||
|     similarity = cosine_similarity(query_vec, tfidf_matrix).flatten() | ||||
|     query_vec = tweet_store.tfidf_vectorizer.transform([query]) | ||||
|     similarity = cosine_similarity(query_vec, tweet_store.tfidf_matrix).flatten() | ||||
| 
 | ||||
|     filtered = np.where(similarity != 0)[0] | ||||
|     indices = np.argsort(-similarity[filtered]) | ||||
|     correct_indices = filtered[indices] | ||||
|     result = tweets.iloc[correct_indices] | ||||
|     result = tweet_store.tweets.iloc[correct_indices] | ||||
| 
 | ||||
|     if not len(result): | ||||
|         return None, 0 | ||||
|  | @ -81,10 +68,13 @@ def searchable_ui(): | |||
|         ui.h2("Tweet Suchmaschine"), | ||||
|         ui.HTML("<hr>"), | ||||
|         ui.row( | ||||
|             ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), | ||||
|             ui.column(6, ui.input_text("search_input", "Suche", | ||||
|                       placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), | ||||
|             ui.column(3, | ||||
|                       ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), | ||||
|                       ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), | ||||
|                       ui.input_select("sorting_method", "Sortierung", { | ||||
|                                       "score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), | ||||
|                       ui.input_select("tweet_count", "Ergebnisse", { | ||||
|                                       "5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), | ||||
|                       style="display: flex; flex-direction: column; align-items: center; justify-content: center;"), | ||||
|             style="justify-content:space-between;" | ||||
| 
 | ||||
|  | @ -138,7 +128,8 @@ def searchable_server(input: Inputs, output: Outputs, session: Session): | |||
|                     ui.row( | ||||
|                         ui.column(6, ui.HTML( | ||||
|                             f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "), | ||||
|                         ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"), | ||||
|                         ui.column(6, ui.p(f"{row['created_at']}"), | ||||
|                                   style=style + "padding-top: 1.5em;"), | ||||
|                     ), | ||||
|                     ui.row( | ||||
|                         ui.column(12, ui.HTML("<hr>"), | ||||
|  |  | |||
							
								
								
									
										37
									
								
								src/mod_topics.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								src/mod_topics.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | |||
| from shiny import module, ui, render, Inputs, Outputs, Session | ||||
| 
 | ||||
| from sklearn.decomposition import NMF | ||||
| from src.data_loader import tweet_store | ||||
| 
 | ||||
# Number of NMF components, i.e. the number of topics extracted.
classes = 10

# Fit the NMF topic model on the pre-computed TF-IDF matrix.
# NOTE(review): this runs at import time and can be slow on large corpora;
# see the TODO below about pre-computing per day.
nmf = NMF(
    n_components=classes,
    random_state=42,        # fixed seed so the extracted topics are reproducible
    init=None,              # let scikit-learn choose the default initialisation
    beta_loss="frobenius",
    alpha_W=0.0,            # no regularisation on either factor
    alpha_H="same",
    l1_ratio=0.0,
).fit(tweet_store.tfidf_matrix)


# TODO: dont do this live -> load the feature_names and values from a
# pre-calculated list for each day
tfidf_feature_names = tweet_store.tfidf_vectorizer.get_feature_names_out()
| 
 | ||||
| 
 | ||||
@ module.ui
def topics_ui():
    """Build the static UI of the topics page: currently just a heading."""
    heading = ui.h2("Tweet Topics")
    return ui.div(heading)
| 
 | ||||
| 
 | ||||
@ module.server
def topics_server(input: Inputs, output: Outputs, session: Session):
    # Server half of the topics Shiny module; still a stub (WIP commit).
    # NOTE(review): the output name below looks copy-pasted from
    # mod_searchable — topics_ui() has no matching output placeholder yet.
    # Confirm/rename when the topics page is actually wired up.
    @ output
    @ render.ui
    def searchable_tweet_ui():
        pass
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue