WIP: adds site for displaying topics
This commit is contained in:
		
							parent
							
								
									a7453d11d5
								
							
						
					
					
						commit
						7a8e01f9d1
					
				
					 4 changed files with 87 additions and 26 deletions
				
			
		
							
								
								
									
										11
									
								
								app.py
									
										
									
									
									
								
							
							
						
						
									
										11
									
								
								app.py
									
										
									
									
									
								
							|  | @ -2,9 +2,13 @@ from pathlib import Path | ||||||
| from typing import List | from typing import List | ||||||
| from shiny import App, ui, Inputs, Outputs, Session | from shiny import App, ui, Inputs, Outputs, Session | ||||||
| from shiny.types import NavSetArg | from shiny.types import NavSetArg | ||||||
| from src import mod_welcome, mod_searchable | from src import mod_welcome, mod_searchable, mod_topics | ||||||
| from src.util import load_html_str_from_file | from src.util import load_html_str_from_file | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | # by importing this module, the tweets are loaded into the tweet_store variable at program start | ||||||
|  | import src.data_loader | ||||||
|  | 
 | ||||||
| import os | import os | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -15,7 +19,9 @@ def nav_controls() -> List[NavSetArg]: | ||||||
|     return [ |     return [ | ||||||
|         ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"), |         ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"), | ||||||
|         ui.nav(ui.h5("Analyse"), "Analyse"), |         ui.nav(ui.h5("Analyse"), "Analyse"), | ||||||
|         ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"), |         ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui( | ||||||
|  |             "search_engine"), value="search_engine"), | ||||||
|  |         ui.nav(ui.h5("Topics"), mod_topics.topics_ui("topics"), value="topics"), | ||||||
|         ui.nav_control( |         ui.nav_control( | ||||||
|             ui.a( |             ui.a( | ||||||
|                 ui.h5("AG-Link"), |                 ui.h5("AG-Link"), | ||||||
|  | @ -60,6 +66,7 @@ app_ui = ui.page_navbar( | ||||||
| def server(input: Inputs, output: Outputs, session: Session): | def server(input: Inputs, output: Outputs, session: Session): | ||||||
|     mod_welcome.welcome_server("intro") |     mod_welcome.welcome_server("intro") | ||||||
|     mod_searchable.searchable_server("search_engine") |     mod_searchable.searchable_server("search_engine") | ||||||
|  |     mod_topics.topics_server("topics") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| static_dir = Path(__file__).parent / "www" | static_dir = Path(__file__).parent / "www" | ||||||
|  |  | ||||||
							
								
								
									
										26
									
								
								src/data_loader.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								src/data_loader.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,26 @@ | ||||||
|  | import pandas as pd | ||||||
|  | from sklearn.feature_extraction.text import TfidfVectorizer | ||||||
|  | import pickle | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TweetStore(): | ||||||
|  | 
 | ||||||
|  |     tweets_path: str = "data/tweets_all_combined.csv" | ||||||
|  |     tfidf_matrix_path = "data/tfidf_matrix.pckl" | ||||||
|  |     tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl" | ||||||
|  | 
 | ||||||
|  |     def __init__(self): | ||||||
|  |         print("Loading tweets from dataframe") | ||||||
|  |         self.tweets = pd.read_csv(self.tweets_path) | ||||||
|  | 
 | ||||||
|  |         print("Loading tfidf from file") | ||||||
|  |         self.tfidf_matrix = None | ||||||
|  |         with open(self.tfidf_matrix_path, "rb") as f: | ||||||
|  |             self.tfidf_matrix = pickle.load(f) | ||||||
|  | 
 | ||||||
|  |         self.tfidf_vectorizer: TfidfVectorizer = None | ||||||
|  |         with open(self.tfidf_vectorizer_path, "rb") as f: | ||||||
|  |             self.tfidf_vectorizer = pickle.load(f) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | tweet_store = TweetStore() | ||||||
|  | @ -1,15 +1,12 @@ | ||||||
| from shiny import module, ui, render, Inputs, Outputs, Session | from shiny import module, ui, render, Inputs, Outputs, Session | ||||||
| from sklearn.feature_extraction.text import TfidfVectorizer |  | ||||||
| from sklearn.metrics.pairwise import cosine_similarity | from sklearn.metrics.pairwise import cosine_similarity | ||||||
| import pandas as pd | import pandas as pd | ||||||
| import numpy as np | import numpy as np | ||||||
| import pickle |  | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| tfidf_matrix_path = "data/tfidf_matrix.pckl" | from src.data_loader import tweet_store | ||||||
| tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl" | 
 | ||||||
| relevance_score_path = "data/tweet_relevance.json" | relevance_score_path = "data/tweet_relevance.json" | ||||||
| tweets_path = "data/tweets_all_combined.csv" |  | ||||||
| 
 | 
 | ||||||
| reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>' | reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>' | ||||||
| retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>' | retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>' | ||||||
|  | @ -31,31 +28,21 @@ def replace_hastag(match): | ||||||
|     return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>' |     return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| print("Loading data from storage") |  | ||||||
| tweets = pd.read_csv(tweets_path) |  | ||||||
| relevance_score = pd.read_csv(relevance_score_path) | relevance_score = pd.read_csv(relevance_score_path) | ||||||
| 
 | 
 | ||||||
| tfidf_matrix = None |  | ||||||
| with open(tfidf_matrix_path, "rb") as f: |  | ||||||
|     tfidf_matrix = pickle.load(f) |  | ||||||
| 
 | 
 | ||||||
| tfidf_vectorizer: TfidfVectorizer = None | tweet_store.tweets["relevance_score"] = relevance_score["relevance_score"] | ||||||
| with open(tfidf_vectorizer_path, "rb") as f: | tweet_store.tweets = tweet_store.tweets.drop(["user_id", "measured_at"], axis=1) | ||||||
|     tfidf_vectorizer = pickle.load(f) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| tweets["relevance_score"] = relevance_score["relevance_score"] |  | ||||||
| tweets = tweets.drop(["user_id", "measured_at"], axis=1) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int): | def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int): | ||||||
|     query_vec = tfidf_vectorizer.transform([query]) |     query_vec = tweet_store.tfidf_vectorizer.transform([query]) | ||||||
|     similarity = cosine_similarity(query_vec, tfidf_matrix).flatten() |     similarity = cosine_similarity(query_vec, tweet_store.tfidf_matrix).flatten() | ||||||
| 
 | 
 | ||||||
|     filtered = np.where(similarity != 0)[0] |     filtered = np.where(similarity != 0)[0] | ||||||
|     indices = np.argsort(-similarity[filtered]) |     indices = np.argsort(-similarity[filtered]) | ||||||
|     correct_indices = filtered[indices] |     correct_indices = filtered[indices] | ||||||
|     result = tweets.iloc[correct_indices] |     result = tweet_store.tweets.iloc[correct_indices] | ||||||
| 
 | 
 | ||||||
|     if not len(result): |     if not len(result): | ||||||
|         return None, 0 |         return None, 0 | ||||||
|  | @ -81,10 +68,13 @@ def searchable_ui(): | ||||||
|         ui.h2("Tweet Suchmaschine"), |         ui.h2("Tweet Suchmaschine"), | ||||||
|         ui.HTML("<hr>"), |         ui.HTML("<hr>"), | ||||||
|         ui.row( |         ui.row( | ||||||
|             ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), |             ui.column(6, ui.input_text("search_input", "Suche", | ||||||
|  |                       placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), | ||||||
|             ui.column(3, |             ui.column(3, | ||||||
|                       ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), |                       ui.input_select("sorting_method", "Sortierung", { | ||||||
|                       ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), |                                       "score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), | ||||||
|  |                       ui.input_select("tweet_count", "Ergebnisse", { | ||||||
|  |                                       "5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), | ||||||
|                       style="display: flex; flex-direction: column; align-items: center; justify-content: center;"), |                       style="display: flex; flex-direction: column; align-items: center; justify-content: center;"), | ||||||
|             style="justify-content:space-between;" |             style="justify-content:space-between;" | ||||||
| 
 | 
 | ||||||
|  | @ -138,7 +128,8 @@ def searchable_server(input: Inputs, output: Outputs, session: Session): | ||||||
|                     ui.row( |                     ui.row( | ||||||
|                         ui.column(6, ui.HTML( |                         ui.column(6, ui.HTML( | ||||||
|                             f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "), |                             f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "), | ||||||
|                         ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"), |                         ui.column(6, ui.p(f"{row['created_at']}"), | ||||||
|  |                                   style=style + "padding-top: 1.5em;"), | ||||||
|                     ), |                     ), | ||||||
|                     ui.row( |                     ui.row( | ||||||
|                         ui.column(12, ui.HTML("<hr>"), |                         ui.column(12, ui.HTML("<hr>"), | ||||||
|  |  | ||||||
							
								
								
									
										37
									
								
								src/mod_topics.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								src/mod_topics.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | ||||||
|  | from shiny import module, ui, render, Inputs, Outputs, Session | ||||||
|  | 
 | ||||||
|  | from sklearn.decomposition import NMF | ||||||
|  | from src.data_loader import tweet_store | ||||||
|  | 
 | ||||||
|  | classes = 10 | ||||||
|  | 
 | ||||||
|  | # Fit the NMF model | ||||||
|  | nmf = NMF( | ||||||
|  |     n_components=classes, | ||||||
|  |     random_state=42, | ||||||
|  |     init=None, | ||||||
|  |     beta_loss="frobenius", | ||||||
|  |     alpha_W=0.0, | ||||||
|  |     alpha_H="same", | ||||||
|  |     l1_ratio=0.0, | ||||||
|  | ).fit(tweet_store.tfidf_matrix) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # TODO: don't do this live -> load the feature_names and values from a pre-calculated list for each day | ||||||
|  | tfidf_feature_names = tweet_store.tfidf_vectorizer.get_feature_names_out() | ||||||
|  | print(tfidf_feature_names) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @ module.ui | ||||||
|  | def topics_ui(): | ||||||
|  |     return ui.div( | ||||||
|  |         ui.h2("Tweet Topics"), | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @ module.server | ||||||
|  | def topics_server(input: Inputs, output: Outputs, session: Session): | ||||||
|  |     @ output | ||||||
|  |     @ render.ui | ||||||
|  |     def searchable_tweet_ui(): | ||||||
|  |         pass | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue