"""Shiny module providing a small TF-IDF based search over stored tweets.

At import time the module loads the tweet table, the relevance scores and
the pickled TF-IDF matrix/vectorizer from ``data/``. It exposes
``search_query`` plus the Shiny module pair ``searchable_ui`` /
``searchable_server``.
"""
from typing import Optional

from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle

tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
# NOTE(review): the path ends in ".json" but is read with ``pd.read_csv``
# below -- confirm the actual file format.
relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"

# NOTE(review): these icon snippets are empty strings in this file; the
# inline SVG markup may have been lost -- confirm against the original.
reply_html_svg = ''
retweet_html_svg = ''
like_html_svg = ''

# Number of tweets requested from ``search_query`` by the server function.
MAX_RESULTS = 15

print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
relevance_score = pd.read_csv(relevance_score_path)

# SECURITY: ``pickle.load`` executes arbitrary code contained in the file.
# Only load these artifacts from a trusted location.
with open(tfidf_matrix_path, "rb") as f:
    tfidf_matrix = pickle.load(f)

with open(tfidf_vectorizer_path, "rb") as f:
    tfidf_vectorizer: TfidfVectorizer = pickle.load(f)

tweets["relevance_score"] = relevance_score["relevance_score"]
tweets = tweets.drop(["user_id", "measured_at", "tweet_id"], axis=1)


def search_query(query: str, limit: int = 5) -> Optional[pd.DataFrame]:
    """Return the best matching tweets for *query*.

    The query is vectorized with the loaded TF-IDF vectorizer and compared
    against ``tfidf_matrix`` by cosine similarity. Tweets with a similarity
    of zero are discarded; the rest are ranked by
    ``relevance_score * similarity`` in descending order.

    Args:
        query: Free-text search term.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with at most ``limit`` rows taken from ``tweets``, or
        ``None`` when no tweet has a non-zero similarity to the query.
    """
    query_vec = tfidf_vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    matches = np.flatnonzero(similarity)
    if matches.size == 0:
        return None

    # Pre-order by similarity so that, with the stable sort below, tweets
    # with an equal overall score keep the more similar one first.
    by_similarity = matches[np.argsort(-similarity[matches], kind="stable")]
    candidates = tweets.iloc[by_similarity]

    overall = candidates["relevance_score"].to_numpy() * similarity[by_similarity]
    # Rank positionally instead of via index labels, so the result does not
    # depend on ``tweets`` having unique index labels.
    ranking = np.argsort(-overall, kind="stable")[:limit]
    return candidates.iloc[ranking]


@module.ui
def searchable_ui():
    """Build the search UI: heading, text input and the results output."""
    return ui.div(
        ui.h2("Tweet Suchmaschine"),
        ui.input_text("search_input", "Suche:",
                      placeholder="Gebe Suchterm ein", value="Leipzig"),
        # NOTE(review): this literal is only a newline, which HTML collapses;
        # a line-break tag may have been intended -- confirm.
        ui.HTML("\n"),
        ui.output_ui(id="searchable_tweet_ui"),
    )


def _tweet_card(row, style):
    """Render one tweet row (header, text, engagement counts) as a bordered div."""
    return ui.div(
        ui.row(
            ui.column(9, ui.markdown(
                f"**{row['user_name']}** *@{row['handle']}*"), style=style),
            ui.column(3, ui.p(f"{row['created_at']}"), style=style),
        ),
        ui.row(
            # SECURITY: the tweet text is emitted as raw HTML without
            # escaping. If the stored text is not already sanitized this
            # allows markup injection -- review before exposing publicly.
            # NOTE(review): the replacement string is only a newline here;
            # a line-break tag may have been intended -- confirm.
            ui.column(12, ui.HTML(str(row["tweet_text"]).replace(
                "\\n", "\n")), style=style + "font-size: 20px; padding:1em;"),
        ),
        ui.row(
            ui.column(3, ui.HTML(reply_html_svg), ui.p(
                f"{row['reply_count']}"), style=style),
            ui.column(3, ui.HTML(retweet_html_svg), ui.p(
                f"{row['retweet_count']}"), style=style),
            ui.column(3, ui.HTML(like_html_svg), ui.p(
                f"{row['like_count']}"), style=style),
            # quote_count: indicates approximately how many times this
            # Tweet has been quoted by Twitter users.
            # TODO: use a nice svg for quote_count
            ui.column(3, ui.p(f"Quote Count: {row['quote_count']}"), style=style),
        ),
        style="border: 1px solid #954; margin-bottom: 1em;")


@module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
    """Register the reactive output that renders search results."""

    @output
    @render.ui
    def searchable_tweet_ui():
        query = input.search_input()
        result_pd = search_query(query, MAX_RESULTS)

        if result_pd is None:
            return ui.div(
                ui.h5("Keine Ergebnisse gefunden!")
            )

        style = "text-align: center; padding-top: 0.5em;"
        # NOTE(review): ``page_fluid`` is a page-level container; a plain
        # div or TagList may be more appropriate inside a render function.
        # Kept as-is to preserve the rendered output.
        tweet_ui = ui.page_fluid()

        # Row-wise iteration is slow in pandas, but each row becomes its own
        # UI element and there are at most MAX_RESULTS rows.
        for _, row in result_pd.iterrows():
            tweet_ui.append(_tweet_card(row, style))

        return tweet_ui