import html
import pickle

import numpy as np
import pandas as pd
from shiny import Inputs, Outputs, Session, module, render, ui
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# --- Storage locations of the precomputed search artifacts -------------------
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
# NOTE(review): the path ends in ".json" but is read with pd.read_csv below --
# confirm the file really is CSV-formatted.
relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"

# --- Inline icon markup passed to ui.HTML() in the tweet cards ---------------
# NOTE(review): all three are empty strings here, so no icon is rendered --
# confirm that SVG markup was not lost.
reply_html_svg = ''
retweet_html_svg = ''
like_html_svg = ''

# --- Load everything once at import time -------------------------------------
print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
relevance_score = pd.read_csv(relevance_score_path)

# pickle.load can execute arbitrary code: only point these paths at trusted files.
with open(tfidf_matrix_path, "rb") as f:
    tfidf_matrix = pickle.load(f)

with open(tfidf_vectorizer_path, "rb") as f:
    tfidf_vectorizer: TfidfVectorizer = pickle.load(f)

# Column assignment aligns on the row index.
# Presumably both files list the tweets in the same order -- verify.
tweets["relevance_score"] = relevance_score["relevance_score"]

# Drop the identifier/bookkeeping columns before the frame is used for search.
tweets = tweets.drop(columns=["user_id", "measured_at", "tweet_id"])
def search_query(query: str, limit: int = 5) -> pd.DataFrame:
    """Return the best-matching tweets for a free-text query.

    The query is vectorised with the module-level ``tfidf_vectorizer`` and
    compared against every row of ``tfidf_matrix`` by cosine similarity.
    Rows with a similarity of exactly zero are discarded; the remaining rows
    are ranked by ``relevance_score * similarity`` in descending order.

    Args:
        query: Search text entered by the user.
        limit: Maximum number of rows to return.

    Returns:
        Up to ``limit`` rows of ``tweets``, best match first, or ``None``
        (not an empty frame) when no tweet has a non-zero similarity.
    """
    scores = cosine_similarity(
        tfidf_vectorizer.transform([query]), tfidf_matrix
    ).flatten()

    # Positions of all tweets sharing at least some weight with the query.
    matching = np.flatnonzero(scores != 0)
    if matching.size == 0:
        return None

    # Pre-order the matches by raw similarity, highest first.
    by_similarity = matching[np.argsort(-scores[matching])]
    hits = tweets.iloc[by_similarity]

    # Final ranking combines the stored relevance with the query similarity.
    weighted = hits["relevance_score"] * scores[by_similarity]
    ranking = weighted.sort_values(ascending=False).index
    return hits.loc[ranking].head(limit)
@module.ui
def searchable_ui():
    """Build the search module's UI.

    Returns:
        A ``div`` holding the heading, the ``search_input`` text box
        (pre-filled with "Leipzig") and the ``searchable_tweet_ui`` output
        placeholder that the server function fills with results.
    """
    return ui.div(
        ui.h2("Tweet Suchmaschine"),
        ui.input_text(
            "search_input",
            "Suche:",
            placeholder="Gebe Suchterm ein",
            value="Leipzig",
        ),
        # NOTE(review): this literal was split across two physical lines in
        # the reviewed source (a SyntaxError). Restored as an HTML line break
        # on the assumption that the tag was lost -- confirm.
        ui.HTML("<br>"),
        ui.output_ui(id="searchable_tweet_ui"),
    )
def _tweet_text_html(text) -> str:
    """Return tweet text as safe HTML, with literal ``\\n`` sequences as ``<br>``."""
    # Tweet text is third-party content, so it must not reach ui.HTML() as raw
    # markup. Unescape first so entities already present in the text (such as
    # "&amp;") are not escaped a second time.
    safe = html.escape(html.unescape(str(text)), quote=False)
    # NOTE(review): the replacement literal was split across two physical lines
    # in the reviewed source (a SyntaxError). Restored as "<br>" on the
    # assumption that the tag was lost -- confirm.
    return safe.replace("\\n", "<br>")


def _tweet_card(row, style: str):
    """Build the bordered card (author/date, text, counters) for one result row."""
    return ui.div(
        ui.row(
            ui.column(
                9,
                ui.markdown(f"**{row['user_name']}** *@{row['handle']}*"),
                style=style,
            ),
            ui.column(3, ui.p(f"{row['created_at']}"), style=style),
        ),
        ui.row(
            ui.column(
                12,
                ui.HTML(_tweet_text_html(row["tweet_text"])),
                style=style + "font-size: 20px; padding:1em;",
            ),
        ),
        ui.row(
            ui.column(3, ui.HTML(reply_html_svg), ui.p(f"{row['reply_count']}"), style=style),
            ui.column(3, ui.HTML(retweet_html_svg), ui.p(f"{row['retweet_count']}"), style=style),
            ui.column(3, ui.HTML(like_html_svg), ui.p(f"{row['like_count']}"), style=style),
            # quote_count: approximately how many times the tweet has been
            # quoted by Twitter users.
            # TODO: use a nice svg for quote_count
            ui.column(3, ui.p(f"Quote Count: {row['quote_count']}"), style=style),
        ),
        style="border: 1px solid #954; margin-bottom: 1em;",
    )


@module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
    """Server logic of the search module.

    Registers the ``searchable_tweet_ui`` output, which re-runs whenever
    ``search_input`` changes and renders up to 15 result cards, or a
    "no results" message when ``search_query`` returns ``None``.
    """

    @output
    @render.ui
    def searchable_tweet_ui():
        result_pd = search_query(input.search_input(), 15)
        if result_pd is None:
            return ui.div(ui.h5("Keine Ergebnisse gefunden!"))

        style = "text-align: center; padding-top: 0.5em;"
        # NOTE(review): page_fluid() is a page-level container; kept as-is so
        # the rendered layout does not change -- a plain div may fit better.
        tweet_ui = ui.page_fluid()
        # Row-wise iteration is slow in pandas, but each row becomes its own
        # card and there are at most 15 rows.
        for _, row in result_pd.iterrows():
            tweet_ui.append(_tweet_card(row, style))
        return tweet_ui