Compare commits

..

3 commits

6 changed files with 484889 additions and 6 deletions

3
.gitignore vendored
View file

@ -160,3 +160,6 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
data/tfidf_matrix.pckl
data/tfidf_vectorizer.pckl

10
app.py
View file

@ -2,7 +2,7 @@ from pathlib import Path
from typing import List from typing import List
from shiny import App, ui, Inputs, Outputs, Session from shiny import App, ui, Inputs, Outputs, Session
from shiny.types import NavSetArg from shiny.types import NavSetArg
from src import mod_welcome from src import mod_welcome, mod_searchable
from src.util import load_html_str_from_file from src.util import load_html_str_from_file
import os import os
@ -13,8 +13,9 @@ footer_html: str = load_html_str_from_file(os.path.join("www", "footer.html"))
def nav_controls() -> List[NavSetArg]: def nav_controls() -> List[NavSetArg]:
return [ return [
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("Intro"), value="intro"), ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
ui.nav(ui.h5("Datensatz Analyse"), "Datensatz Analyse"), ui.nav(ui.h5("Analyse"), "Analyse"),
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"),
ui.nav_control( ui.nav_control(
ui.a( ui.a(
ui.h5("AG-Link"), ui.h5("AG-Link"),
@ -57,7 +58,8 @@ app_ui = ui.page_navbar(
def server(input: Inputs, output: Outputs, session: Session): def server(input: Inputs, output: Outputs, session: Session):
mod_welcome.welcome_server("Intro") mod_welcome.welcome_server("intro")
mod_searchable.searchable_server("search_engine")
static_dir = Path(__file__).parent / "www" static_dir = Path(__file__).parent / "www"

151691
data/tweet_relevance.json Normal file

File diff suppressed because it is too large Load diff

333095
data/tweets_all_combined.csv Normal file

File diff suppressed because it is too large Load diff

93
src/mod_searchable.py Normal file
View file

@ -0,0 +1,93 @@
import html
import pickle
from typing import Optional

import numpy as np
import pandas as pd
from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Locations of the pickled TF-IDF artefacts and the tweet data sets. The paths
# are relative, so they resolve against the process working directory.
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"

# Everything below runs once, when this module is imported.
print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
# NOTE(review): this file has a .json extension but is parsed with read_csv --
# confirm the on-disk format really is CSV.
relevance_score = pd.read_csv(relevance_score_path)

# NOTE(review): pickle.load can execute arbitrary code from the file; these
# artefacts must only ever come from a trusted source.
with open(tfidf_matrix_path, "rb") as f:
    tfidf_matrix = pickle.load(f)

with open(tfidf_vectorizer_path, "rb") as f:
    tfidf_vectorizer: TfidfVectorizer = pickle.load(f)

# Attach the relevance score to each tweet (aligned on the frame index) and
# discard the identifier/bookkeeping columns.
tweets = tweets.assign(
    relevance_score=relevance_score["relevance_score"]
).drop(columns=["user_id", "measured_at", "tweet_id"])
def search_query(query: str, limit: int = 5) -> Optional[pd.DataFrame]:
    """Return the tweets that best match *query*.

    The query is vectorised with the module-level ``tfidf_vectorizer`` and
    compared against every row of ``tfidf_matrix`` by cosine similarity.
    Rows with zero similarity are discarded; the remaining tweets are ranked
    by ``relevance_score * similarity`` in descending order.

    Args:
        query: Free-text search term.
        limit: Maximum number of rows to return.

    Returns:
        Up to ``limit`` rows of ``tweets``, best match first, or ``None`` when
        no tweet has a non-zero similarity to the query.
    """
    query_vec = tfidf_vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Positions of all tweets sharing at least one weighted term with the query.
    filtered = np.where(similarity != 0)[0]
    if filtered.size == 0:
        return None

    # Pre-order the matches by similarity; the final ranking below re-sorts
    # them by the combined score.
    correct_indices = filtered[np.argsort(-similarity[filtered])]
    # NOTE(review): assumes row i of tfidf_matrix corresponds to row i of
    # `tweets` -- confirm against the code that builds the pickles.
    result = tweets.iloc[correct_indices]

    # Series * ndarray multiplies positionally; both follow `correct_indices`.
    overall = result["relevance_score"] * similarity[correct_indices]
    ranking = overall.sort_values(ascending=False).index
    return result.loc[ranking].head(limit)
@module.ui
def searchable_ui():
    """Build the search tab: heading, query text box and results placeholder."""
    heading = ui.h2("Tweet Suchmaschine")
    search_box = ui.input_text("search_input", "Suche:", placeholder="Gebe Suchterm ein")
    spacer = ui.HTML("<br>")
    # Filled in by the server-side output named "searchable_tweet_ui".
    results = ui.output_ui(id="searchable_tweet_ui")
    return ui.div(heading, search_box, spacer, results)
def _tweet_card(row, style: str):
    """Render one search-result row as a bordered card.

    Args:
        row: A row of the search result; read by column name.
        style: CSS applied to every column of the card.

    Returns:
        A ``ui.div`` holding the author line, the tweet text and the counters.
    """
    # Tweet text is external content. Decode any entities it already carries,
    # then escape, so markup inside a tweet is displayed instead of being
    # interpreted by the browser.
    safe_text = html.escape(html.unescape(str(row["tweet_text"])))
    # "\\n" is a literal backslash followed by "n" in the stored text; it is
    # turned into an HTML line break after escaping so the <br> survives.
    body_html = safe_text.replace("\\n", "<br>")

    return ui.div(
        ui.row(
            ui.column(9, ui.markdown(f"**{row['user_name']}***@{row['handle']}*"), style=style),
            ui.column(3, ui.p(f"{row['created_at']}"), style=style),
        ),
        ui.row(
            ui.column(12, ui.HTML(body_html), style=style + "font-size: 20px; padding:1em;"),
        ),
        ui.row(
            ui.column(3, ui.p(f"👍 {row['like_count']}"), style=style),
            ui.column(3, ui.p(f"{row['retweet_count']}"), style=style),
            ui.column(3, ui.p(f"{row['reply_count']}"), style=style),
            ui.column(3, ui.p(f"💬 {row['quote_count']}"), style=style),
        ),
        style="border: 1px solid #954; margin-bottom: 1em;",
    )


@module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
    """Register the output that renders search results for the current query."""

    @output
    @render.ui
    def searchable_tweet_ui():
        """Render up to 15 matching tweets, or a notice when nothing matches."""
        query = input.search_input()
        result_pd = search_query(query, 15)
        if result_pd is None:
            return ui.div(
                ui.h5("Keine Ergebnisse gefunden!")
            )

        style = "text-align: center; padding-top: 0.5em;"
        tweet_ui = ui.page_fluid()
        # Row-wise iteration is intentional: every row becomes its own card.
        for _, row in result_pd.iterrows():
            tweet_ui.append(_tweet_card(row, style))
        return tweet_ui

View file

@ -18,7 +18,6 @@ def welcome_ui():
[0]: https://ag-link.xyz [0]: https://ag-link.xyz
"""), """),
# ui.output_text("dataset_infos"),
ui.output_ui("dataset_infos"), ui.output_ui("dataset_infos"),
ui.h3("Ursprung der Idee"), ui.h3("Ursprung der Idee"),
ui.markdown(""" ui.markdown("""
@ -52,7 +51,7 @@ with open("data/general_analysis_results.json", "r") as f:
@ module.server @ module.server
def welcome_server(input, output, session, starting_value=0): def welcome_server(input, output, session):
@output @output
@render.ui @render.ui
def dataset_infos(): def dataset_infos():