WIP: adds page for displaying topics

procrastimax 2023-07-26 20:56:27 +02:00
parent a7453d11d5
commit 7a8e01f9d1
4 changed files with 87 additions and 26 deletions

app.py

@@ -2,9 +2,13 @@ from pathlib import Path
from typing import List
from shiny import App, ui, Inputs, Outputs, Session
from shiny.types import NavSetArg
from src import mod_welcome, mod_searchable
from src import mod_welcome, mod_searchable, mod_topics
from src.util import load_html_str_from_file
# by importing this module, the tweets are loaded into the tweet_store variable at program start
import src.data_loader
import os
@@ -15,7 +19,9 @@ def nav_controls() -> List[NavSetArg]:
return [
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
ui.nav(ui.h5("Analyse"), "Analyse"),
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"),
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui(
"search_engine"), value="search_engine"),
ui.nav(ui.h5("Topics"), mod_topics.topics_ui("topics"), value="topics"),
ui.nav_control(
ui.a(
ui.h5("AG-Link"),
@@ -60,6 +66,7 @@ app_ui = ui.page_navbar(
def server(input: Inputs, output: Outputs, session: Session):
mod_welcome.welcome_server("intro")
mod_searchable.searchable_server("search_engine")
mod_topics.topics_server("topics")
static_dir = Path(__file__).parent / "www"

src/data_loader.py (new file)

@@ -0,0 +1,26 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
class TweetStore():
tweets_path: str = "data/tweets_all_combined.csv"
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
def __init__(self):
print("Loading tweets from dataframe")
self.tweets = pd.read_csv(self.tweets_path)
print("Loading tfidf from file")
self.tfidf_matrix = None
with open(self.tfidf_matrix_path, "rb") as f:
self.tfidf_matrix = pickle.load(f)
self.tfidf_vectorizer: TfidfVectorizer = None
with open(self.tfidf_vectorizer_path, "rb") as f:
self.tfidf_vectorizer = pickle.load(f)
tweet_store = TweetStore()
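
The two pickle files loaded above must already exist under data/; this commit does not create them. A minimal offline preprocessing sketch that would produce matching artifacts (hypothetical script, not part of the repository; the CSV column name "tweet" is an assumption):

import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical offline step that writes the files TweetStore expects.
tweets = pd.read_csv("data/tweets_all_combined.csv")
vectorizer = TfidfVectorizer()
# The column name "tweet" is an assumption; adjust to the actual CSV schema.
tfidf_matrix = vectorizer.fit_transform(tweets["tweet"].astype(str))

with open("data/tfidf_matrix.pckl", "wb") as f:
    pickle.dump(tfidf_matrix, f)
with open("data/tfidf_vectorizer.pckl", "wb") as f:
    pickle.dump(vectorizer, f)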

src/mod_searchable.py

@@ -1,15 +1,12 @@
from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle
import re
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
from src.data_loader import tweet_store
relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
@@ -31,31 +28,21 @@ def replace_hastag(match):
return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>'
print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
relevance_score = pd.read_csv(relevance_score_path)
tfidf_matrix = None
with open(tfidf_matrix_path, "rb") as f:
tfidf_matrix = pickle.load(f)
tfidf_vectorizer: TfidfVectorizer = None
with open(tfidf_vectorizer_path, "rb") as f:
tfidf_vectorizer = pickle.load(f)
tweets["relevance_score"] = relevance_score["relevance_score"]
tweets = tweets.drop(["user_id", "measured_at"], axis=1)
tweet_store.tweets["relevance_score"] = relevance_score["relevance_score"]
tweet_store.tweets = tweet_store.tweets.drop(["user_id", "measured_at"], axis=1)
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
query_vec = tfidf_vectorizer.transform([query])
similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
query_vec = tweet_store.tfidf_vectorizer.transform([query])
similarity = cosine_similarity(query_vec, tweet_store.tfidf_matrix).flatten()
filtered = np.where(similarity != 0)[0]
indices = np.argsort(-similarity[filtered])
correct_indices = filtered[indices]
result = tweets.iloc[correct_indices]
result = tweet_store.tweets.iloc[correct_indices]
if not len(result):
return None, 0
@@ -81,10 +68,13 @@ def searchable_ui():
ui.h2("Tweet Suchmaschine"),
ui.HTML("<hr>"),
ui.row(
ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
ui.column(6, ui.input_text("search_input", "Suche",
placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
ui.column(3,
ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
ui.input_select("sorting_method", "Sortierung", {
"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
ui.input_select("tweet_count", "Ergebnisse", {
"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
style="justify-content:space-between;"
@@ -138,7 +128,8 @@ def searchable_server(input: Inputs, output: Outputs, session: Session):
ui.row(
ui.column(6, ui.HTML(
f"<b>{user_name}</b><a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "),
ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"),
ui.column(6, ui.p(f"{row['created_at']}"),
style=style + "padding-top: 1.5em;"),
),
ui.row(
ui.column(12, ui.HTML("<hr>"),

src/mod_topics.py (new file)

@@ -0,0 +1,37 @@
from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.decomposition import NMF
from src.data_loader import tweet_store
classes = 10
# Fit the NMF model
nmf = NMF(
n_components=classes,
random_state=42,
init=None,
beta_loss="frobenius",
alpha_W=0.0,
alpha_H="same",
l1_ratio=0.0,
).fit(tweet_store.tfidf_matrix)
# TODO: don't do this live -> load the feature_names and values from a pre-calculated list for each day
tfidf_feature_names = tweet_store.tfidf_vectorizer.get_feature_names_out()
print(tfidf_feature_names)
@module.ui
def topics_ui():
return ui.div(
ui.h2("Tweet Topics"),
)
@module.server
def topics_server(input: Inputs, output: Outputs, session: Session):
@output
@render.ui
def searchable_tweet_ui():
pass
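
topics_server is still a stub in this WIP commit. One plausible next step, sketched under the assumption that the fitted nmf model and tfidf_feature_names above remain module-level (the helper and its use in the UI are hypothetical), is to derive the top terms per NMF component and render one block per topic:

import numpy as np

def top_words_per_topic(model, feature_names, top_n=10):
    # components_ has shape (n_topics, n_features); the largest weights in a
    # row are the most characteristic terms of that topic.
    topics = []
    for component in model.components_:
        top_idx = np.argsort(component)[::-1][:top_n]
        topics.append([feature_names[i] for i in top_idx])
    return topics

# e.g. inside topics_ui(), one ui.div per topic:
# [ui.div(ui.h5(f"Topic {i}"), ui.p(", ".join(words)))
#  for i, words in enumerate(top_words_per_topic(nmf, tfidf_feature_names))]

Pre-computing these word lists per day, as the TODO above suggests, would also avoid fitting the NMF model at import time.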