Compare commits

...

1 commit
main ... topics

Author SHA1 Message Date
7a8e01f9d1 WIP: adds site for displaying topics 2023-07-26 20:56:27 +02:00
4 changed files with 87 additions and 26 deletions

11
app.py
View file

@ -2,9 +2,13 @@ from pathlib import Path
from typing import List from typing import List
from shiny import App, ui, Inputs, Outputs, Session from shiny import App, ui, Inputs, Outputs, Session
from shiny.types import NavSetArg from shiny.types import NavSetArg
from src import mod_welcome, mod_searchable from src import mod_welcome, mod_searchable, mod_topics
from src.util import load_html_str_from_file from src.util import load_html_str_from_file
# by importing this module, the tweets are loaded into the tweet_store variable at program start
import src.data_loader
import os import os
@ -15,7 +19,9 @@ def nav_controls() -> List[NavSetArg]:
return [ return [
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"), ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
ui.nav(ui.h5("Analyse"), "Analyse"), ui.nav(ui.h5("Analyse"), "Analyse"),
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"), ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui(
"search_engine"), value="search_engine"),
ui.nav(ui.h5("Topics"), mod_topics.topics_ui("topics"), value="topics"),
ui.nav_control( ui.nav_control(
ui.a( ui.a(
ui.h5("AG-Link"), ui.h5("AG-Link"),
@ -60,6 +66,7 @@ app_ui = ui.page_navbar(
def server(input: Inputs, output: Outputs, session: Session): def server(input: Inputs, output: Outputs, session: Session):
mod_welcome.welcome_server("intro") mod_welcome.welcome_server("intro")
mod_searchable.searchable_server("search_engine") mod_searchable.searchable_server("search_engine")
mod_topics.topics_server("topics")
static_dir = Path(__file__).parent / "www" static_dir = Path(__file__).parent / "www"

26
src/data_loader.py Normal file
View file

@ -0,0 +1,26 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
class TweetStore():
    """Hold the tweet table and the pre-computed TF-IDF artefacts in memory.

    Constructing an instance reads three files: the tweets CSV, a pickled
    TF-IDF matrix and a pickled TF-IDF vectorizer. The class attributes
    below are the default locations; each one can be overridden for a single
    instance through the constructor argument of the same name.

    Attributes set by ``__init__``:
        tweets: DataFrame returned by ``pd.read_csv`` for ``tweets_path``.
        tfidf_matrix: object unpickled from ``tfidf_matrix_path``.
        tfidf_vectorizer: object unpickled from ``tfidf_vectorizer_path``.
    """

    # Default file locations. They are relative paths, so they are resolved
    # against the current working directory of the process.
    tweets_path: str = "data/tweets_all_combined.csv"
    tfidf_matrix_path = "data/tfidf_matrix.pckl"
    tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"

    def __init__(self, tweets_path=None, tfidf_matrix_path=None,
                 tfidf_vectorizer_path=None):
        """Load the tweets and the TF-IDF artefacts from disk.

        Args:
            tweets_path: optional CSV path; ``None`` keeps the class default.
            tfidf_matrix_path: optional pickle path; ``None`` keeps the
                class default.
            tfidf_vectorizer_path: optional pickle path; ``None`` keeps the
                class default.

        Raises:
            Whatever ``pd.read_csv``, ``open`` or ``pickle.load`` raise for a
            missing or unreadable file (e.g. ``FileNotFoundError``).
        """
        # Only shadow the class-level defaults when an override was given,
        # so a plain TweetStore() keeps reading the default files.
        if tweets_path is not None:
            self.tweets_path = tweets_path
        if tfidf_matrix_path is not None:
            self.tfidf_matrix_path = tfidf_matrix_path
        if tfidf_vectorizer_path is not None:
            self.tfidf_vectorizer_path = tfidf_vectorizer_path

        print("Loading tweets from dataframe")
        self.tweets = pd.read_csv(self.tweets_path)

        print("Loading tfidf from file")
        self.tfidf_matrix = self._load_pickle(self.tfidf_matrix_path)
        self.tfidf_vectorizer: TfidfVectorizer = self._load_pickle(
            self.tfidf_vectorizer_path)

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at *path*."""
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling. Only point this at files produced by the project itself.
        with open(path, "rb") as f:
            return pickle.load(f)
# Module-level instance: constructing it here means the tweet CSV and the
# TF-IDF pickles are read as a side effect of the first import of this module.
tweet_store = TweetStore()

View file

@ -1,15 +1,12 @@
from shiny import module, ui, render, Inputs, Outputs, Session from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import pickle
import re import re
tfidf_matrix_path = "data/tfidf_matrix.pckl" from src.data_loader import tweet_store
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
relevance_score_path = "data/tweet_relevance.json" relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>' reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>' retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
@ -31,31 +28,21 @@ def replace_hastag(match):
return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>' return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>'
print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
relevance_score = pd.read_csv(relevance_score_path) relevance_score = pd.read_csv(relevance_score_path)
tfidf_matrix = None
with open(tfidf_matrix_path, "rb") as f:
tfidf_matrix = pickle.load(f)
tfidf_vectorizer: TfidfVectorizer = None tweet_store.tweets["relevance_score"] = relevance_score["relevance_score"]
with open(tfidf_vectorizer_path, "rb") as f: tweet_store.tweets = tweet_store.tweets.drop(["user_id", "measured_at"], axis=1)
tfidf_vectorizer = pickle.load(f)
tweets["relevance_score"] = relevance_score["relevance_score"]
tweets = tweets.drop(["user_id", "measured_at"], axis=1)
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int): def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
query_vec = tfidf_vectorizer.transform([query]) query_vec = tweet_store.tfidf_vectorizer.transform([query])
similarity = cosine_similarity(query_vec, tfidf_matrix).flatten() similarity = cosine_similarity(query_vec, tweet_store.tfidf_matrix).flatten()
filtered = np.where(similarity != 0)[0] filtered = np.where(similarity != 0)[0]
indices = np.argsort(-similarity[filtered]) indices = np.argsort(-similarity[filtered])
correct_indices = filtered[indices] correct_indices = filtered[indices]
result = tweets.iloc[correct_indices] result = tweet_store.tweets.iloc[correct_indices]
if not len(result): if not len(result):
return None, 0 return None, 0
@ -81,10 +68,13 @@ def searchable_ui():
ui.h2("Tweet Suchmaschine"), ui.h2("Tweet Suchmaschine"),
ui.HTML("<hr>"), ui.HTML("<hr>"),
ui.row( ui.row(
ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), ui.column(6, ui.input_text("search_input", "Suche",
placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
ui.column(3, ui.column(3,
ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), ui.input_select("sorting_method", "Sortierung", {
ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), "score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
ui.input_select("tweet_count", "Ergebnisse", {
"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
style="display: flex; flex-direction: column; align-items: center; justify-content: center;"), style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
style="justify-content:space-between;" style="justify-content:space-between;"
@ -138,7 +128,8 @@ def searchable_server(input: Inputs, output: Outputs, session: Session):
ui.row( ui.row(
ui.column(6, ui.HTML( ui.column(6, ui.HTML(
f"<b>{user_name}</b><a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "), f"<b>{user_name}</b><a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "),
ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"), ui.column(6, ui.p(f"{row['created_at']}"),
style=style + "padding-top: 1.5em;"),
), ),
ui.row( ui.row(
ui.column(12, ui.HTML("<hr>"), ui.column(12, ui.HTML("<hr>"),

37
src/mod_topics.py Normal file
View file

@ -0,0 +1,37 @@
from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.decomposition import NMF
from src.data_loader import tweet_store
# Number of components requested from the NMF model below.
classes = 10

# Configure the NMF model first, then fit it on the stored TF-IDF matrix.
# NOTE: this runs at import time of this module.
nmf = NMF(
    n_components=classes,
    random_state=42,
    init=None,
    beta_loss="frobenius",
    alpha_W=0.0,
    alpha_H="same",
    l1_ratio=0.0,
)
nmf.fit(tweet_store.tfidf_matrix)

# TODO: don't do this live -> load the feature_names and values from a
# pre-calculated list for each day
tfidf_feature_names = tweet_store.tfidf_vectorizer.get_feature_names_out()
print(tfidf_feature_names)
@module.ui
def topics_ui():
    """Build the UI of the topics module: a div holding the page heading."""
    heading = ui.h2("Tweet Topics")
    return ui.div(heading)
@module.server
def topics_server(input: Inputs, output: Outputs, session: Session):
    """Server part of the topics module; registers one placeholder UI output."""

    # NOTE(review): the output name looks carried over from the search
    # module, and topics_ui declares no output with this name - confirm
    # the intended id before filling in the body.
    @output
    @render.ui
    def searchable_tweet_ui():
        # Placeholder: nothing is rendered yet.
        return None