Compare commits
1 commit
Author | SHA1 | Date | |
---|---|---|---|
7a8e01f9d1 |
4 changed files with 87 additions and 26 deletions
11
app.py
11
app.py
|
@@ -2,9 +2,13 @@ from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
from shiny import App, ui, Inputs, Outputs, Session
|
from shiny import App, ui, Inputs, Outputs, Session
|
||||||
from shiny.types import NavSetArg
|
from shiny.types import NavSetArg
|
||||||
from src import mod_welcome, mod_searchable
|
from src import mod_welcome, mod_searchable, mod_topics
|
||||||
from src.util import load_html_str_from_file
|
from src.util import load_html_str_from_file
|
||||||
|
|
||||||
|
|
||||||
|
# by importing this module, the tweets are loaded into the tweet_store variable at program start
|
||||||
|
import src.data_loader
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
@@ -15,7 +19,9 @@ def nav_controls() -> List[NavSetArg]:
|
||||||
return [
|
return [
|
||||||
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
|
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
|
||||||
ui.nav(ui.h5("Analyse"), "Analyse"),
|
ui.nav(ui.h5("Analyse"), "Analyse"),
|
||||||
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"),
|
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui(
|
||||||
|
"search_engine"), value="search_engine"),
|
||||||
|
ui.nav(ui.h5("Topics"), mod_topics.topics_ui("topics"), value="topics"),
|
||||||
ui.nav_control(
|
ui.nav_control(
|
||||||
ui.a(
|
ui.a(
|
||||||
ui.h5("AG-Link"),
|
ui.h5("AG-Link"),
|
||||||
|
@@ -60,6 +66,7 @@ app_ui = ui.page_navbar(
|
||||||
def server(input: Inputs, output: Outputs, session: Session):
|
def server(input: Inputs, output: Outputs, session: Session):
|
||||||
mod_welcome.welcome_server("intro")
|
mod_welcome.welcome_server("intro")
|
||||||
mod_searchable.searchable_server("search_engine")
|
mod_searchable.searchable_server("search_engine")
|
||||||
|
mod_topics.topics_server("topics")
|
||||||
|
|
||||||
|
|
||||||
static_dir = Path(__file__).parent / "www"
|
static_dir = Path(__file__).parent / "www"
|
||||||
|
|
26
src/data_loader.py
Normal file
26
src/data_loader.py
Normal file
|
@@ -0,0 +1,26 @@
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
|
class TweetStore():
|
||||||
|
|
||||||
|
tweets_path: str = "data/tweets_all_combined.csv"
|
||||||
|
tfidf_matrix_path = "data/tfidf_matrix.pckl"
|
||||||
|
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
print("Loading tweets from dataframe")
|
||||||
|
self.tweets = pd.read_csv(self.tweets_path)
|
||||||
|
|
||||||
|
print("Loading tfidf from file")
|
||||||
|
self.tfidf_matrix = None
|
||||||
|
with open(self.tfidf_matrix_path, "rb") as f:
|
||||||
|
self.tfidf_matrix = pickle.load(f)
|
||||||
|
|
||||||
|
self.tfidf_vectorizer: TfidfVectorizer = None
|
||||||
|
with open(self.tfidf_vectorizer_path, "rb") as f:
|
||||||
|
self.tfidf_vectorizer = pickle.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
tweet_store = TweetStore()
|
|
@@ -1,15 +1,12 @@
|
||||||
from shiny import module, ui, render, Inputs, Outputs, Session
|
from shiny import module, ui, render, Inputs, Outputs, Session
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pickle
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
tfidf_matrix_path = "data/tfidf_matrix.pckl"
|
from src.data_loader import tweet_store
|
||||||
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
|
|
||||||
relevance_score_path = "data/tweet_relevance.json"
|
relevance_score_path = "data/tweet_relevance.json"
|
||||||
tweets_path = "data/tweets_all_combined.csv"
|
|
||||||
|
|
||||||
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
|
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
|
||||||
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
|
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
|
||||||
|
@@ -31,31 +28,21 @@ def replace_hastag(match):
|
||||||
return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>'
|
return f'<a href="https://twitter.com/search?q=%23{name}" style="text-decoration:none">{hashtag}</a>'
|
||||||
|
|
||||||
|
|
||||||
print("Loading data from storage")
|
|
||||||
tweets = pd.read_csv(tweets_path)
|
|
||||||
relevance_score = pd.read_csv(relevance_score_path)
|
relevance_score = pd.read_csv(relevance_score_path)
|
||||||
|
|
||||||
tfidf_matrix = None
|
|
||||||
with open(tfidf_matrix_path, "rb") as f:
|
|
||||||
tfidf_matrix = pickle.load(f)
|
|
||||||
|
|
||||||
tfidf_vectorizer: TfidfVectorizer = None
|
tweet_store.tweets["relevance_score"] = relevance_score["relevance_score"]
|
||||||
with open(tfidf_vectorizer_path, "rb") as f:
|
tweet_store.tweets = tweet_store.tweets.drop(["user_id", "measured_at"], axis=1)
|
||||||
tfidf_vectorizer = pickle.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
tweets["relevance_score"] = relevance_score["relevance_score"]
|
|
||||||
tweets = tweets.drop(["user_id", "measured_at"], axis=1)
|
|
||||||
|
|
||||||
|
|
||||||
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
|
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
|
||||||
query_vec = tfidf_vectorizer.transform([query])
|
query_vec = tweet_store.tfidf_vectorizer.transform([query])
|
||||||
similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
|
similarity = cosine_similarity(query_vec, tweet_store.tfidf_matrix).flatten()
|
||||||
|
|
||||||
filtered = np.where(similarity != 0)[0]
|
filtered = np.where(similarity != 0)[0]
|
||||||
indices = np.argsort(-similarity[filtered])
|
indices = np.argsort(-similarity[filtered])
|
||||||
correct_indices = filtered[indices]
|
correct_indices = filtered[indices]
|
||||||
result = tweets.iloc[correct_indices]
|
result = tweet_store.tweets.iloc[correct_indices]
|
||||||
|
|
||||||
if not len(result):
|
if not len(result):
|
||||||
return None, 0
|
return None, 0
|
||||||
|
@@ -81,10 +68,13 @@ def searchable_ui():
|
||||||
ui.h2("Tweet Suchmaschine"),
|
ui.h2("Tweet Suchmaschine"),
|
||||||
ui.HTML("<hr>"),
|
ui.HTML("<hr>"),
|
||||||
ui.row(
|
ui.row(
|
||||||
ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
|
ui.column(6, ui.input_text("search_input", "Suche",
|
||||||
|
placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
|
||||||
ui.column(3,
|
ui.column(3,
|
||||||
ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
|
ui.input_select("sorting_method", "Sortierung", {
|
||||||
ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
|
"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
|
||||||
|
ui.input_select("tweet_count", "Ergebnisse", {
|
||||||
|
"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
|
||||||
style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
|
style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
|
||||||
style="justify-content:space-between;"
|
style="justify-content:space-between;"
|
||||||
|
|
||||||
|
@@ -138,7 +128,8 @@ def searchable_server(input: Inputs, output: Outputs, session: Session):
|
||||||
ui.row(
|
ui.row(
|
||||||
ui.column(6, ui.HTML(
|
ui.column(6, ui.HTML(
|
||||||
f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "),
|
f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"), style=style + "padding-top: 1.5em; "),
|
||||||
ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"),
|
ui.column(6, ui.p(f"{row['created_at']}"),
|
||||||
|
style=style + "padding-top: 1.5em;"),
|
||||||
),
|
),
|
||||||
ui.row(
|
ui.row(
|
||||||
ui.column(12, ui.HTML("<hr>"),
|
ui.column(12, ui.HTML("<hr>"),
|
||||||
|
|
37
src/mod_topics.py
Normal file
37
src/mod_topics.py
Normal file
|
@@ -0,0 +1,37 @@
|
||||||
|
from shiny import module, ui, render, Inputs, Outputs, Session
|
||||||
|
|
||||||
|
from sklearn.decomposition import NMF
|
||||||
|
from src.data_loader import tweet_store
|
||||||
|
|
||||||
|
classes = 10
|
||||||
|
|
||||||
|
# Fit the NMF model
|
||||||
|
nmf = NMF(
|
||||||
|
n_components=classes,
|
||||||
|
random_state=42,
|
||||||
|
init=None,
|
||||||
|
beta_loss="frobenius",
|
||||||
|
alpha_W=0.0,
|
||||||
|
alpha_H="same",
|
||||||
|
l1_ratio=0.0,
|
||||||
|
).fit(tweet_store.tfidf_matrix)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: don't do this live -> load the feature_names and values from a pre-calculated list for each day
|
||||||
|
tfidf_feature_names = tweet_store.tfidf_vectorizer.get_feature_names_out()
|
||||||
|
print(tfidf_feature_names)
|
||||||
|
|
||||||
|
|
||||||
|
@ module.ui
|
||||||
|
def topics_ui():
|
||||||
|
return ui.div(
|
||||||
|
ui.h2("Tweet Topics"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ module.server
|
||||||
|
def topics_server(input: Inputs, output: Outputs, session: Session):
|
||||||
|
@ output
|
||||||
|
@ render.ui
|
||||||
|
def searchable_tweet_ui():
|
||||||
|
pass
|
Loading…
Reference in a new issue