Compare commits

...

6 commits

6 changed files with 484971 additions and 7 deletions

3
.gitignore vendored
View file

@ -160,3 +160,6 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
data/tfidf_matrix.pckl
data/tfidf_vectorizer.pckl

12
app.py
View file

@ -2,7 +2,7 @@ from pathlib import Path
from typing import List from typing import List
from shiny import App, ui, Inputs, Outputs, Session from shiny import App, ui, Inputs, Outputs, Session
from shiny.types import NavSetArg from shiny.types import NavSetArg
from src import mod_welcome from src import mod_welcome, mod_searchable
from src.util import load_html_str_from_file from src.util import load_html_str_from_file
import os import os
@ -13,8 +13,9 @@ footer_html: str = load_html_str_from_file(os.path.join("www", "footer.html"))
def nav_controls() -> List[NavSetArg]: def nav_controls() -> List[NavSetArg]:
return [ return [
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("Intro"), value="intro"), ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
ui.nav(ui.h5("Datensatz Analyse"), "Datensatz Analyse"), ui.nav(ui.h5("Analyse"), "Analyse"),
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"),
ui.nav_control( ui.nav_control(
ui.a( ui.a(
ui.h5("AG-Link"), ui.h5("AG-Link"),
@ -42,7 +43,7 @@ app_ui = ui.page_navbar(
), ),
align="right", align="right",
), ),
selected="intro", selected="search_engine",
fluid=False, fluid=False,
title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"), title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"),
ui.h1("Copbird")), ui.h1("Copbird")),
@ -57,7 +58,8 @@ app_ui = ui.page_navbar(
def server(input: Inputs, output: Outputs, session: Session): def server(input: Inputs, output: Outputs, session: Session):
mod_welcome.welcome_server("Intro") mod_welcome.welcome_server("intro")
mod_searchable.searchable_server("search_engine")
static_dir = Path(__file__).parent / "www" static_dir = Path(__file__).parent / "www"

151691
data/tweet_relevance.json Normal file

File diff suppressed because it is too large Load diff

333095
data/tweets_all_combined.csv Normal file

File diff suppressed because it is too large Load diff

174
src/mod_searchable.py Normal file
View file

@ -0,0 +1,174 @@
from shiny import module, ui, render, Inputs, Outputs, Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle
import re
# Locations of the search artefacts and tweet data. All paths are relative, so
# they are resolved against the working directory the app is started from.
tfidf_matrix_path = "data/tfidf_matrix.pckl"  # pickled TF-IDF matrix
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"  # pickled TfidfVectorizer
# NOTE(review): despite the .json suffix this file is loaded with pd.read_csv below — confirm its format.
relevance_score_path = "data/tweet_relevance.json"
tweets_path = "data/tweets_all_combined.csv"  # tweet table read with pd.read_csv
# Inline 18x18 px SVG icons shown next to the reply / retweet / like / quote
# counters of every rendered tweet. They are embedded verbatim via ui.HTML.
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
like_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M16.697 5.5c-1.222-.06-2.679.51-3.89 2.16l-.805 1.09-.806-1.09C9.984 6.01 8.526 5.44 7.304 5.5c-1.243.07-2.349.78-2.91 1.91-.552 1.12-.633 2.78.479 4.82 1.074 1.97 3.257 4.27 7.129 6.61 3.87-2.34 6.052-4.64 7.126-6.61 1.111-2.04 1.03-3.7.477-4.82-.561-1.13-1.666-1.84-2.908-1.91zm4.187 7.69c-1.351 2.48-4.001 5.12-8.379 7.67l-.503.3-.504-.3c-4.379-2.55-7.029-5.19-8.382-7.67-1.36-2.5-1.41-4.86-.514-6.67.887-1.79 2.647-2.91 4.601-3.01 1.651-.09 3.368.56 4.798 2.01 1.429-1.45 3.146-2.1 4.796-2.01 1.954.1 3.714 1.22 4.601 3.01.896 1.81.846 4.17-.514 6.67z"></path></g></svg>'
quote_count_svg = '<svg width="18px" height="18px" viewBox="0 0 57 57"><g stroke-width="0"></g><g stroke-linecap="round" stroke-linejoin="round"></g><g> <g> <circle cx="18.5" cy="31.5" r="5.5"></circle> <path d="M18.5,38c-3.584,0-6.5-2.916-6.5-6.5s2.916-6.5,6.5-6.5s6.5,2.916,6.5,6.5S22.084,38,18.5,38z M18.5,27c-2.481,0-4.5,2.019-4.5,4.5s2.019,4.5,4.5,4.5s4.5-2.019,4.5-4.5S20.981,27,18.5,27z"></path> </g> <g> <circle cx="35.5" cy="31.5" r="5.5"></circle> <path d="M35.5,38c-3.584,0-6.5-2.916-6.5-6.5s2.916-6.5,6.5-6.5s6.5,2.916,6.5,6.5S39.084,38,35.5,38z M35.5,27c-2.481,0-4.5,2.019-4.5,4.5s2.019,4.5,4.5,4.5s4.5-2.019,4.5-4.5S37.981,27,35.5,27z"></path> </g> <path d="M13,32c-0.553,0-1-0.447-1-1c0-7.72,6.28-14,14-14c0.553,0,1,0.447,1,1s-0.447,1-1,1 c-6.617,0-12,5.383-12,12C14,31.553,13.553,32,13,32z"></path> <path d="M30,32c-0.553,0-1-0.447-1-1c0-7.72,6.28-14,14-14c0.553,0,1,0.447,1,1s-0.447,1-1,1 c-6.617,0-12,5.383-12,12C31,31.553,30.553,32,30,32z"></path> </g></svg>'
# Patterns used to linkify tweet text before it is rendered as HTML.
# link_regex captures "http://" or "https://" followed by every character up to
# the next whitespace; hashtag_regex captures the word characters after a "#".
link_regex = r"(https?://\S+)"
hashtag_regex = r"#(\w+)"
def replace_link(match):
    """Return an HTML anchor for the URL found by a regex match.

    Intended as the replacement callback for ``re.sub``: the full matched
    text is used both as the link target and as the visible link label.
    """
    target = match.group(0)
    return '<a href="{0}">{0}</a>'.format(target)
def replace_hastag(match):
    """Return an HTML anchor linking a matched hashtag to a Twitter search.

    Intended as the replacement callback for ``re.sub``: the full matched
    text (including the leading ``#``) stays visible, while the link target
    searches for the tag with the ``#`` written as ``%23``.
    """
    tag = str(match.group(0))
    # drop a single leading "#" so it can be percent-encoded in the query
    bare = tag[1:] if tag.startswith("#") else tag
    return (
        '<a href="https://twitter.com/search?q=%23' + bare
        + '" style="text-decoration:none">' + tag + "</a>"
    )
def _unpickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Everything below runs once, at import time of this module.
print("Loading data from storage")
tweets = pd.read_csv(tweets_path)
# NOTE(review): the relevance file has a .json suffix but is parsed as CSV — confirm.
relevance_score = pd.read_csv(relevance_score_path)
tfidf_matrix = _unpickle(tfidf_matrix_path)
tfidf_vectorizer: TfidfVectorizer = _unpickle(tfidf_vectorizer_path)

# Attach the relevance score to each tweet and drop the two columns that are
# removed before the table is used for searching.
# presumably both files list the tweets in the same row order — TODO confirm
tweets["relevance_score"] = relevance_score["relevance_score"]
tweets = tweets.drop(columns=["user_id", "measured_at"])
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> "tuple[pd.DataFrame | None, int]":
    """Search the tweet table for ``query`` and return the best matches.

    The query is vectorized with the module-level ``tfidf_vectorizer`` and
    compared against ``tfidf_matrix`` by cosine similarity. Only tweets with
    a non-zero similarity count as matches.

    Args:
        query: Free-text search term.
        limit: Maximum number of rows to return; ``-1`` returns every match.
        sorting_method: ``"score"`` ranks by relevance score times
            similarity, ``"date_new"`` / ``"date_old"`` sort by the
            ``created_at`` column descending / ascending.

    Returns:
        A ``(results, total)`` tuple. ``results`` holds at most ``limit`` rows
        of ``tweets`` and ``total`` is the number of all matching tweets.
        When nothing matches, ``(None, 0)`` is returned so that callers can
        always unpack two values. For an unrecognized ``sorting_method``
        ``results`` is ``None`` while ``total`` is still reported.
    """
    query_vec = tfidf_vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # positions of all tweets with non-zero similarity, most similar first
    matching = np.where(similarity != 0)[0]
    ranked = matching[np.argsort(-similarity[matching])]
    hits = tweets.iloc[ranked]

    total = len(hits)
    if total == 0:
        # Previously a bare None was returned here, which made the caller's
        # two-value unpacking fail with a TypeError on every empty search.
        return None, 0

    if limit == -1:
        # head(-1) would drop the last row, so translate "all" explicitly
        limit = total

    results = None
    if sorting_method == "score":
        # NOTE(review): the constant 0.6 scales every score equally and so
        # does not affect the ordering — confirm the intended weighting.
        overall = (0.6 * hits["relevance_score"]) * similarity[ranked]
        results = hits.loc[overall.sort_values(ascending=False).index].head(limit)
    elif sorting_method == "date_new":
        results = hits.sort_values(by="created_at", ascending=False).head(limit)
    elif sorting_method == "date_old":
        results = hits.sort_values(by="created_at", ascending=True).head(limit)
    return results, total
@module.ui
def searchable_ui():
    """Build the UI of the tweet search tab.

    The layout consists of a heading, a row with the search box and the two
    select inputs (sort order and number of results), and the output slot
    that the server fills with the rendered tweets.
    """
    sort_choices = {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}
    count_choices = {"5": "5", "20": "20", "50": "50", "all": "alle"}

    search_box = ui.column(
        6,
        ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%"),
    )
    option_selects = ui.column(
        3,
        ui.input_select("sorting_method", "Sortierung", sort_choices, selected="score", selectize=True, width="12em"),
        ui.input_select("tweet_count", "Ergebnisse", count_choices, selected="5", selectize=True, width="12em"),
        style="display: flex; flex-direction: column; align-items: center; justify-content: center;",
    )
    controls = ui.row(search_box, option_selects, style="justify-content:space-between;")

    return ui.div(
        ui.h2("Tweet Suchmaschine"),
        ui.HTML("<hr>"),
        controls,
        ui.output_ui(id="searchable_tweet_ui"),
    )
# Hover styling for the clickable tweet body; emitted once per result list.
_TWEET_BOX_STYLE = """
<style type="text/css">
.box {
    cursor: pointer;
    text-align: center;
    font-size: 20px;
    padding-left: 1.5em;
    padding-right: 1.5em;
}
.box:hover {
    opacity: 0.7;
    background: #eee;
}
</style>
"""

# Inline style shared by all centered columns of a tweet card.
_CENTERED = "text-align: center;"


def _prettify_tweet_text(raw_text: str) -> str:
    """Convert raw tweet text to HTML with clickable links and hashtags."""

    def _substitute(match):
        # group(1) is the URL capture of link_regex; it is None whenever the
        # hashtag alternative of the combined pattern matched instead.
        if match.group(1) is not None:
            return replace_link(match)
        return replace_hastag(match)

    # literal backslash-n sequences in the stored text become HTML line breaks
    html_text = raw_text.replace("\\n", "<br>")
    # Links and hashtags are replaced in ONE pass. Running the hashtag
    # substitution after the link substitution would rewrite "#fragment"
    # parts inside the freshly generated href attributes and break them.
    return re.sub(f"{link_regex}|{hashtag_regex}", _substitute, html_text)


def _render_tweet_card(row):
    """Build the bordered card (header, text, counters) for one tweet row."""
    tweet_text = _prettify_tweet_text(str(row["tweet_text"]))
    user_handle = row['handle']
    user_name = row['user_name']
    tweet_link = f"https://twitter.com/{user_handle}/status/{row['tweet_id']}"

    header = ui.row(
        ui.column(6, ui.HTML(
            f"<b>{user_name}</b><a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"),
            style=_CENTERED + "padding-top: 1.5em; "),
        ui.column(6, ui.p(f"{row['created_at']}"), style=_CENTERED + "padding-top: 1.5em;"),
    )
    body = ui.row(
        ui.column(
            12,
            ui.HTML("<hr>"),
            # clicking the text opens the tweet on Twitter
            ui.HTML(f"""<div class="box" onclick="location.href='{tweet_link}';">{tweet_text}</div>"""),
            ui.HTML("<hr>"),
        ),
    )
    counters = (
        (reply_html_svg, "reply_count", "Antworten"),
        (retweet_html_svg, "retweet_count", "Retweets"),
        (like_html_svg, "like_count", "Likes"),
        (quote_count_svg, "quote_count", "Quotes"),
    )
    footer = ui.row(*[
        ui.column(3, ui.HTML(icon), ui.p(f"{row[column]}"), style=_CENTERED, title=title)
        for icon, column, title in counters
    ])
    return ui.div(
        header,
        body,
        footer,
        style="border: 2px solid #119; margin-bottom: 1.5em; border-radius: 10px;",
    )


@module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
    """Server logic of the tweet search tab.

    Registers the ``searchable_tweet_ui`` output, which reads the search
    term, sort order and result count from the inputs, runs ``search_query``
    and renders one card per returned tweet.
    """

    @output
    @render.ui
    def searchable_tweet_ui():
        query = input.search_input()
        sorting_method = input.sorting_method()
        raw_count = str(input.tweet_count())
        # search_query treats -1 as "return every match"
        result_count = -1 if raw_count == "all" else int(raw_count)

        outcome = search_query(query, result_count, sorting_method=sorting_method)
        # Check for "nothing found" BEFORE unpacking: the result may be a bare
        # None or carry a None frame, and unpacking None raises a TypeError.
        if outcome is None or outcome[0] is None:
            return ui.div(
                ui.h5("Keine Ergebnisse gefunden!")
            )
        result_pd, found_tweets = outcome

        tweet_ui = ui.page_fluid(
            ui.HTML(_TWEET_BOX_STYLE),
            ui.HTML(f"Gesamt gefundene Tweets: <b>{found_tweets}</b>"),
            ui.HTML("<hr><br>"),
        )
        # row-wise iteration is slow in pandas but each row needs its own card
        for _, row in result_pd.iterrows():
            tweet_ui.append(_render_tweet_card(row))
        return tweet_ui

View file

@ -18,7 +18,6 @@ def welcome_ui():
[0]: https://ag-link.xyz [0]: https://ag-link.xyz
"""), """),
# ui.output_text("dataset_infos"),
ui.output_ui("dataset_infos"), ui.output_ui("dataset_infos"),
ui.h3("Ursprung der Idee"), ui.h3("Ursprung der Idee"),
ui.markdown(""" ui.markdown("""
@ -52,7 +51,7 @@ with open("data/general_analysis_results.json", "r") as f:
@ module.server @ module.server
def welcome_server(input, output, session, starting_value=0): def welcome_server(input, output, session):
@output @output
@render.ui @render.ui
def dataset_infos(): def dataset_infos():