Compare commits
6 commits
19d21c91ee
...
8787366ed4
Author | SHA1 | Date | |
---|---|---|---|
8787366ed4 | |||
5acc0642b6 | |||
933e1817ca | |||
d05100d1f5 | |||
e04a681d5a | |||
44b9ebbaab |
6 changed files with 484971 additions and 7 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -160,3 +160,6 @@ cython_debug/
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
|
data/tfidf_matrix.pckl
|
||||||
|
data/tfidf_vectorizer.pckl
|
||||||
|
|
12
app.py
12
app.py
|
@ -2,7 +2,7 @@ from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
from shiny import App, ui, Inputs, Outputs, Session
|
from shiny import App, ui, Inputs, Outputs, Session
|
||||||
from shiny.types import NavSetArg
|
from shiny.types import NavSetArg
|
||||||
from src import mod_welcome
|
from src import mod_welcome, mod_searchable
|
||||||
from src.util import load_html_str_from_file
|
from src.util import load_html_str_from_file
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
@ -13,8 +13,9 @@ footer_html: str = load_html_str_from_file(os.path.join("www", "footer.html"))
|
||||||
|
|
||||||
def nav_controls() -> List[NavSetArg]:
|
def nav_controls() -> List[NavSetArg]:
|
||||||
return [
|
return [
|
||||||
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("Intro"), value="intro"),
|
ui.nav(ui.h5("Intro"), mod_welcome.welcome_ui("intro"), value="intro"),
|
||||||
ui.nav(ui.h5("Datensatz Analyse"), "Datensatz Analyse"),
|
ui.nav(ui.h5("Analyse"), "Analyse"),
|
||||||
|
ui.nav(ui.h5("Suchmaschine"), mod_searchable.searchable_ui("search_engine"), value="search_engine"),
|
||||||
ui.nav_control(
|
ui.nav_control(
|
||||||
ui.a(
|
ui.a(
|
||||||
ui.h5("AG-Link"),
|
ui.h5("AG-Link"),
|
||||||
|
@ -42,7 +43,7 @@ app_ui = ui.page_navbar(
|
||||||
),
|
),
|
||||||
align="right",
|
align="right",
|
||||||
),
|
),
|
||||||
selected="intro",
|
selected="search_engine",
|
||||||
fluid=False,
|
fluid=False,
|
||||||
title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"),
|
title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"),
|
||||||
ui.h1("Copbird")),
|
ui.h1("Copbird")),
|
||||||
|
@ -57,7 +58,8 @@ app_ui = ui.page_navbar(
|
||||||
|
|
||||||
|
|
||||||
def server(input: Inputs, output: Outputs, session: Session):
|
def server(input: Inputs, output: Outputs, session: Session):
|
||||||
mod_welcome.welcome_server("Intro")
|
mod_welcome.welcome_server("intro")
|
||||||
|
mod_searchable.searchable_server("search_engine")
|
||||||
|
|
||||||
|
|
||||||
static_dir = Path(__file__).parent / "www"
|
static_dir = Path(__file__).parent / "www"
|
||||||
|
|
151691
data/tweet_relevance.json
Normal file
151691
data/tweet_relevance.json
Normal file
File diff suppressed because it is too large
Load diff
333095
data/tweets_all_combined.csv
Normal file
333095
data/tweets_all_combined.csv
Normal file
File diff suppressed because it is too large
Load diff
174
src/mod_searchable.py
Normal file
174
src/mod_searchable.py
Normal file
|
@ -0,0 +1,174 @@
|
||||||
|
from shiny import module, ui, render, Inputs, Outputs, Session
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Pickle file holding the TF-IDF matrix that search_query compares queries against.
tfidf_matrix_path = "data/tfidf_matrix.pckl"
|
||||||
|
# Pickle file holding the TfidfVectorizer used to transform search queries.
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
|
||||||
|
# File providing the "relevance_score" column that is attached to the tweets.
# NOTE(review): the extension is .json but the file is read with pd.read_csv
# further down — confirm the actual on-disk format.
relevance_score_path = "data/tweet_relevance.json"
|
||||||
|
# CSV file that is read into the module-level `tweets` DataFrame at import time.
tweets_path = "data/tweets_all_combined.csv"
|
||||||
|
|
||||||
|
# Inline SVG icon rendered next to a tweet's reply count.
reply_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M1.751 10c0-4.42 3.584-8 8.005-8h4.366c4.49 0 8.129 3.64 8.129 8.13 0 2.96-1.607 5.68-4.196 7.11l-8.054 4.46v-3.69h-.067c-4.49.1-8.183-3.51-8.183-8.01zm8.005-6c-3.317 0-6.005 2.69-6.005 6 0 3.37 2.77 6.08 6.138 6.01l.351-.01h1.761v2.3l5.087-2.81c1.951-1.08 3.163-3.13 3.163-5.36 0-3.39-2.744-6.13-6.129-6.13H9.756z"></path></g></svg>'
|
||||||
|
# Inline SVG icon rendered next to a tweet's retweet count.
retweet_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M4.5 3.88l4.432 4.14-1.364 1.46L5.5 7.55V16c0 1.1.896 2 2 2H13v2H7.5c-2.209 0-4-1.79-4-4V7.55L1.432 9.48.068 8.02 4.5 3.88zM16.5 6H11V4h5.5c2.209 0 4 1.79 4 4v8.45l2.068-1.93 1.364 1.46-4.432 4.14-4.432-4.14 1.364-1.46 2.068 1.93V8c0-1.1-.896-2-2-2z"></path></g></svg>'
|
||||||
|
# Inline SVG icon rendered next to a tweet's like count.
like_html_svg = '<svg width="18px" height="18px" viewBox="0 0 24 24" aria-hidden="true"><g><path d="M16.697 5.5c-1.222-.06-2.679.51-3.89 2.16l-.805 1.09-.806-1.09C9.984 6.01 8.526 5.44 7.304 5.5c-1.243.07-2.349.78-2.91 1.91-.552 1.12-.633 2.78.479 4.82 1.074 1.97 3.257 4.27 7.129 6.61 3.87-2.34 6.052-4.64 7.126-6.61 1.111-2.04 1.03-3.7.477-4.82-.561-1.13-1.666-1.84-2.908-1.91zm4.187 7.69c-1.351 2.48-4.001 5.12-8.379 7.67l-.503.3-.504-.3c-4.379-2.55-7.029-5.19-8.382-7.67-1.36-2.5-1.41-4.86-.514-6.67.887-1.79 2.647-2.91 4.601-3.01 1.651-.09 3.368.56 4.798 2.01 1.429-1.45 3.146-2.1 4.796-2.01 1.954.1 3.714 1.22 4.601 3.01.896 1.81.846 4.17-.514 6.67z"></path></g></svg>'
|
||||||
|
# Inline SVG icon rendered next to a tweet's quote count.
quote_count_svg = '<svg width="18px" height="18px" viewBox="0 0 57 57"><g stroke-width="0"></g><g stroke-linecap="round" stroke-linejoin="round"></g><g> <g> <circle cx="18.5" cy="31.5" r="5.5"></circle> <path d="M18.5,38c-3.584,0-6.5-2.916-6.5-6.5s2.916-6.5,6.5-6.5s6.5,2.916,6.5,6.5S22.084,38,18.5,38z M18.5,27c-2.481,0-4.5,2.019-4.5,4.5s2.019,4.5,4.5,4.5s4.5-2.019,4.5-4.5S20.981,27,18.5,27z"></path> </g> <g> <circle cx="35.5" cy="31.5" r="5.5"></circle> <path d="M35.5,38c-3.584,0-6.5-2.916-6.5-6.5s2.916-6.5,6.5-6.5s6.5,2.916,6.5,6.5S39.084,38,35.5,38z M35.5,27c-2.481,0-4.5,2.019-4.5,4.5s2.019,4.5,4.5,4.5s4.5-2.019,4.5-4.5S37.981,27,35.5,27z"></path> </g> <path d="M13,32c-0.553,0-1-0.447-1-1c0-7.72,6.28-14,14-14c0.553,0,1,0.447,1,1s-0.447,1-1,1 c-6.617,0-12,5.383-12,12C14,31.553,13.553,32,13,32z"></path> <path d="M30,32c-0.553,0-1-0.447-1-1c0-7.72,6.28-14,14-14c0.553,0,1,0.447,1,1s-0.447,1-1,1 c-6.617,0-12,5.383-12,12C31,31.553,30.553,32,30,32z"></path> </g></svg>'
|
||||||
|
|
||||||
|
# Matches "http://" or "https://" followed by a run of non-whitespace
# characters; used together with replace_link to linkify tweet text.
link_regex = r"(https?://\S+)"
|
||||||
|
# Matches "#" followed by one or more word characters (captured as group 1);
# used together with replace_hastag to linkify hashtags in tweet text.
hashtag_regex = r"#(\w+)"
|
||||||
|
|
||||||
|
|
||||||
|
def replace_link(match):
    """Turn a regex match of a URL into an HTML anchor for that URL.

    Meant to be passed as the replacement callback to ``re.sub``. The
    complete matched text is used both as the ``href`` target and as the
    visible link label.

    Args:
        match: Regex match object whose full match is the URL.

    Returns:
        The ``<a>`` element as a string.
    """
    target = match.group(0)
    return '<a href="{0}">{0}</a>'.format(target)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_hastag(match):
    """Turn a regex match of a hashtag into a link to the Twitter search.

    Meant to be passed as the replacement callback to ``re.sub``. The
    visible label is the full match (including the leading ``#``); the
    search query is the same text with one leading ``#`` removed, prefixed
    with the URL-encoded hash ``%23``.

    Args:
        match: Regex match object whose full match is the hashtag.

    Returns:
        The ``<a>`` element as a string.
    """
    label = match.group(0)
    term = str(label).removeprefix("#")
    template = '<a href="https://twitter.com/search?q=%23{}" style="text-decoration:none">{}</a>'
    return template.format(term, label)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Import-time data loading: everything below runs once when the module is
# imported and populates the module-level objects used by search_query.
# ---------------------------------------------------------------------------
print("Loading data from storage")

tweets = pd.read_csv(tweets_path)
# NOTE(review): relevance_score_path ends in .json but is parsed as CSV —
# confirm the file really is comma-separated text.
relevance_score = pd.read_csv(relevance_score_path)

# NOTE(review): pickle.load can execute arbitrary code while loading, so both
# pickle files must come from a trusted source.
with open(tfidf_matrix_path, "rb") as f:
    tfidf_matrix = pickle.load(f)

with open(tfidf_vectorizer_path, "rb") as f:
    tfidf_vectorizer: TfidfVectorizer = pickle.load(f)

# Column assignment aligns on the row index; presumably both files list the
# tweets in the same order — verify against the data preparation step.
tweets["relevance_score"] = relevance_score["relevance_score"]
# These two columns are not needed for searching or display.
tweets = tweets.drop(columns=["user_id", "measured_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> "tuple[pd.DataFrame | None, int]":
    """Find the tweets matching a free-text query and order them.

    The query is transformed with the module-level ``tfidf_vectorizer`` and
    compared against ``tfidf_matrix`` by cosine similarity. Every tweet with
    a non-zero similarity counts as a hit.

    Args:
        query: Search text entered by the user.
        limit: Maximum number of rows to return; ``-1`` returns every hit.
        sorting_method: ``"score"`` orders by relevance score multiplied by
            similarity (best first), ``"date_new"`` by ``created_at``
            descending and ``"date_old"`` by ``created_at`` ascending.

    Returns:
        ``(rows, total)``: ``rows`` holds at most ``limit`` hits in the
        requested order, ``total`` is the number of hits before limiting.
        When nothing matches, ``(None, 0)`` is returned so that callers can
        always unpack the result into two values.

    Raises:
        ValueError: If ``sorting_method`` is not one of the supported values.
    """
    if sorting_method not in ("score", "date_new", "date_old"):
        raise ValueError(f"unknown sorting_method: {sorting_method!r}")

    query_vec = tfidf_vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Positions of all tweets that share at least some weight with the query.
    matched = np.flatnonzero(similarity)
    if matched.size == 0:
        return None, 0

    # Most similar first. Assumes row i of tfidf_matrix describes
    # tweets.iloc[i] — TODO confirm against the code that built the matrix.
    ranked = matched[np.argsort(-similarity[matched])]
    hits = tweets.iloc[ranked]
    total = len(hits)

    if limit == -1:
        limit = total

    if sorting_method == "score":
        # The constant 0.6 scales every score equally and therefore does not
        # influence the resulting order; it is kept from the original formula.
        overall = (0.6 * hits["relevance_score"]) * similarity[ranked]
        ordered = hits.loc[overall.sort_values(ascending=False).index]
    else:
        # NOTE(review): created_at is sorted as stored in the CSV; this is
        # chronological only if the column uses a sortable format — confirm.
        ordered = hits.sort_values(by="created_at", ascending=(sorting_method == "date_old"))

    return ordered.head(limit), total
|
||||||
|
|
||||||
|
|
||||||
|
@module.ui
def searchable_ui():
    """Build the static layout of the tweet search page.

    The page consists of a heading, a row with the search field and two
    drop-downs (sort order and number of results), and the placeholder
    ``searchable_tweet_ui`` that the server part fills with the results.
    """
    sort_choices = {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}
    count_choices = {"5": "5", "20": "20", "50": "50", "all": "alle"}

    search_box = ui.input_text(
        "search_input",
        "Suche",
        placeholder="Gib Suchterm ein",
        value="Leipzig",
        width="100%",
    )
    sort_select = ui.input_select(
        "sorting_method",
        "Sortierung",
        sort_choices,
        selected="score",
        selectize=True,
        width="12em",
    )
    count_select = ui.input_select(
        "tweet_count",
        "Ergebnisse",
        count_choices,
        selected="5",
        selectize=True,
        width="12em",
    )

    controls = ui.row(
        ui.column(6, search_box),
        ui.column(
            3,
            sort_select,
            count_select,
            style="display: flex; flex-direction: column; align-items: center; justify-content: center;",
        ),
        style="justify-content:space-between;",
    )

    return ui.div(
        ui.h2("Tweet Suchmaschine"),
        ui.HTML("<hr>"),
        controls,
        ui.output_ui(id="searchable_tweet_ui"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Styling of the clickable tweet body. Emitted once per result page instead of
# once per tweet.
_TWEET_BOX_CSS = """
<style type="text/css">
.box {
    cursor: pointer;
    text-align: center;
    font-size: 20px;
    padding-left: 1.5em;
    padding-right: 1.5em;
}
.box:hover {
    opacity: 0.7;
    background: #eee;
}
</style>
"""

_CENTERED = "text-align: center;"

# One pattern for links and hashtags: handling both in a single pass keeps the
# hashtag rule from rewriting a "#fragment" inside an already generated link.
_LINK_OR_HASHTAG = re.compile(f"{link_regex}|{hashtag_regex}")


def _linkify(match):
    """Format one combined-pattern match as a hashtag link or a plain link."""
    if match.group(0).startswith("#"):
        return replace_hastag(match)
    return replace_link(match)


def _prettify_tweet_text(raw_text):
    """Return the tweet text as HTML with line breaks, links and hashtags."""
    # Line breaks are stored as the two characters backslash + "n". Splitting
    # on them before linkifying keeps a URL at the end of a line from
    # swallowing the break and the first word of the next line.
    lines = str(raw_text).split("\\n")
    return "<br>".join(_LINK_OR_HASHTAG.sub(_linkify, line) for line in lines)


def _tweet_card(row):
    """Build the bordered card (author, clickable text, counters) for one tweet."""
    user_handle = row["handle"]
    user_name = row["user_name"]
    tweet_link = f"https://twitter.com/{user_handle}/status/{row['tweet_id']}"
    # NOTE(review): name, handle and text are inserted into the HTML without
    # escaping — confirm that the stored data is already HTML-safe.
    tweet_text = _prettify_tweet_text(row["tweet_text"])
    header_style = _CENTERED + "padding-top: 1.5em;"

    counters = [
        ui.column(3, ui.HTML(icon), ui.p(f"{row[column]}"), style=_CENTERED, title=title)
        for icon, column, title in (
            (reply_html_svg, "reply_count", "Antworten"),
            (retweet_html_svg, "retweet_count", "Retweets"),
            (like_html_svg, "like_count", "Likes"),
            (quote_count_svg, "quote_count", "Quotes"),
        )
    ]

    return ui.div(
        ui.row(
            ui.column(
                6,
                ui.HTML(
                    f"<b>{user_name}</b> <a href='https://twitter.com/{user_handle}' style='text-decoration: none;'>@{user_handle}</a>"
                ),
                style=header_style,
            ),
            ui.column(6, ui.p(f"{row['created_at']}"), style=header_style),
        ),
        ui.row(
            ui.column(
                12,
                ui.HTML("<hr>"),
                ui.HTML(f"""<div class="box" onclick="location.href='{tweet_link}';">{tweet_text}</div>"""),
                ui.HTML("<hr>"),
            ),
        ),
        ui.row(*counters),
        style="border: 2px solid #119; margin-bottom: 1.5em; border-radius: 10px;",
    )


@module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
    """Register the output that renders the search results.

    The output ``searchable_tweet_ui`` reads the inputs ``search_input``,
    ``sorting_method`` and ``tweet_count``, runs ``search_query`` and renders
    one card per returned tweet below the total hit count. If there are no
    results, a short "nothing found" notice is rendered instead.
    """

    @output
    @render.ui
    def searchable_tweet_ui():
        query = input.search_input()
        sorting_method = input.sorting_method()
        selected_count = str(input.tweet_count())
        # "all" is passed on as -1, which search_query treats as "no limit".
        limit = -1 if selected_count == "all" else int(selected_count)

        # search_query may signal "no hits" either with a bare None or with a
        # (None, count) tuple; accept both so unpacking can never fail.
        outcome = search_query(query, limit, sorting_method=sorting_method)
        result_pd, found_tweets = outcome if outcome is not None else (None, 0)
        if result_pd is None:
            return ui.div(ui.h5("Keine Ergebnisse gefunden!"))

        # Row-wise iteration is slow in pandas, but each row needs its own UI
        # element and the result set is limited.
        cards = [_tweet_card(row) for _, row in result_pd.iterrows()]

        return ui.page_fluid(
            ui.HTML(_TWEET_BOX_CSS),
            ui.HTML(f"Gesamt gefundene Tweets: <b>{found_tweets}</b>"),
            ui.HTML("<hr><br>"),
            *cards,
        )
|
|
@ -18,7 +18,6 @@ def welcome_ui():
|
||||||
|
|
||||||
[0]: https://ag-link.xyz
|
[0]: https://ag-link.xyz
|
||||||
"""),
|
"""),
|
||||||
# ui.output_text("dataset_infos"),
|
|
||||||
ui.output_ui("dataset_infos"),
|
ui.output_ui("dataset_infos"),
|
||||||
ui.h3("Ursprung der Idee"),
|
ui.h3("Ursprung der Idee"),
|
||||||
ui.markdown("""
|
ui.markdown("""
|
||||||
|
@ -52,7 +51,7 @@ with open("data/general_analysis_results.json", "r") as f:
|
||||||
|
|
||||||
|
|
||||||
@ module.server
|
@ module.server
|
||||||
def welcome_server(input, output, session, starting_value=0):
|
def welcome_server(input, output, session):
|
||||||
@output
|
@output
|
||||||
@render.ui
|
@render.ui
|
||||||
def dataset_infos():
|
def dataset_infos():
|
||||||
|
|
Loading…
Reference in a new issue