diff --git a/app.py b/app.py
index fc029d2..4c0fba7 100644
--- a/app.py
+++ b/app.py
@@ -43,7 +43,7 @@ app_ui = ui.page_navbar(
),
align="right",
),
- selected="search_engine",
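+    # hypothetical clarifying comment: open the app on the intro tab by default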
+ selected="intro",
fluid=False,
title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"),
ui.h1("Copbird")),
diff --git a/src/mod_searchable.py b/src/mod_searchable.py
index bea73a2..509f4be 100644
--- a/src/mod_searchable.py
+++ b/src/mod_searchable.py
@@ -4,7 +4,6 @@ from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle
-import re
tfidf_matrix_path = "data/tfidf_matrix.pckl"
tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
@@ -14,21 +13,6 @@ tweets_path = "data/tweets_all_combined.csv"
reply_html_svg = ''
retweet_html_svg = ''
like_html_svg = ''
-quote_count_svg = ''
-
-link_regex = r"(https?://\S+)"
-hashtag_regex = r"#(\w+)"
-
-
-def replace_link(match):
- url = match.group(0)
- return f'<a href="{url}">{url}</a>'
-
-
-def replace_hastag(match):
- hashtag = match.group(0)
- name = str(hashtag).removeprefix("#")
- return f'<a href="https://twitter.com/hashtag/{name}">{hashtag}</a>'
print("Loading data from storage")
@@ -45,10 +29,10 @@ with open(tfidf_vectorizer_path, "rb") as f:
tweets["relevance_score"] = relevance_score["relevance_score"]
-tweets = tweets.drop(["user_id", "measured_at"], axis=1)
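+# drop columns the search view no longer displays or links to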
+tweets = tweets.drop(["user_id", "measured_at", "tweet_id"], axis=1)
-def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
+def search_query(query: str, limit: int = 5) -> pd.DataFrame:
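+ # vectorize the query with the fitted TF-IDF vectorizer and score every tweet by cosine similarity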
query_vec = tfidf_vectorizer.transform([query])
similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
@@ -60,60 +44,32 @@ def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (
if not len(result):
return None
- if limit == -1:
- limit = len(result)
-
- results = None
- if sorting_method == "score":
- overall = (0.6 * result['relevance_score']) * similarity[correct_indices]
- results = result.loc[overall.sort_values(ascending=False).index].head(limit)
- elif sorting_method == "date_new":
- results = result.sort_values(by="created_at", ascending=False).head(limit)
- elif sorting_method == "date_old":
- results = result.sort_values(by="created_at", ascending=True).head(limit)
-
- return results, len(result)
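+ # weight each hit's cosine similarity by its precomputed relevance_score and keep the top rows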
+ overall = result['relevance_score'] * similarity[correct_indices]
+ return result.loc[overall.sort_values(ascending=False).index].head(limit)
-@ module.ui
+@module.ui
def searchable_ui():
return ui.div(
ui.h2("Tweet Suchmaschine"),
- ui.HTML("<br>"),
- ui.row(
- ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
- ui.column(3,
- ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
- ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
- style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
- style="justify-content:space-between;"
-
- ),
+ ui.input_text("search_input", "Suche:", placeholder="Gebe Suchterm ein", value="Leipzig"),
+ ui.HTML("<br>"),
ui.output_ui(id="searchable_tweet_ui"),
)
@ module.server
def searchable_server(input: Inputs, output: Outputs, session: Session):
- @ output
- @ render.ui
+ @output
+ @render.ui
def searchable_tweet_ui():
query = input.search_input()
- sorting_method = input.sorting_method()
- result_count = str(input.tweet_count())
- if result_count == "all":
- result_count = -1
- else:
- result_count = int(result_count)
- result_pd, found_tweets = search_query(query, result_count, sorting_method=sorting_method)
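+ # fetch the 15 most relevant tweets for the current search term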
+ result_pd = search_query(query, 15)
- style = "text-align: center;"
- tweet_ui = ui.page_fluid(
- ui.HTML(f"Gesamt gefundene Tweets: {found_tweets}"),
- ui.HTML("<br>"),
- )
+ style = "text-align: center; padding-top: 0.5em;"
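+ # empty page container; one styled div per result tweet is appended below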
+ tweet_ui = ui.page_fluid()
if result_pd is None:
return ui.div(
@@ -122,53 +78,27 @@ def searchable_server(input: Inputs, output: Outputs, session: Session):
# iterating over dataframe is bad but needed
for idx, row in result_pd.iterrows():
-
- # prettify tweet text
- tweet_text = str(row["tweet_text"]).replace("\\n", "<br>")
- tweet_text = re.sub(link_regex, replace_link, tweet_text)
- tweet_text = re.sub(hashtag_regex, replace_hastag, tweet_text)
-
- tweet_link = f"https://twitter.com/{row['handle']}/status/{row['tweet_id']}"
-
- user_handle = row['handle']
- user_name = row['user_name']
-
tweet_ui.append(
ui.div(
ui.row(
- ui.column(6, ui.HTML(
- f"{user_name} @{user_handle}"), style=style + "padding-top: 1.5em; "),
- ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"),
+ ui.column(9, ui.markdown(
+ f"**{row['user_name']}** *@{row['handle']}*"), style=style),
+ ui.column(3, ui.p(f"{row['created_at']}"), style=style),
),
ui.row(
- ui.column(12, ui.HTML("<br>"),
- ui.HTML(f"""
-
- {tweet_text}
- """),
- ui.HTML("<br>")),
+ ui.column(12, ui.HTML(str(row["tweet_text"]).replace(
+ "\\n", "
")), style=style + "font-size: 20px; padding:1em;"),
),
ui.row(
ui.column(3, ui.HTML(reply_html_svg), ui.p(
- f"{row['reply_count']}"), style=style, title="Antworten"),
+ f"{row['reply_count']}"), style=style),
ui.column(3, ui.HTML(retweet_html_svg), ui.p(
- f"{row['retweet_count']}"), style=style, title="Retweets"),
+ f"{row['retweet_count']}"), style=style),
ui.column(3, ui.HTML(like_html_svg), ui.p(
- f"{row['like_count']}"), style=style, title="Likes"),
- ui.column(3, ui.HTML(quote_count_svg), ui.p(
- f"{row['quote_count']}"), style=style, title="Quotes"),
- ), style="border: 2px solid #119; margin-bottom: 1.5em; border-radius: 10px;"))
+ f"{row['like_count']}"), style=style),
+ # quote_count indicates approximately how many times this Tweet has been quoted by Twitter users
+ # TODO: use a nice svg for quote_count
+ ui.column(3, ui.p(f"Quote Count: {row['quote_count']}"), style=style),
+ ), style="border: 1px solid #954; margin-bottom: 1em;"))
return tweet_ui