From 8787366ed4376a9c31c91e41a4cdbe44fa66b389 Mon Sep 17 00:00:00 2001 From: procrastimax Date: Wed, 12 Jul 2023 15:08:18 +0200 Subject: [PATCH] Completes search engine for tweets --- src/mod_searchable.py | 83 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 16 deletions(-) diff --git a/src/mod_searchable.py b/src/mod_searchable.py index c59b142..bea73a2 100644 --- a/src/mod_searchable.py +++ b/src/mod_searchable.py @@ -24,6 +24,7 @@ def replace_link(match): url = match.group(0) return f'{url}' + def replace_hastag(match): hashtag = match.group(0) name = str(hashtag).removeprefix("#") @@ -44,10 +45,10 @@ with open(tfidf_vectorizer_path, "rb") as f: tweets["relevance_score"] = relevance_score["relevance_score"] -tweets = tweets.drop(["user_id", "measured_at", "tweet_id"], axis=1) +tweets = tweets.drop(["user_id", "measured_at"], axis=1) -def search_query(query: str, limit: int = 5) -> pd.DataFrame: +def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int): query_vec = tfidf_vectorizer.transform([query]) similarity = cosine_similarity(query_vec, tfidf_matrix).flatten() @@ -59,16 +60,35 @@ def search_query(query: str, limit: int = 5) -> pd.DataFrame: if not len(result): return None - overall = (0.6 * result['relevance_score']) * similarity[correct_indices] - return result.loc[overall.sort_values(ascending=False).index].head(limit) + if limit == -1: + limit = len(result) + + results = None + if sorting_method == "score": + overall = (0.6 * result['relevance_score']) * similarity[correct_indices] + results = result.loc[overall.sort_values(ascending=False).index].head(limit) + elif sorting_method == "date_new": + results = result.sort_values(by="created_at", ascending=False).head(limit) + elif sorting_method == "date_old": + results = result.sort_values(by="created_at", ascending=True).head(limit) + + return results, len(result) @ module.ui def searchable_ui(): return ui.div( ui.h2("Tweet Suchmaschine"), - ui.input_text("search_input", "Suche:", placeholder="Gebe Suchterm ein", value="Leipzig"), - ui.HTML("
"), + ui.HTML("
"), + ui.row( + ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")), + ui.column(3, + ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"), + ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"), + style="display: flex; flex-direction: column; align-items: center; justify-content: center;"), + style="justify-content:space-between;" + + ), ui.output_ui(id="searchable_tweet_ui"), ) @@ -80,11 +100,20 @@ def searchable_server(input: Inputs, output: Outputs, session: Session): def searchable_tweet_ui(): query = input.search_input() + sorting_method = input.sorting_method() + result_count = str(input.tweet_count()) + if result_count == "all": + result_count = -1 + else: + result_count = int(result_count) - result_pd = search_query(query, 25) + result_pd, found_tweets = search_query(query, result_count, sorting_method=sorting_method) style = "text-align: center;" - tweet_ui = ui.page_fluid() + tweet_ui = ui.page_fluid( + ui.HTML(f"Gesamt gefundene Tweets: {found_tweets}"), + ui.HTML("

"), + ) if result_pd is None: return ui.div( @@ -99,25 +128,47 @@ def searchable_server(input: Inputs, output: Outputs, session: Session): tweet_text = re.sub(link_regex, replace_link, tweet_text) tweet_text = re.sub(hashtag_regex, replace_hastag, tweet_text) + tweet_link = f"https://twitter.com/{row['handle']}/status/{row['tweet_id']}" + + user_handle = row['handle'] + user_name = row['user_name'] + tweet_ui.append( ui.div( ui.row( - ui.column(6, ui.markdown( - f"**{row['user_name']}** *@{row['handle']}*"), style=style + "padding-top: 0.5em;"), - ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 0.5em;"), + ui.column(6, ui.HTML( + f"{user_name}@{user_handle}"), style=style + "padding-top: 1.5em; "), + ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"), ), ui.row( - ui.column(12, ui.div(ui.HTML(tweet_text), style=style + "font-size: 20px; margin: 1em; padding: 0.25em; border: 1px solid #bbb;")), + ui.column(12, ui.HTML("
"), + ui.HTML(f""" + +
{tweet_text}
+ """), + ui.HTML("
")), ), ui.row( ui.column(3, ui.HTML(reply_html_svg), ui.p( - f"{row['reply_count']}"), style=style, title="Anzahl Antworten"), + f"{row['reply_count']}"), style=style, title="Antworten"), ui.column(3, ui.HTML(retweet_html_svg), ui.p( - f"{row['retweet_count']}"), style=style, title="Anzahl Retweets"), + f"{row['retweet_count']}"), style=style, title="Retweets"), ui.column(3, ui.HTML(like_html_svg), ui.p( - f"{row['like_count']}"), style=style, title="Anzahl Likes"), + f"{row['like_count']}"), style=style, title="Likes"), ui.column(3, ui.HTML(quote_count_svg), ui.p( - f"{row['quote_count']}"), style=style, title="Anzahl Quotes"), + f"{row['quote_count']}"), style=style, title="Quotes"), ), style="border: 2px solid #119; margin-bottom: 1.5em; border-radius: 10px;")) return tweet_ui