diff --git a/app.py b/app.py
index fc029d2..4c0fba7 100644
--- a/app.py
+++ b/app.py
@@ -43,7 +43,7 @@ app_ui = ui.page_navbar(
         ),
         align="right",
     ),
-    selected="search_engine",
+    selected="intro",
     fluid=False,
     title=ui.div(ui.img(src="favicon.ico", width="75dpi", height="75dpi"), ui.h1("Copbird")),
diff --git a/src/mod_searchable.py b/src/mod_searchable.py
index bea73a2..509f4be 100644
--- a/src/mod_searchable.py
+++ b/src/mod_searchable.py
@@ -4,7 +4,6 @@ from sklearn.metrics.pairwise import cosine_similarity
 import pandas as pd
 import numpy as np
 import pickle
-import re
 
 tfidf_matrix_path = "data/tfidf_matrix.pckl"
 tfidf_vectorizer_path = "data/tfidf_vectorizer.pckl"
@@ -14,21 +13,6 @@ tweets_path = "data/tweets_all_combined.csv"
 
 reply_html_svg = ''
 retweet_html_svg = ''
 like_html_svg = ''
-quote_count_svg = ' '
-
-link_regex = r"(https?://\S+)"
-hashtag_regex = r"#(\w+)"
-
-
-def replace_link(match):
-    url = match.group(0)
-    return f'{url}'
-
-
-def replace_hastag(match):
-    hashtag = match.group(0)
-    name = str(hashtag).removeprefix("#")
-    return f'{hashtag}'
 
 print("Loading data from storage")
@@ -45,10 +29,10 @@ with open(tfidf_vectorizer_path, "rb") as f:
 
 tweets["relevance_score"] = relevance_score["relevance_score"]
 
-tweets = tweets.drop(["user_id", "measured_at"], axis=1)
+tweets = tweets.drop(["user_id", "measured_at", "tweet_id"], axis=1)
 
 
-def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (pd.DataFrame, int):
+def search_query(query: str, limit: int = 5) -> pd.DataFrame:
     query_vec = tfidf_vectorizer.transform([query])
     similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
 
@@ -60,60 +44,32 @@ def search_query(query: str, limit: int = 5, sorting_method: str = "score") -> (
     if not len(result):
         return None
 
-    if limit == -1:
-        limit = len(result)
-
-    results = None
-    if sorting_method == "score":
-        overall = (0.6 * result['relevance_score']) * similarity[correct_indices]
-        results = result.loc[overall.sort_values(ascending=False).index].head(limit)
-    elif sorting_method == "date_new":
-        results = result.sort_values(by="created_at", ascending=False).head(limit)
-    elif sorting_method == "date_old":
-        results = result.sort_values(by="created_at", ascending=True).head(limit)
-
-    return results, len(result)
+    overall = result['relevance_score'] * similarity[correct_indices]
+    return result.loc[overall.sort_values(ascending=False).index].head(limit)
 
 
-@ module.ui
+@module.ui
 def searchable_ui():
     return ui.div(
         ui.h2("Tweet Suchmaschine"),
-        ui.HTML("<br>"),
-        ui.row(
-            ui.column(6, ui.input_text("search_input", "Suche", placeholder="Gib Suchterm ein", value="Leipzig", width="100%")),
-            ui.column(3,
-                      ui.input_select("sorting_method", "Sortierung", {"score": "Relevanz", "date_new": "Neuste Zuerst", "date_old": "Älteste Zuerst"}, selected="score", selectize=True, width="12em"),
-                      ui.input_select("tweet_count", "Ergebnisse", {"5": "5", "20": "20", "50": "50", "all": "alle"}, selected="5", selectize=True, width="12em"),
-                      style="display: flex; flex-direction: column; align-items: center; justify-content: center;"),
-            style="justify-content:space-between;"
-
-        ),
+        ui.input_text("search_input", "Suche:", placeholder="Gebe Suchterm ein", value="Leipzig"),
+        ui.HTML("<br>"),
         ui.output_ui(id="searchable_tweet_ui"),
     )
 
 
 @ module.server
 def searchable_server(input: Inputs, output: Outputs, session: Session):
-    @ output
-    @ render.ui
+    @output
+    @render.ui
     def searchable_tweet_ui():
         query = input.search_input()
-        sorting_method = input.sorting_method()
-        result_count = str(input.tweet_count())
-        if result_count == "all":
-            result_count = -1
-        else:
-            result_count = int(result_count)
-        result_pd, found_tweets = search_query(query, result_count, sorting_method=sorting_method)
+        result_pd = search_query(query, 15)
 
-        style = "text-align: center;"
-        tweet_ui = ui.page_fluid(
-            ui.HTML(f"Gesamt gefundene Tweets: {found_tweets}"),
-            ui.HTML("<br><br>"),
-        )
+        style = "text-align: center; padding-top: 0.5em;"
+        tweet_ui = ui.page_fluid()
 
         if result_pd is None:
             return ui.div(
@@ -122,53 +78,27 @@ def searchable_server(input: Inputs, output: Outputs, session: Session):
         # iterating over dataframe is bad but needed
         for idx, row in result_pd.iterrows():
-
-            # prettify tweet text
-            tweet_text = str(row["tweet_text"]).replace("\\n", "<br>")
-            tweet_text = re.sub(link_regex, replace_link, tweet_text)
-            tweet_text = re.sub(hashtag_regex, replace_hastag, tweet_text)
-
-            tweet_link = f"https://twitter.com/{row['handle']}/status/{row['tweet_id']}"
-
-            user_handle = row['handle']
-            user_name = row['user_name']
-
             tweet_ui.append(
                 ui.div(
                     ui.row(
-                        ui.column(6, ui.HTML(
-                            f"{user_name}@{user_handle}"), style=style + "padding-top: 1.5em; "),
-                        ui.column(6, ui.p(f"{row['created_at']}"), style=style + "padding-top: 1.5em;"),
+                        ui.column(9, ui.markdown(
+                            f"**{row['user_name']}** *@{row['handle']}*"), style=style),
+                        ui.column(3, ui.p(f"{row['created_at']}"), style=style),
                     ),
                     ui.row(
-                        ui.column(12, ui.HTML("<br>"),
-                                  ui.HTML(f"""
-                                  {tweet_text}
-                                  """),
-                                  ui.HTML("<br>")),
+                        ui.column(12, ui.HTML(str(row["tweet_text"]).replace(
+                            "\\n", "<br>")), style=style + "font-size: 20px; padding:1em;"),
                     ),
                     ui.row(
                         ui.column(3, ui.HTML(reply_html_svg), ui.p(
-                            f"{row['reply_count']}"), style=style, title="Antworten"),
+                            f"{row['reply_count']}"), style=style),
                         ui.column(3, ui.HTML(retweet_html_svg), ui.p(
-                            f"{row['retweet_count']}"), style=style, title="Retweets"),
+                            f"{row['retweet_count']}"), style=style),
                         ui.column(3, ui.HTML(like_html_svg), ui.p(
-                            f"{row['like_count']}"), style=style, title="Likes"),
-                        ui.column(3, ui.HTML(quote_count_svg), ui.p(
-                            f"{row['quote_count']}"), style=style, title="Quotes"),
-                    ), style="border: 2px solid #119; margin-bottom: 1.5em; border-radius: 10px;"))
+                            f"{row['like_count']}"), style=style),
+                        # quote_count: . Indicates approximately how many times this Tweet has been quoted by Twitter users. Example:
+                        # TODO: use a nice svg for quote_count
+                        ui.column(3, ui.p(f"Quote Count: {row['quote_count']}"), style=style),
+                    ), style="border: 1px solid #954; margin-bottom: 1em;"))
 
         return tweet_ui