{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-md",
   "metadata": {},
   "source": [
    "# Resolve shortened `t.co` links in the copbird tweets\n",
    "\n",
    "This notebook loads the copbird tweet, entity and user tables, extracts the first `https://t.co/...` short link of every tweet, follows its redirects and stores the final URL.\n",
    "\n",
    "Results:\n",
    "\n",
    "- `links.csv`: one `tweet index,resolved URL` row per tweet (no header row), written while the loop runs.\n",
    "- `tweets_link[\"link_long\"]`: the resolved URL next to the short link.\n",
    "\n",
    "Links that are missing or cannot be resolved are recorded as `error`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eecbeeb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from tqdm import tqdm  # progress display for pandas and plain loops\n",
    "\n",
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "957a6b06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input tables\n",
    "DATA_DIR = Path(\"../data\")\n",
    "tweet_csv = DATA_DIR / \"copbird_table_tweet.csv\"\n",
    "entity_csv = DATA_DIR / \"copbird_table_entity.csv\"\n",
    "user_csv = DATA_DIR / \"copbird_table_user.csv\"\n",
    "\n",
    "# Output file: one \"tweet index,resolved URL\" row per tweet, no header row\n",
    "LINKS_CSV = Path(\"links.csv\")\n",
    "\n",
    "limit = None  # rows to read per table; None reads everything\n",
    "MAX_LINKS = None  # number of links to resolve; set a small int (e.g. 9) for a trial run\n",
    "REQUEST_TIMEOUT = 10  # seconds to wait for each HTTP request\n",
    "\n",
    "SHORT_LINK_PREFIX = \"https://t.co/\"\n",
    "# The capture group is the short link itself: the prefix plus at least one word character.\n",
    "SHORT_LINK_PATTERN = r\"(https://t\\.co/\\w+)\"\n",
    "ERROR_MARKER = \"error\"  # stored instead of a URL when a link cannot be resolved"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-md",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ad0f35a",
   "metadata": {},
   "outputs": [],
   "source": [
    "tweets = pd.read_csv(tweet_csv, nrows=limit)\n",
    "entities = pd.read_csv(entity_csv, nrows=limit)\n",
    "users = pd.read_csv(user_csv, nrows=limit)\n",
    "\n",
    "tweets.shape  # (rows, columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "extract-md",
   "metadata": {},
   "source": [
    "## Extract short links\n",
    "\n",
    "Keep only tweets whose text contains a `t.co` link and store the first such link of each tweet in the `link` column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0a49030",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Literal substring test; na=False treats tweets without text as \"no link\"\n",
    "# so the mask is strictly boolean.\n",
    "has_link = tweets[\"tweet_text\"].str.contains(SHORT_LINK_PREFIX, regex=False, na=False)\n",
    "\n",
    "# .loc[...] followed by .assign(...) builds a new frame, so `tweets` stays untouched\n",
    "# and no SettingWithCopyWarning is raised.\n",
    "tweets_link = tweets.loc[has_link].assign(\n",
    "    link=lambda frame: frame[\"tweet_text\"].str.extract(SHORT_LINK_PATTERN, expand=False)\n",
    ")\n",
    "\n",
    "if MAX_LINKS is not None:\n",
    "    tweets_link = tweets_link.head(MAX_LINKS)\n",
    "\n",
    "tweets_link.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "resolve-md",
   "metadata": {},
   "source": [
    "## Resolve short links\n",
    "\n",
    "Every short link is requested once and the URL reached after all redirects is recorded. This cell is slow for the full data set; set `MAX_LINKS` in the configuration cell for a quick trial run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5e2d37e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def resolve_link(session: requests.Session, url: object, timeout: float = REQUEST_TIMEOUT) -> str:\n",
    "    \"\"\"Return the URL reached after following the redirects of ``url``.\n",
    "\n",
    "    Args:\n",
    "        session: HTTP session used for the request (reuses connections).\n",
    "        url: Short link to resolve; anything that is not a string (e.g. NaN\n",
    "            for a tweet without an extractable link) counts as missing.\n",
    "        timeout: Seconds to wait for the server before giving up.\n",
    "\n",
    "    Returns:\n",
    "        The final URL, or ``ERROR_MARKER`` if the link is missing or the\n",
    "        request fails.\n",
    "    \"\"\"\n",
    "    if not isinstance(url, str):\n",
    "        return ERROR_MARKER\n",
    "    try:\n",
    "        # stream=True: only the final URL is needed, so the body is not downloaded.\n",
    "        with session.get(url, timeout=timeout, stream=True) as response:\n",
    "            return response.url\n",
    "    except requests.RequestException:\n",
    "        return ERROR_MARKER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b768104e",
   "metadata": {},
   "outputs": [],
   "source": [
    "response_arr = []\n",
    "\n",
    "# Mode \"w\": every run starts a fresh file, so re-running never duplicates rows.\n",
    "with requests.Session() as session, LINKS_CSV.open(\"w\", newline=\"\", encoding=\"utf-8\") as links_file:\n",
    "    writer = csv.writer(links_file)\n",
    "    link_items = tqdm(tweets_link[\"link\"].items(), total=len(tweets_link), desc=\"resolving links\")\n",
    "    for tweet_index, short_link in link_items:\n",
    "        full_link = resolve_link(session, short_link)\n",
    "        response_arr.append(full_link)\n",
    "        writer.writerow([tweet_index, full_link])\n",
    "        links_file.flush()  # keep finished rows on disk if the run is interrupted\n",
    "\n",
    "len(response_arr)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "attach-md",
   "metadata": {},
   "source": [
    "## Attach resolved links to the tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c5f3260",
   "metadata": {},
   "outputs": [],
   "source": [
    "# response_arr holds one entry per row of tweets_link, in row order, so the\n",
    "# list is attached by position instead of by (non-contiguous) index label.\n",
    "assert len(response_arr) == len(tweets_link), \"one resolved link per tweet expected\"\n",
    "\n",
    "tweets_link = tweets_link.assign(link_long=response_arr)\n",
    "tweets_link[[\"link\", \"link_long\"]].head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}