copbird_aufarbeitung/ergebnisse_hackathon_repo/team-22/cop_link.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "5eecbeeb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm  # progress display for pandas\n",
    "\n",
    "tqdm.pandas()\n",
    "\n",
    "# Row cap applied to every table; None loads the complete files.\n",
    "limit = None\n",
    "\n",
    "# Locations of the three CSV tables.\n",
    "tweet_csv = '../data/copbird_table_tweet.csv'\n",
    "entity_csv = '../data/copbird_table_entity.csv'\n",
    "user_csv = '../data/copbird_table_user.csv'\n",
    "\n",
    "# Load all tables with the same row cap.\n",
    "tweets, entities, users = (\n",
    "    pd.read_csv(csv_path, nrows=limit)\n",
    "    for csv_path in (tweet_csv, entity_csv, user_csv)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "1ad0f35a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "360008"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrame.size is the number of cells (rows * columns), not the number of tweets.\n",
    "tweets.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "c0a49030",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.8/dist-packages/pandas/core/strings/accessor.py:101: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.\n",
      "  return func(self, *args, **kwargs)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tweet_text</th>\n",
       "      <th>created_at</th>\n",
       "      <th>user_id</th>\n",
       "      <th>like_count</th>\n",
       "      <th>retweet_count</th>\n",
       "      <th>reply_count</th>\n",
       "      <th>quote_count</th>\n",
       "      <th>contains</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1321023114071969792</td>\n",
       "      <td>#Zeugengesucht\\nDie Hintergründe zu dem Tötung...</td>\n",
       "      <td>2020-10-27 09:37:08</td>\n",
       "      <td>2397974054</td>\n",
       "      <td>20.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1321025127388188673</td>\n",
       "      <td>RT @bka: EUROPE´S MOST WANTED – Sexualstraftät...</td>\n",
       "      <td>2020-10-27 09:45:08</td>\n",
       "      <td>2397974054</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    id                                         tweet_text  \\\n",
       "1  1321023114071969792  #Zeugengesucht\\nDie Hintergründe zu dem Tötung...   \n",
       "2  1321025127388188673  RT @bka: EUROPE´S MOST WANTED – Sexualstraftät...   \n",
       "\n",
       "            created_at     user_id  like_count  retweet_count  reply_count  \\\n",
       "1  2020-10-27 09:37:08  2397974054        20.0           24.0          4.0   \n",
       "2  2020-10-27 09:45:08  2397974054         NaN            NaN          NaN   \n",
       "\n",
       "   quote_count  contains  \n",
       "1          1.0      True  \n",
       "2          NaN      True  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Flag tweets containing a t.co short link. A literal substring test avoids the\n",
    "# regex capture-group warning; na=False maps missing tweet texts to False so the\n",
    "# result is a clean boolean mask.\n",
    "tweets[\"contains\"] = tweets[\"tweet_text\"].str.contains('https://t.co/', regex=False, na=False)\n",
    "# Take an explicit copy so that columns added to the subset later do not trigger\n",
    "# SettingWithCopyWarning.\n",
    "tweets_subset = tweets[tweets[\"contains\"]].copy()\n",
    "tweets_link = tweets_subset  # alias of the same frame\n",
    "tweets_link.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "a5e2d37e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-43-393ebe765dff>:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t.co/[\\w]*)[\\w$|\\s|\\n][^(.*).{1} ]')\n"
     ]
    }
   ],
   "source": [
    "# Pull the first t.co short link out of each tweet. The capture group is the whole\n",
    "# link, with no trailing context required, so a link at the very end of the text is\n",
    "# extracted completely. expand=False returns a Series; tweets without a match get NaN.\n",
    "tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t\\.co/\\w+)', expand=False)\n",
    "tryout = tweets_subset  # alias of the same frame\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "957a6b06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leftover debugging switch: uncomment to work on only the first 9 rows of tryout.\n",
    "#tryout = tryout.head(9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b768104e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "\n",
    "REQUEST_TIMEOUT_S = 10  # give up on a link after this many seconds instead of hanging\n",
    "\n",
    "# Resolve every short link to its final URL. Results are collected in response_arr\n",
    "# (one entry per row of tryout, in row order) and appended to links.csv row by row,\n",
    "# so partial progress survives an interrupted run.\n",
    "response_arr = []\n",
    "\n",
    "for i, short_link in tryout[\"link\"].items():\n",
    "    try:\n",
    "        # requests follows redirects by default; .url is the final address.\n",
    "        full_link = requests.get(short_link, timeout=REQUEST_TIMEOUT_S).url\n",
    "    except (requests.RequestException, ValueError):\n",
    "        # Covers missing links (NaN), malformed URLs, timeouts and connection errors.\n",
    "        # Unlike a bare except, this still lets KeyboardInterrupt stop the loop.\n",
    "        full_link = \"error\"\n",
    "    response_arr.append(full_link)\n",
    "    # One CSV row per tweet: index label of the tweet, resolved URL (or \"error\").\n",
    "    pd.DataFrame({\"full_link\": [full_link]}, index=[i]).to_csv('links.csv', mode='a', header=False)\n",
    "\n",
    "# All results as one frame, labelled like tryout.\n",
    "df = pd.DataFrame({\"full_link\": response_arr}, index=tryout.index)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1c5f3260",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'response_arr' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-30722b9a87cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresponse_arr_orig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"nan\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'response_arr' is not defined"
     ]
    }
   ],
   "source": [
    "response_arr_orig = response_arr\n",
    "# Attach the resolved URLs to tryout by position. response_arr is expected to hold\n",
    "# one entry per row of tryout, in row order; tryout keeps the index labels of the\n",
    "# rows it was filtered from, so writing by label 0..n-1 could hit the wrong rows.\n",
    "index = min(len(response_arr), len(tryout))  # number of rows that get a URL\n",
    "# Rows without a result (e.g. after an interrupted run) stay NaN.\n",
    "tryout[\"link_long\"] = list(response_arr[:index]) + [float(\"nan\")] * (len(tryout) - index)\n",
    "print(index)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}