256 lines
8.5 KiB
Text
256 lines
8.5 KiB
Text
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 40,
|
|||
|
"id": "5eecbeeb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"import pandas as pd\n",
"from tqdm import tqdm  # progress bars for pandas\n",
"\n",
"# Enable tqdm's progress bars for pandas operations.\n",
"tqdm.pandas()\n",
"\n",
"# Locations of the exported copbird tables.\n",
"tweet_csv = '../data/copbird_table_tweet.csv'\n",
"entity_csv = '../data/copbird_table_entity.csv'\n",
"user_csv = '../data/copbird_table_user.csv'\n",
"\n",
"# Maximum number of rows read per table; None reads every row.\n",
"limit = None\n",
"\n",
"# Load the three tables in one pass, in the same order as before.\n",
"tweets, entities, users = (\n",
"    pd.read_csv(csv_path, nrows=limit)\n",
"    for csv_path in (tweet_csv, entity_csv, user_csv)\n",
")"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"id": "1ad0f35a",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"360008"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Total number of values in the tweets table (rows x columns).\n",
"len(tweets.index) * len(tweets.columns)"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 42,
|
|||
|
"id": "c0a49030",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/pandas/core/strings/accessor.py:101: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.\n",
|
|||
|
" return func(self, *args, **kwargs)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>tweet_text</th>\n",
|
|||
|
" <th>created_at</th>\n",
|
|||
|
" <th>user_id</th>\n",
|
|||
|
" <th>like_count</th>\n",
|
|||
|
" <th>retweet_count</th>\n",
|
|||
|
" <th>reply_count</th>\n",
|
|||
|
" <th>quote_count</th>\n",
|
|||
|
" <th>contains</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1321023114071969792</td>\n",
|
|||
|
" <td>#Zeugengesucht\\nDie Hintergründe zu dem Tötung...</td>\n",
|
|||
|
" <td>2020-10-27 09:37:08</td>\n",
|
|||
|
" <td>2397974054</td>\n",
|
|||
|
" <td>20.0</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1321025127388188673</td>\n",
|
|||
|
" <td>RT @bka: EUROPE´S MOST WANTED – Sexualstraftät...</td>\n",
|
|||
|
" <td>2020-10-27 09:45:08</td>\n",
|
|||
|
" <td>2397974054</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id tweet_text \\\n",
|
|||
|
"1 1321023114071969792 #Zeugengesucht\\nDie Hintergründe zu dem Tötung... \n",
|
|||
|
"2 1321025127388188673 RT @bka: EUROPE´S MOST WANTED – Sexualstraftät... \n",
|
|||
|
"\n",
|
|||
|
" created_at user_id like_count retweet_count reply_count \\\n",
|
|||
|
"1 2020-10-27 09:37:08 2397974054 20.0 24.0 4.0 \n",
|
|||
|
"2 2020-10-27 09:45:08 2397974054 NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" quote_count contains \n",
|
|||
|
"1 1.0 True \n",
|
|||
|
"2 NaN True "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 42,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Flag every tweet whose text contains a t.co short link.\n",
"# A literal substring test (regex=False) avoids the match-group UserWarning and no longer\n",
"# lets the dots match arbitrary characters; na=False keeps the mask strictly boolean\n",
"# even if a tweet has no text.\n",
"tweets[\"contains\"] = tweets[\"tweet_text\"].str.contains('https://t.co/', regex=False, na=False)\n",
"\n",
"# Work on an explicit copy so that columns added later do not raise SettingWithCopyWarning.\n",
"tweets_subset = tweets[tweets[\"contains\"]].copy()\n",
"tweets_link = tweets_subset\n",
"tweets_link.head(2)"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"id": "a5e2d37e",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<ipython-input-43-393ebe765dff>:1: SettingWithCopyWarning: \n",
|
|||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|||
|
"\n",
|
|||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|||
|
" tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t.co/[\\w]*)[\\w$|\\s|\\n][^(.*).{1} ]')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Extract the first t.co short link of each tweet.\n",
"# The pattern ends with the link token itself, so a link at the very end of a tweet or\n",
"# directly followed by punctuation is captured completely (the previous pattern demanded\n",
"# two more characters after the link and cut such links short).\n",
"# expand=False returns a Series; assign() builds a new frame instead of writing into a\n",
"# slice, which avoids the SettingWithCopyWarning.\n",
"tweets_subset = tweets_subset.assign(\n",
"    link=tweets_subset[\"tweet_text\"].str.extract(r'(https://t\\.co/\\w+)', expand=False)\n",
")\n",
"tryout = tweets_subset\n"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"id": "957a6b06",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# Optional: uncomment to try the link resolution on the first 9 rows only.\n",
"#tryout = tryout.head(9)"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"id": "b768104e",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"import io\n",
"import requests\n",
"\n",
"# Seconds to wait per request; requests applies no timeout by default, so a single\n",
"# unresponsive host could otherwise block the whole loop.\n",
"REQUEST_TIMEOUT = 10\n",
"\n",
"\n",
"def resolve_link(short_link, timeout=REQUEST_TIMEOUT):\n",
"    \"\"\"Follow the redirects of a short link and return the final URL.\n",
"\n",
"    Returns the string \"error\" when the link is missing (NaN) or the request fails.\n",
"    \"\"\"\n",
"    if pd.isna(short_link):\n",
"        return \"error\"\n",
"    try:\n",
"        return requests.get(short_link, timeout=timeout).url\n",
"    except requests.RequestException:\n",
"        # Only network/HTTP problems are turned into the marker; KeyboardInterrupt\n",
"        # and programming errors still surface.\n",
"        return \"error\"\n",
"\n",
"\n",
"# One entry per row of tryout, in row order.\n",
"response_arr = []\n",
"for i, short_link in tqdm(tryout[\"link\"].items(), total=len(tryout)):\n",
"    full_link = resolve_link(short_link)\n",
"    response_arr.append(full_link)\n",
"    # Append one CSV row per tweet (tweet index, resolved URL) immediately, so an\n",
"    # interrupted run keeps the results gathered so far.\n",
"    pd.DataFrame({\"full_link\": [full_link]}, index=[i]).to_csv('links.csv', mode='a', header=False)\n",
"\n",
"# All resolved links, indexed like tryout.\n",
"df = pd.DataFrame({\"full_link\": response_arr}, index=tryout.index)"
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "1c5f3260",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "NameError",
|
|||
|
"evalue": "name 'response_arr' is not defined",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|||
|
"\u001b[0;32m<ipython-input-9-30722b9a87cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresponse_arr_orig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"nan\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;31mNameError\u001b[0m: name 'response_arr' is not defined"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Keep an independent backup of the resolved links (a plain assignment would only alias the list).\n",
"response_arr_orig = list(response_arr)\n",
"\n",
"# response_arr holds one entry per row of tryout, in row order. Label the values with\n",
"# tryout's own index so they land on the right tweets; rows without a result stay NaN.\n",
"# (A running counter used as label does not match the filtered index, and chained\n",
"# assignment may write into a temporary copy.)\n",
"resolved_links = pd.Series(response_arr, index=tryout.index[:len(response_arr)], dtype=object)\n",
"tryout = tryout.assign(link_long=resolved_links)\n",
"\n",
"# Number of links that were processed.\n",
"index = len(response_arr)\n",
"print(index)"
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.8.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|