copbird_aufarbeitung/ergebnisse_hackathon_repo/team-22/cop_link.ipynb

256 lines
8.5 KiB
Text
Raw Normal View History

2023-03-26 16:36:49 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"id": "5eecbeeb",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm # Fortschrittsanzeige für pandas\n",
"tqdm.pandas()\n",
"tweet_csv = '../data/copbird_table_tweet.csv'\n",
"entity_csv = '../data/copbird_table_entity.csv'\n",
"user_csv = '../data/copbird_table_user.csv'\n",
"\n",
"limit = None\n",
"tweets = pd.read_csv(tweet_csv, nrows=limit)\n",
"entities = pd.read_csv(entity_csv, nrows=limit)\n",
"users = pd.read_csv(user_csv, nrows=limit)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "1ad0f35a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"360008"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweets.size"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c0a49030",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.8/dist-packages/pandas/core/strings/accessor.py:101: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.\n",
" return func(self, *args, **kwargs)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>tweet_text</th>\n",
" <th>created_at</th>\n",
" <th>user_id</th>\n",
" <th>like_count</th>\n",
" <th>retweet_count</th>\n",
" <th>reply_count</th>\n",
" <th>quote_count</th>\n",
" <th>contains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1321023114071969792</td>\n",
" <td>#Zeugengesucht\\nDie Hintergründe zu dem Tötung...</td>\n",
" <td>2020-10-27 09:37:08</td>\n",
" <td>2397974054</td>\n",
" <td>20.0</td>\n",
" <td>24.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1321025127388188673</td>\n",
" <td>RT @bka: EUROPE´S MOST WANTED Sexualstraftät...</td>\n",
" <td>2020-10-27 09:45:08</td>\n",
" <td>2397974054</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id tweet_text \\\n",
"1 1321023114071969792 #Zeugengesucht\\nDie Hintergründe zu dem Tötung... \n",
"2 1321025127388188673 RT @bka: EUROPE´S MOST WANTED Sexualstraftät... \n",
"\n",
" created_at user_id like_count retweet_count reply_count \\\n",
"1 2020-10-27 09:37:08 2397974054 20.0 24.0 4.0 \n",
"2 2020-10-27 09:45:08 2397974054 NaN NaN NaN \n",
"\n",
" quote_count contains \n",
"1 1.0 True \n",
"2 NaN True "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweets[\"contains\"] = tweets[\"tweet_text\"].str.contains(r'(https://t.co/)')\n",
"tweets_subset = tweets[tweets[\"contains\"]]\n",
"tweets_link = tweets_subset\n",
"tweets_link.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "a5e2d37e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-43-393ebe765dff>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t.co/[\\w]*)[\\w$|\\s|\\n][^(.*).{1} ]')\n"
]
}
],
"source": [
"tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t.co/[\\w]*)[\\w$|\\s|\\n][^(.*).{1} ]')\n",
"tryout = tweets_subset\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "957a6b06",
"metadata": {},
"outputs": [],
"source": [
"#tryout = tryout.head(9)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b768104e",
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"response_arr = []\n",
"#url = tryout[\"link\"].to_string(index=False)\n",
"df = pd.DataFrame(data=response_arr, columns=[\"full_link\"])\n",
"\n",
"for i in tryout.index:\n",
" try:\n",
" response = requests.get(tryout[\"link\"][i])\n",
" response_arr.append(response.url)\n",
" \n",
" except:\n",
" response_arr.append(\"error\")\n",
" finally:\n",
" \n",
" df = pd.DataFrame([i, response.url], columns=[\"full_link\"])\n",
" df.to_csv('links.csv', mode='a', header=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1c5f3260",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'response_arr' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-30722b9a87cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresponse_arr_orig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"nan\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'response_arr' is not defined"
]
}
],
"source": [
"response_arr_orig = response_arr\n",
"tryout[\"link_long\"]=float(\"nan\")\n",
"index = 0\n",
"for i in response_arr:\n",
" tryout[\"link_long\"][index]=i\n",
" index = index+1\n",
"print(index)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}