copbird_aufarbeitung/ergebnisse_hackathon_repo/team-22/cop_link.ipynb
2023-03-26 18:36:49 +02:00

255 lines
8.5 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"id": "5eecbeeb",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm  # progress bar support for pandas\n",
"tqdm.pandas()\n",
"\n",
"# Locations of the three CSV tables, relative to this notebook.\n",
"tweet_csv, entity_csv, user_csv = (\n",
"    '../data/copbird_table_tweet.csv',\n",
"    '../data/copbird_table_entity.csv',\n",
"    '../data/copbird_table_user.csv',\n",
")\n",
"\n",
"# Row cap applied to every table; None reads the files completely.\n",
"limit = None\n",
"tweets, entities, users = (\n",
"    pd.read_csv(csv_path, nrows=limit)\n",
"    for csv_path in (tweet_csv, entity_csv, user_csv)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "1ad0f35a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"360008"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweets.size"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c0a49030",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.8/dist-packages/pandas/core/strings/accessor.py:101: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.\n",
" return func(self, *args, **kwargs)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>tweet_text</th>\n",
" <th>created_at</th>\n",
" <th>user_id</th>\n",
" <th>like_count</th>\n",
" <th>retweet_count</th>\n",
" <th>reply_count</th>\n",
" <th>quote_count</th>\n",
" <th>contains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1321023114071969792</td>\n",
" <td>#Zeugengesucht\\nDie Hintergründe zu dem Tötung...</td>\n",
" <td>2020-10-27 09:37:08</td>\n",
" <td>2397974054</td>\n",
" <td>20.0</td>\n",
" <td>24.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1321025127388188673</td>\n",
" <td>RT @bka: EUROPE´S MOST WANTED Sexualstraftät...</td>\n",
" <td>2020-10-27 09:45:08</td>\n",
" <td>2397974054</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id tweet_text \\\n",
"1 1321023114071969792 #Zeugengesucht\\nDie Hintergründe zu dem Tötung... \n",
"2 1321025127388188673 RT @bka: EUROPE´S MOST WANTED Sexualstraftät... \n",
"\n",
" created_at user_id like_count retweet_count reply_count \\\n",
"1 2020-10-27 09:37:08 2397974054 20.0 24.0 4.0 \n",
"2 2020-10-27 09:45:08 2397974054 NaN NaN NaN \n",
"\n",
" quote_count contains \n",
"1 1.0 True \n",
"2 NaN True "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Flag every tweet whose text contains a t.co short link.\n",
"# A literal substring test (regex=False) avoids the capture-group warning and\n",
"# the wildcard meaning of '.'; na=False keeps the mask strictly boolean even\n",
"# when tweet_text is missing.\n",
"tweets[\"contains\"] = tweets[\"tweet_text\"].str.contains('https://t.co/', regex=False, na=False)\n",
"# Take an explicit copy so that columns added to the subset later are not\n",
"# written into a slice of `tweets` (SettingWithCopyWarning).\n",
"tweets_subset = tweets[tweets[\"contains\"]].copy()\n",
"tweets_link = tweets_subset\n",
"tweets_link.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "a5e2d37e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-43-393ebe765dff>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t.co/[\\w]*)[\\w$|\\s|\\n][^(.*).{1} ]')\n"
]
}
],
"source": [
"# Extract the first t.co short link of every tweet into the \"link\" column.\n",
"# The pattern captures the complete alphanumeric slug and requires nothing\n",
"# after it, so a link at the very end of a tweet or one followed by\n",
"# punctuation is captured in full instead of losing its last characters.\n",
"# expand=False returns a Series; tweets without a match get NaN.\n",
"tweets_subset[\"link\"] = tweets_subset[\"tweet_text\"].str.extract(r'(https://t\\.co/\\w+)', expand=False)\n",
"tryout = tweets_subset\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "957a6b06",
"metadata": {},
"outputs": [],
"source": [
"#tryout = tryout.head(9)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b768104e",
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"\n",
"REQUEST_TIMEOUT = 10  # seconds; keeps one unresponsive host from stalling the whole run\n",
"\n",
"\n",
"def resolve_link(short_link):\n",
"    \"\"\"Follow the redirects of `short_link` and return the final URL.\n",
"\n",
"    Returns the string \"error\" when no link is available or the request fails.\n",
"    \"\"\"\n",
"    if not isinstance(short_link, str):\n",
"        # No link was extracted for this tweet (NaN), so there is nothing to request.\n",
"        return \"error\"\n",
"    try:\n",
"        return requests.get(short_link, timeout=REQUEST_TIMEOUT).url\n",
"    except (requests.RequestException, ValueError):\n",
"        # Best effort: a dead or malformed link must not abort the run, but\n",
"        # KeyboardInterrupt still gets through so the loop can be stopped.\n",
"        return \"error\"\n",
"\n",
"\n",
"# One resolved URL (or \"error\") per row of `tryout`, in row order.\n",
"response_arr = []\n",
"\n",
"for i, short_link in tryout[\"link\"].items():\n",
"    full_link = resolve_link(short_link)\n",
"    response_arr.append(full_link)\n",
"    # Append exactly one row (tweet index, resolved URL) per tweet right away so\n",
"    # that partial results survive an interrupted run. Note that re-running the\n",
"    # cell appends to an existing links.csv again.\n",
"    pd.DataFrame({\"full_link\": [full_link]}, index=[i]).to_csv('links.csv', mode='a', header=False)\n",
"\n",
"# All resolved links in one frame, labelled with the index of `tryout`.\n",
"df = pd.DataFrame({\"full_link\": response_arr}, index=tryout.index)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1c5f3260",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'response_arr' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-30722b9a87cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresponse_arr_orig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"nan\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse_arr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtryout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"link_long\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'response_arr' is not defined"
]
}
],
"source": [
"# Keep an independent snapshot of the resolved links; a plain assignment would\n",
"# only create a second name for the same list.\n",
"response_arr_orig = list(response_arr)\n",
"# response_arr holds one entry per processed row of `tryout`, in row order.\n",
"# Pair the entries with the frame's own index labels instead of the positions\n",
"# 0..n-1, which do not match the labels of this filtered frame. Rows that were\n",
"# not processed stay NaN.\n",
"n_resolved = min(len(response_arr), len(tryout))\n",
"tryout[\"link_long\"] = pd.Series(response_arr[:n_resolved], index=tryout.index[:n_resolved], dtype=object)\n",
"index = n_resolved\n",
"print(index)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}