# Source: copbird-sna/src/create_cooccurrence_matrix.py
# (repository listing metadata: 58 lines, 4.0 KiB, Python)

import pandas as pd
import re
import numpy as np
# Input and output locations. They are relative paths, so they resolve
# against the working directory the script is started from.
tweets_path = "../raw_data/tweets.csv"
accounts_path = "../raw_data/accounts.txt"
output_path = "../data/cooccurence_matrix.csv"

# Raw tweets; the steps below read the `tweet_text` and `handle` columns.
tweets = pd.read_csv(filepath_or_buffer=tweets_path)
# Twitter handles (lowercase, without "@") of the accounts whose mentions are
# extracted from tweet texts. The list is inlined here; `accounts_path` is not
# read by this script.
_ACCOUNT_HANDLES = """
bpol_11 bpol_air_fra bpol_b bpol_b_einsatz bpol_bepo bpol_bw bpol_by
bpol_koblenz bpol_kueste bpol_nord bpol_nrw bpol_pir bremenpolizei lkabawue
lka_bayern lka_hessen lka_rlp pol_grafschaft polizeiaalen polizei_aur_wtm
polizei_badn polizeibayern polizeibb polizeibb_e polizei_bbg polizeiberlin
polizeiberlin_e polizeiberlin_p polizeibhv polizei_bs polizei_ce
polizei_clp_vec polizei_cux polizei_del polizei_dero polizei_dh polizei_el
polizei_ffm polizeifr polizei_ft polizei_ger polizei_gf polizei_goe
polizei_gs polizei_h polizei_hal polizeihamburg polizei_hi polizei_hk
polizei_hm polizeihn polizei_hol polizei_hst polizei_ka polizei_kl
polizei_ko polizeikonstanz polizeilb polizei_ler_emd polizei_lg
polizeimainz polizeimannheim polizei_md polizeimfr polizei_mh polizei_mse
polizeimuenchen polizeinb polizei_nbg polizeineustadt polizei_nh
polizeini_lka polizei_nom polizei_nrw_ac polizei_nrw_bi polizei_nrw_bn
polizei_nrw_bo polizei_nrw_bor polizei_nrw_coe polizei_nrw_d polizei_nrw_dn
polizei_nrw_do polizei_nrw_du polizei_nrw_e polizei_nrw_en polizei_nrw_eu
polizei_nrw_ge polizei_nrw_gm polizei_nrw_gt polizei_nrw_ha polizei_nrw_ham
polizei_nrw_hf polizei_nrw_hs polizei_nrw_hsk polizei_nrw_hx polizei_nrw_k
polizei_nrw_kle polizei_nrw_kr polizei_nrw_lip polizei_nrw_lka
polizei_nrw_me polizei_nrw_mg polizei_nrw_mi polizei_nrw_mk polizei_nrw_ms
polizei_nrw_ob polizei_nrw_oe polizei_nrw_pb polizei_nrw_rbk polizei_nrw_re
polizei_nrw_rek polizei_nrw_rkn polizei_nrw_si polizei_nrw_so
polizei_nrw_st polizei_nrw_su polizei_nrw_un polizei_nrw_vie polizei_nrw_w
polizei_nrw_waf polizei_nrw_wes polizeiobn polizeiobs polizeiofr polizeiog
polizei_oh polizei_oha polizei_ol polizeiopf polizei_os polizei_pf
polizei_pp_nb polizei_pp_ros polizei_ps polizei_rostock polizei_row
polizeirt polizeirv polizeisaarland polizeisachsen polizei_sdl polizei_sn
polizei_soh polizei_std polizei_sth polizei_suedhe polizeiswn polizeisws
polizei_sz polizei_thuer polizeitrier polizeiufr polizeiul polizei_ver_ohz
polizeivg polizei_wh polizei_whv_fri polizeiwittlich polizei_wl polizei_wob
polizei_zpd_ni pp_rheinpfalz pp_stuttgart sh_polizei polizeipf
""".split()

# A single capture group containing one alternative per handle, so that
# `re.findall` / `Series.str.findall` return the matched handle itself.
# The trailing \b keeps a short handle from matching a prefix of a longer one
# (e.g. `polizeibb` inside `polizeibb_e`); the leading \b keeps a handle from
# matching as the tail of a longer, unrelated handle (e.g. `xpolizei_h`).
# The pattern is lowercase; callers are expected to pass re.IGNORECASE.
accounts_regex = (
    r"\b(" + "|".join(re.escape(h) + r"\b" for h in _ACCOUNT_HANDLES) + ")"
)
# Collect, per tweet, every tracked account handle that occurs in its text
# (matched case-insensitively against `accounts_regex`).
found_handles = tweets["tweet_text"].str.findall(accounts_regex, flags=re.IGNORECASE)
tweets["mentions"] = found_handles
# A tweet naming n accounts is expanded into n rows, one per mentioned account.
tweets = tweets.explode("mentions")
# Lowercase both the author handle and the mentioned handle so that the same
# account always carries the same label.
tweets["handle"] = tweets["handle"].str.lower()
tweets["mentions"] = tweets["mentions"].str.lower()
# TODO IDEA
# we could generate multiple different cooccurrence matrices
# e.g.: weigh mentions by interaction counts, exclude RTs

# Generate the cooccurrence matrix: rows are tweeting handles, columns are
# mentioned handles, cells are the number of (handle, mention) rows.
# The count column gets an explicit name instead of relying on the default
# label produced by `value_counts().reset_index()`, which is `0` on
# pandas < 2.0 but `"count"` on pandas >= 2.0 (so `values=0` raises KeyError
# there). Rows without a mention (NaN) are left out by the groupby, as they
# were by `value_counts`.
mention_counts = (
    tweets.groupby(["handle", "mentions"]).size().reset_index(name="count")
)
# `pivot` already returns a frame indexed by "handle"; pairs that never occur
# stay NaN.
cooc_matrix = mention_counts.pivot(
    index="handle", columns="mentions", values="count"
)
# networkx wants the matrix to be square, so make sure it is square: every
# handle that appears as a row or as a column must appear on both axes.

# Report handles that tweet but are never mentioned by anyone.
for r in cooc_matrix.index.difference(cooc_matrix.columns):
    print(f"{r} is not in colums")

# Use the same label set, in the same order, for rows and columns. A single
# reindex replaces the per-column inserts and the row concat; cells for newly
# added rows/columns are NaN (written as empty fields in the CSV).
nodes = cooc_matrix.index.union(cooc_matrix.columns)
cooc_matrix = cooc_matrix.reindex(index=nodes, columns=nodes)

# save cooccurrence matrix
cooc_matrix.to_csv(output_path, sep=";")