generate cooc matrix

This commit is contained in:
Lukas Z 2022-07-28 16:34:13 +02:00
parent 2e38ea848e
commit aea10d5f24
3 changed files with 192 additions and 11 deletions

View file

@ -0,0 +1,36 @@
import pandas as pd
import re
# Build a handle x mention co-occurrence matrix from the tweets CSV and save it.
#
# Inputs and output, relative to the current working directory.
tweets_path = "../raw_data/tweets.csv"
accounts_path = "../raw_data/accounts.txt"
output_path = "../data/cooccurence_matrix.csv"

tweets = pd.read_csv(tweets_path)

# The accounts file is used as one regular expression; newlines are removed so
# a pattern spread over several lines is joined back into a single one.
with open(accounts_path, "r", encoding="utf-8") as f:
    accounts_regex = f.read().replace("\n", "")

# extract mentions from tweets (case-insensitive match against the accounts regex)
tweets["mentions"] = tweets["tweet_text"].str.findall(
    accounts_regex, flags=re.IGNORECASE
)

# some tweets have multiple mentions - turn one row into n rows if the tweet
# has n mentions (tweets without any match keep one row with a missing mention)
tweets = tweets.explode("mentions")

# make sure all handles are lowercase so the same account is counted once
tweets["handle"] = tweets["handle"].str.lower()
tweets["mentions"] = tweets["mentions"].str.lower()

# TODO IDEA
# we could generate multiple different co-occurrence matrices
# e.g.: weigh mentions by interaction counts, exclude RTs

# generate co-occurrence matrix
# Name the counts column explicitly: an unnamed reset_index() labels it 0 on
# pandas < 2.0 but "count" on pandas >= 2.0, which broke pivot(values=0).
# Rows with a missing mention are dropped by value_counts() by default.
mention_counts = (
    tweets[["handle", "mentions"]]
    .value_counts()
    .reset_index(name="count")
)
# One row per handle, one column per mentioned account; pairs that never
# occur stay missing (empty cells in the CSV).
cooc_matrix = mention_counts.pivot(
    index="handle", columns="mentions", values="count"
).reset_index()

# save co-occurrence matrix
cooc_matrix.to_csv(output_path, sep=";", index=False)

View file

@ -1,11 +0,0 @@
import pandas as pd
# Path to the tweets CSV, relative to the current working directory.
tweets_path = "../raw_data/tweets.csv"
# Load the tweets into a DataFrame.
tweets = pd.read_csv(tweets_path)
# TODO extract mentions from tweets
# TODO generate co-occurrence matrix
# TODO save co-occurrence matrix