generate cooc matrix
This commit is contained in:
parent
2e38ea848e
commit
aea10d5f24
3 changed files with 192 additions and 11 deletions
36
src/create_cooccurrence_matrix.py
Normal file
36
src/create_cooccurrence_matrix.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
import pandas as pd
|
||||
import re
|
||||
|
||||
# Input and output locations; relative paths resolve against the directory the
# script is run from.
tweets_path = "../raw_data/tweets.csv"
accounts_path = "../raw_data/accounts.txt"

output_path = "../data/cooccurence_matrix.csv"


# Load every tweet into a DataFrame.
tweets = pd.read_csv(tweets_path)

# Read the accounts file as one string with all newline characters removed.
# NOTE(review): this string is later used as a regex, so the file presumably
# holds a pattern split across lines rather than one bare handle per line
# (bare handles would be glued together into one long literal) - confirm.
with open(accounts_path, "r") as accounts_file:
    accounts_regex = "".join(accounts_file.read().split("\n"))
|
||||
|
||||
# Extract mentions: for every tweet, collect all case-insensitive matches of
# the accounts pattern found in its text.
tweets["mentions"] = tweets["tweet_text"].str.findall(
    accounts_regex, flags=re.IGNORECASE
)

# A tweet can mention several accounts: give each mention its own row, so a
# tweet with n mentions becomes n rows.
tweets = tweets.explode("mentions")

# Lowercase both sides so the same account is always spelled the same way.
tweets["handle"] = tweets["handle"].str.lower()
tweets["mentions"] = tweets["mentions"].str.lower()
|
||||
|
||||
# TODO IDEA
# we could generate several different co-occurrence matrices,
# e.g. weight mentions by interaction counts, or exclude RTs
|
||||
|
||||
|
||||
# Generate the co-occurrence matrix.
# Step 1: count how often each (handle, mention) pair occurs. The count column
# is named explicitly because its default name depends on the pandas version
# (0 in older releases, "count" from pandas 2.0 on); selecting it as column 0
# raises a KeyError on current pandas.
mention_counts = (
    tweets[["handle", "mentions"]]
    .value_counts()
    .reset_index(name="n_mentions")
)

# Step 2: reshape to one row per tweeting handle and one column per mentioned
# account. Pairs that never occur are left as missing values.
cooc_matrix = mention_counts.pivot(
    index="handle", columns="mentions", values="n_mentions"
).reset_index()

# Save the co-occurrence matrix as a semicolon-separated file; "handle" is a
# regular column at this point, so the positional row index is not written.
cooc_matrix.to_csv(output_path, sep=";", index=False)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
# Location of the raw tweet export; a relative path resolves against the
# directory the script is run from.
tweets_path = "../raw_data/tweets.csv"

# Read all tweets into a DataFrame.
tweets = pd.read_csv(filepath_or_buffer=tweets_path)

# TODO: extract mentions from tweets

# TODO: generate co-occurrence matrix

# TODO: save co-occurrence matrix
|
||||
Loading…
Add table
Add a link
Reference in a new issue