---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Reading in the data
```{r, message = FALSE}
library(tidyverse)
library(stringi)

# Daily CSVs of police press releases scraped from presseportal.de
# (scrape period: December 2020 to 21 May 2021)
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets from the CopBird export: keep tweets from 1 April 2021 onward, first four columns
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]

# Extended user and tweet tables plus the combined press-release table ("blaulicht")
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Topic-filtered subsets (demonstrations, drugs, racism) of press releases and tweets
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```
# Scraping the press releases (since December 2020)
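The scraping step itself is not part of this notebook; only its output (the daily `*_presseportal.csv` files) is read in above. For orientation, a minimal, hypothetical sketch with `rvest` is shown below. The listing URL of the presseportal "Blaulicht" section and the CSS selectors are assumptions and almost certainly differ from the pipeline that actually produced the CSVs.

```{r, eval=FALSE}
# Sketch only: scraping police press releases ("Blaulicht") from presseportal.de.
# URL and CSS selectors are assumptions, not the scraper actually used for the data.
library(rvest)

scrape_blaulicht_page <- function(url) {
  page  <- read_html(url)
  items <- html_elements(page, "article")   # one node per press release (selector assumed)
  tibble(
    headline = html_text2(html_element(items, "h3")),
    teaser   = html_text2(html_element(items, "p"))
  )
}

# Hypothetical call; the real pipeline wrote one CSV per day into data/
# pm_raw <- scrape_blaulicht_page("https://www.presseportal.de/blaulicht")
# write_csv(pm_raw, "data/2021-05-21_presseportal.csv")
```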
# Matching press releases and tweets to locations
```{r}
# Extended user table (includes a bundesland column per account, used below)
head(usersX)
# Tweet extension table with state assignment (showing columns 5 to 8)
head(tweetXstate[, 5:8])
# Combined press-release table; treat the linked Twitter user id as character rather than numeric
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
head(blaulicht[, -c(2, 5)])
```
# Number of press releases vs. tweets
```{r}
# Tweets per bundesland: join tweets to the extended user table and count
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)

# Press releases per bundesland; harmonise the spelling with the tweet table
# (title case, "ue" -> "ü", "berlin-brandenburg" counted as "berlin")
land_pm <- pm %>% group_by(bundesland) %>% count()
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))

# Combine both counts, drop rows without a bundesland, and reshape to long format
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  guides(fill = "none") +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```
# Topic modelling
```{r, message=FALSE}
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
#
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
#
# anzahl.themen <- 10
# anzahl.woerter <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
#
# write_csv(topmod, "data/topicmodel.csv")
```
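The chunk above is kept commented out; the LDA fit was apparently run once and its top terms written to `data/topicmodel.csv`. It also loads `ldatuning` without using it. A minimal sketch of how that package could be used to justify the choice of `anzahl.themen = 10` is given below; it assumes the document-term matrix `dfm2topicmodels` from the commented-out chunk exists.

```{r, eval=FALSE}
# Sketch only: choosing the number of topics with ldatuning, assuming the
# document-term matrix dfm2topicmodels from the chunk above has been built.
library(ldatuning)

tuning <- FindTopicsNumber(
  dfm2topicmodels,
  topics  = seq(5, 30, by = 5),
  metrics = c("CaoJuan2009", "Deveaud2014"),
  method  = "Gibbs",
  control = list(seed = 1234)
)
FindTopicsNumber_plot(tuning)
```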
### Keyword selection
`topic_1 = ['demonstr', 'kundgeb']`
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
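The topic-specific tables loaded in the first chunk (`copbird_table_pm_topiced_*` / `copbird_table_tweet_topiced_*`) were presumably built by matching these stems against the texts. A minimal sketch of such a filter in R follows; the actual filtering was done upstream, and the column names `content` and `tweet_text` are assumptions taken from how those tables are used later in this document.

```{r, eval=FALSE}
# Sketch only: reproducing a topic subset from the keyword stems above.
# Assumes pm$content holds the press-release text and the tweet text column
# is called tweet_text, as in the pre-filtered tables.
topic_demonstr <- c("demonstr", "kundgeb")

matches_topic <- function(text, stems) {
  str_detect(str_to_lower(text), str_c(stems, collapse = "|"))
}

pm_demo_check <- pm %>% filter(matches_topic(content, topic_demonstr))
tw_demo_check <- tweets %>% filter(matches_topic(tweet_text, topic_demonstr))
```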
# Sentiment analysis
```{r}
# SentiWS entries look like "Wort|NN<TAB>0.0040<TAB>Flexionsformen,...";
# strip the POS tag and weight, keep the word and its inflected forms
readAndflattenSentiWS <- function(filename) {
  words <- readLines(filename, encoding = "UTF-8")
  words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
  words <- unlist(strsplit(words, ","))
  words <- tolower(words)
  return(words)
}

pos.words <- c(scan("SentiWS/positive-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/positive-words.txt"))
neg.words <- c(scan("SentiWS/negative-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/negative-words.txt"))
# Dictionary-based scorer: number of positive minus number of negative words per text
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  require(plyr)
  require(stringr)
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence <- tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list <- str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words <- unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    pos.matches <- match(words, pos.words)
    neg.matches <- match(words, neg.words)
    # match() returns the position of the matched term or NA;
    # we only need TRUE/FALSE for whether a word matched
    pos.matches <- !is.na(pos.matches)
    neg.matches <- !is.na(neg.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score <- sum(pos.matches) - sum(neg.matches)
    return(score)
  },
  pos.words, neg.words, .progress = .progress)
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
# Sentiment scores per topic, separately for press releases and tweets
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)

ggplot(score_pm_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

ggplot(score_pm_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)

ggplot(score_pm_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
```
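The histograms above show the raw score distributions. As an optional addition (not part of the original analysis), the per-text scores computed above could be condensed into one summary table to compare press releases and tweets across topics:

```{r, eval=FALSE}
# Optional summary sketch: mean sentiment score per topic and platform,
# built from the score data frames computed in the previous chunk.
bind_rows(
  tibble(topic = "Demonstrationen", platform = "Pressemeldung", score = score_pm_demo$score),
  tibble(topic = "Demonstrationen", platform = "Twitter",       score = score_tw_demo$score),
  tibble(topic = "Drogen",          platform = "Pressemeldung", score = score_pm_drogen$score),
  tibble(topic = "Drogen",          platform = "Twitter",       score = score_tw_drogen$score),
  tibble(topic = "Rassismus",       platform = "Pressemeldung", score = score_pm_rass$score),
  tibble(topic = "Rassismus",       platform = "Twitter",       score = score_tw_rass$score)
) %>%
  group_by(topic, platform) %>%
  summarise(mean_score = mean(score), n = n(), .groups = "drop")
```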
```{r}
sessionInfo()
```