init
This commit is contained in:
commit
8d3c8b3974
77 changed files with 682928 additions and 0 deletions
512
ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
Normal file
512
ergebnisse_hackathon_repo/team-16/r-scripts/.Rhistory
Normal file
|
|
@ -0,0 +1,512 @@
|
|||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_text <- pm$content
|
||||
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber_satzzeichen)
|
||||
head(pm)
|
||||
pm_text <- pm_demo$content
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm_demo))
|
||||
content_ber[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
|
||||
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
|
||||
head(pm_demo)
|
||||
readAndflattenSentiWS <- function(filename) {
  # Read a SentiWS dictionary file and flatten it into a plain,
  # lower-cased character vector of words.
  raw_lines <- readLines(filename, encoding = "UTF-8")
  # Replace the "|POS<tab>weight<tab>" annotation with a comma so the
  # head word and its inflected forms become one comma-separated list.
  flattened <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines)
  tolower(unlist(strsplit(flattened, ",")))
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility with the old
#             plyr::laply interface; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Fix: the original called plyr::laply while `require(plyr)` was
# commented out, so it errored at runtime. This version uses only
# base R (vapply / strsplit / %in%), no plyr or stringr needed.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Fix: the original passed `.progress = .progress` to base lapply(),
# which has no such argument ("unused argument" error), and the list
# result would not form a proper `score` column. vapply() returns a
# plain numeric vector and needs no extra packages.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Fix: base lapply() has no `.progress` argument (the original call
# errored with "unused argument"), and a list result would break the
# `score` column of the returned data.frame. vapply() fixes both.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
library(plyr)
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Fix: loading plyr did not help — the body still called base
# lapply() with a `.progress` argument it does not accept, which
# errors with "unused argument". vapply() returns a plain numeric
# vector and removes the plyr/stringr dependency entirely.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Fix: the original called base lapply() with a `.progress` argument
# it does not accept ("unused argument" error); even without it, the
# list result would break the `score` column. vapply() fixes both
# and needs no plyr/stringr.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility with the old
#             plyr::laply interface; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Improvement: `require()` was misused to load hard dependencies
# (it returns FALSE instead of erroring when a package is missing).
# plyr::laply and stringr::str_split are replaced by base vapply()
# and strsplit(), which produce identical results with no extra
# packages.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
View(score_tw_demo)
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_text <- pm$content
|
||||
pm_text <- pm_text[-which(is.na(pm_text))] # remove missing values
|
||||
length(grep("(ots)", pm_text)) == length(pm_text) # every report contains "ots"
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm))
|
||||
content_ber[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm))
|
||||
content_ber_satzzeichen[which(!is.na(pm$content))] <- pm_text
|
||||
pm <- cbind(pm, content_ber_satzzeichen)
|
||||
head(pm)
|
||||
# csvpath <- <your path>
|
||||
# write_csv(pm, str_c(csvpath, "/pressemeldungen.csv"))
|
||||
pm_text <- pm_demo$content
|
||||
pm_text <- gsub("^(.*?\\(ots\\) - )", "", pm_text, perl = TRUE) # remove <Ort (ots) - >
|
||||
pm_text <- gsub("( \\.\\.\\.)$", "", pm_text) # remove < ...>
|
||||
content_ber <- rep(NA, nrow(pm_demo))
|
||||
content_ber[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber)
|
||||
pm_text <- gsub("[^[:alnum:] ]", "", pm_text)
|
||||
content_ber_satzzeichen <- rep(NA, nrow(pm_demo))
|
||||
content_ber_satzzeichen[which(!is.na(pm_demo$content))] <- pm_text
|
||||
pm_demo <- cbind(pm_demo, content_ber_satzzeichen)
|
||||
head(pm_demo)
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility with the old
#             plyr::laply interface; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Improvement: drops the plyr/stringr dependencies (which were
# loaded via `require()`, a silent-failure anti-pattern) in favour
# of base vapply() and strsplit(); the results are identical.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
View(score_tw_demo)
|
||||
# Fix: typo — `Ciew` is not a function; the intended call is View()
# (the corrected invocation also appears immediately afterwards).
View(score_pm_demo)
|
||||
View(score_pm_demo)
|
||||
score_pm_demo$text[3]
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
# Read in data
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
summary(pm)
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
# tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Oldenburg-Stadt/Ammerl"] <- "Oldenburg"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Mecklenburgische Seenp"] <- "Neubrandenburg"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Wilhelmshaven/Frieslan"] <- "Wilhelmshaven"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Baden-Württember"] <- "Stuttgart"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Landeskriminalamt Rheinland-Pf"] <- "Mainz"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Mitteldeutschlan"] <- "Pirna"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Polizei Delmenhorst/Oldenburg-"] <- "Delmenhorst"
|
||||
# tweetXstate$stadt[tweetXstate$user_name == "Bundespolizei Flughafen Frankf"] <- "Frankfurt"
|
||||
# blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
# users <- read_csv("data/copbird_table_user.csv")
|
||||
# str(users)
|
||||
# users$name <- as.factor(users$name)
|
||||
# users$handle <- as.factor(users$handle)
|
||||
pm_orte <- pm %>% group_by(bundesland) %>% count(location)
|
||||
head(pm_orte)
|
||||
head(pm_orte %>% arrange(desc(n)), n = 20)
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
|
||||
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
|
||||
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
|
||||
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
|
||||
head(usersX)
|
||||
head(tweetXstate[, 5:8])
|
||||
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
|
||||
head(blaulicht[, -c(2, 5)])
|
||||
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
|
||||
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
|
||||
land_tw <- land_tw %>% group_by(bundesland) %>% count()
|
||||
land_tw$bundesland <- as.factor(land_tw$bundesland)
|
||||
land_pm <- pm %>% group_by(bundesland) %>% count()
|
||||
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
|
||||
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
|
||||
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
|
||||
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
|
||||
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
|
||||
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
|
||||
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
|
||||
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
|
||||
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
facet_wrap(~Plattform) +
|
||||
coord_flip() +
|
||||
guides(fill = FALSE) +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
coord_flip() +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
pos.words <- c(scan("data/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/positive-words.txt"))
|
||||
neg.words <- c(scan("data/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("data/negative-words.txt"))
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility with the old
#             plyr::laply interface; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Improvement: drops the plyr/stringr dependencies (which were
# loaded via `require()`, a silent-failure anti-pattern) in favour
# of base vapply() and strsplit(); the results are identical.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
# Fix: copy-paste bug — the "Drogen" scores were computed from the
# Demonstrationen data (pm_demo / tw_demo), so the Drogen plots
# silently showed the wrong topic. Use the drogen data sets.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
|
||||
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
|
||||
ggplot(score_pm_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
ggplot(score_tw_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
---
|
||||
title: "Team 16"
|
||||
author: "Christian, Simon und Cuca"
|
||||
date: "23 5 2021"
|
||||
output: pdf_document
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
# Daten einlesen
|
||||
```{r, message = FALSE}
|
||||
library(tidyverse)
|
||||
library(stringi)
|
||||
|
||||
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
|
||||
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
|
||||
pm_list <- lapply(pm_csv, read_csv)
|
||||
pm <- do.call(rbind, pm_list)
|
||||
|
||||
tweets <- read_csv("data/copbird_table_tweet.csv")
|
||||
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
|
||||
usersX <- read_csv("data/copbird_table_user_ext.csv")
|
||||
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
|
||||
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")
|
||||
|
||||
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
|
||||
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
|
||||
|
||||
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
|
||||
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
|
||||
|
||||
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
|
||||
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
|
||||
```
|
||||
|
||||
|
||||
# Scrapen der Pressemeldungen (seit Dezember 2020)
|
||||
|
||||
# Zuordnung von Orten der Pressemeldungen und Tweets
|
||||
```{r}
|
||||
head(usersX)
|
||||
head(tweetXstate[, 5:8])
|
||||
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
|
||||
head(blaulicht[, -c(2, 5)])
|
||||
```
|
||||
|
||||
# Anzahl Pressemeldungen vs. Tweets
|
||||
```{r}
|
||||
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
|
||||
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
|
||||
land_tw <- land_tw %>% group_by(bundesland) %>% count()
|
||||
land_tw$bundesland <- as.factor(land_tw$bundesland)
|
||||
|
||||
land_pm <- pm %>% group_by(bundesland) %>% count()
|
||||
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
|
||||
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
|
||||
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
|
||||
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))
|
||||
|
||||
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
|
||||
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
|
||||
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
|
||||
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
|
||||
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
facet_wrap(~Plattform) +
|
||||
coord_flip() +
|
||||
guides(fill = FALSE) +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(land_pm_tw) +
|
||||
geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
|
||||
scale_fill_manual(values = c("#CC6699", "#0099CC")) +
|
||||
coord_flip() +
|
||||
labs(title = "Anzahl der Pressemeldungen und Tweets",
|
||||
subtitle = "Im Zeitraum April bis Mai 2021") +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
# Topic modelling
|
||||
```{r, message=FALSE}
|
||||
# library(quanteda)
|
||||
# library(tidyverse)
|
||||
# library(topicmodels)
|
||||
# library(ldatuning)
|
||||
# library(stm)
|
||||
# library(wordcloud)
|
||||
#
|
||||
# pm <- pm[!is.na(pm$content), ]
|
||||
# tok <- tokens(pm$content_ber_satzzeichen)
|
||||
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
|
||||
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
|
||||
# # mydfm.trim
|
||||
#
|
||||
# anzahl.themen <- 10
|
||||
# anzahl.woerter <- 10
|
||||
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
|
||||
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
|
||||
# lda.modell
|
||||
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
|
||||
# topmod
|
||||
#
|
||||
# write_csv(topmod, "data/topicmodel.csv")
|
||||
```
|
||||
|
||||
### Auswahl der Keywords
|
||||
`topic_1 = ['demonstr', 'kundgeb']`
|
||||
|
||||
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
|
||||
|
||||
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
|
||||
|
||||
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
|
||||
|
||||
|
||||
|
||||
# Sentiment Analyse
|
||||
```{r}
|
||||
readAndflattenSentiWS <- function(filename) {
|
||||
words = readLines(filename, encoding="UTF-8")
|
||||
words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
|
||||
words <- unlist(strsplit(words, ","))
|
||||
words <- tolower(words)
|
||||
return(words)
|
||||
}
|
||||
|
||||
pos.words <- c(scan("SentiWS/positive-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("SentiWS/positive-words.txt"))
|
||||
neg.words <- c(scan("SentiWS/negative-words.txt",what='character', comment.char=';', quiet=T),
|
||||
readAndflattenSentiWS("SentiWS/negative-words.txt"))
|
||||
|
||||
# Score each sentence by counting dictionary hits:
#   score = (# words found in pos.words) - (# words found in neg.words).
#
# sentences : character vector of texts to score.
# pos.words : character vector of positive sentiment words.
# neg.words : character vector of negative sentiment words.
# .progress : retained for backward compatibility with the old
#             plyr::laply interface; ignored here.
#
# Returns a data.frame with columns `score` (numeric) and `text`.
#
# Improvement: drops the plyr/stringr dependencies (which were
# loaded via `require()`, a silent-failure anti-pattern) in favour
# of base vapply() and strsplit(); the results are identical.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: drop punctuation, control characters and digits,
    # then normalise to lower case.
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # Split on whitespace into individual words.
    words <- unlist(strsplit(sentence, "\\s+"))
    # %in% yields TRUE/FALSE, which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
|
||||
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
|
||||
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_demo) +
|
||||
geom_bar(aes(x = score), fill = "blue") +
|
||||
labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
||||
# Fix: copy-paste bug — the "Drogen" scores were computed from the
# Demonstrationen data (pm_demo / tw_demo), so the Drogen plots
# silently showed the wrong topic. Use the drogen data sets.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_drogen) +
|
||||
geom_bar(aes(x = score), fill = "darkgreen") +
|
||||
labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
|
||||
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
|
||||
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)
|
||||
|
||||
ggplot(score_pm_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
|
||||
theme_minimal()
|
||||
|
||||
ggplot(score_tw_rass) +
|
||||
geom_bar(aes(x = score), fill = "purple") +
|
||||
labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
```{r}
|
||||
sessionInfo()
|
||||
```
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
---
|
||||
title: "Team 16"
|
||||
author: "Christian, Simon und Cuca"
|
||||
date: "23 5 2021"
|
||||
output: pdf_document
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
# Daten einlesen
|
||||
```{r, message = FALSE}
library(tidyverse)
library(stringi)

# Daily press-release exports, December 2020 through 21 May 2021.
# Build "data/<month-prefix><day>_presseportal.csv" for every covered day.
month_prefixes <- c("2020-12-", "2021-1-", "2021-2-", "2021-3-", "2021-4-", "2021-5-")
month_days     <- c(26, 31, 28, 31, 30, 21)
pm_csv <- unlist(
  Map(function(prefix, days) str_c("data/", prefix, seq_len(days), "_presseportal.csv"),
      month_prefixes, month_days),
  use.names = FALSE
)
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets from 2021-04-01 on; keep only the first four columns.
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Press releases and tweets pre-filtered by topic keyword lists.
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")

pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")

pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```
|
||||
|
||||
|
||||
# Scrapen der Pressemeldungen (seit Dezember 2020)
|
||||
|
||||
# Zuordnung von Orten der Pressemeldungen und Tweets
|
||||
```{r}
# Quick inspection of the tables used to link press releases and tweets
# to locations.
head(usersX)
head(tweetXstate[, 5:8])
# Make the user id character so it can be compared/joined as text
# against the Twitter user ids.
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
head(blaulicht[, -c(2, 5)])
```
|
||||
|
||||
# Anzahl Pressemeldungen vs. Tweets
|
||||
```{r}
# Number of tweets per federal state (from the extended user table).
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)

# Number of press releases per state, harmonised to the Twitter spelling
# ("berlin-brandenburg" folded into "berlin", title case, "ue" -> "ü").
land_pm <- pm %>% group_by(bundesland) %>% count()
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))

land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
# BUG FIX: the original used land_pm_tw[-which(is.na(...)), ], which drops
# *every* row when there is no NA (negative indexing with integer(0) selects
# nothing). Filter with a logical mask instead.
land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ]
land_pm_tw$Pressemeldung[is.na(land_pm_tw$Pressemeldung)] <- 0
# pivot_longer() replaces the superseded gather().
land_pm_tw <- pivot_longer(land_pm_tw, cols = -bundesland,
                           names_to = "Plattform", values_to = "count")

# Absolute counts, one panel per platform.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  guides(fill = "none") +  # "none" replaces the deprecated fill = FALSE
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

# Relative share of press releases vs. tweets per state.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```
|
||||
|
||||
# Topic modelling
|
||||
```{r, message=FALSE}
|
||||
# library(quanteda)
|
||||
# library(tidyverse)
|
||||
# library(topicmodels)
|
||||
# library(ldatuning)
|
||||
# library(stm)
|
||||
# library(wordcloud)
|
||||
#
|
||||
# pm <- pm[!is.na(pm$content), ]
|
||||
# tok <- tokens(pm$content_ber_satzzeichen)
|
||||
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
|
||||
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
|
||||
# # mydfm.trim
|
||||
#
|
||||
# anzahl.themen <- 10
|
||||
# anzahl.woerter <- 10
|
||||
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
|
||||
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
|
||||
# lda.modell
|
||||
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
|
||||
# topmod
|
||||
#
|
||||
# write_csv(topmod, "data/topicmodel.csv")
|
||||
```
|
||||
|
||||
### Auswahl der Keywords
|
||||
`topic_1 = ['demonstr', 'kundgeb']`
|
||||
|
||||
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
|
||||
|
||||
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
|
||||
|
||||
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
|
||||
|
||||
|
||||
|
||||
# Sentiment Analyse
|
||||
```{r}
|
||||
# Read a SentiWS dictionary file and flatten it into a plain character vector.
# Each line of the form "word|POS<TAB>weight<TAB>infl1,infl2,..." becomes the
# base word plus all inflected forms, lower-cased.
readAndflattenSentiWS <- function(filename) {
  raw_lines <- readLines(filename, encoding = "UTF-8")
  # Replace the "|POS<TAB>weight<TAB>" part with a comma so the base word and
  # its inflections form one comma-separated list.
  flattened <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines)
  tolower(unlist(strsplit(flattened, ",")))
}
|
||||
|
||||
# Sentiment dictionaries: the plain word list read by scan() combined with
# the flattened SentiWS entries (base forms plus inflections).
pos.words <- c(
  scan("SentiWS/positive-words.txt", what = "character",
       comment.char = ";", quiet = TRUE),
  readAndflattenSentiWS("SentiWS/positive-words.txt")
)
neg.words <- c(
  scan("SentiWS/negative-words.txt", what = "character",
       comment.char = ";", quiet = TRUE),
  readAndflattenSentiWS("SentiWS/negative-words.txt")
)
|
||||
|
||||
# Dictionary-based sentiment score: for every sentence, the number of positive
# dictionary words minus the number of negative dictionary words.
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive words (lower case).
#   neg.words: character vector of negative words (lower case).
#   .progress: kept for backward compatibility with the old plyr-based
#              implementation; no longer used.
#
# Returns: data.frame with columns `score` (integer) and `text` (the input).
#
# IMPROVEMENT: the original loaded plyr/stringr via require() inside the
# function (an anti-pattern) only to use laply()/str_split(); base vapply()
# and strsplit() do the same job with no extra dependencies.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  score_one <- function(sentence) {
    # Strip punctuation, control characters and digits, then lower-case.
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)
    # Tokenize on whitespace.
    words <- unlist(strsplit(sentence, '\\s+'))
    # match() returns NA for words not in the dictionary; count the hits.
    pos.count <- sum(!is.na(match(words, pos.words)))
    neg.count <- sum(!is.na(match(words, neg.words)))
    pos.count - neg.count
  }
  scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
|
||||
|
||||
# Sentiment scores per topic, for press releases (pm) and tweets (tw).
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)

# BUG FIX: the drug-topic scores were computed from the *demo* data frames
# (pm_demo / tw_demo); they must use pm_drogen / tw_drogen.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)

# Helper: histogram of sentiment scores for one topic/source combination.
# Replaces six copies of the same ggplot block.
plot_sentiment <- function(scores, topic, source, fill) {
  ggplot(scores) +
    geom_bar(aes(x = score), fill = fill) +
    labs(title = str_c("Topic: ", topic),
         subtitle = str_c("Sentiment-Analyse der ", source)) +
    theme_minimal()
}

plot_sentiment(score_pm_demo,   "Demonstrationen", "Pressemeldungen", "blue")
plot_sentiment(score_tw_demo,   "Demonstrationen", "Tweets",          "blue")
plot_sentiment(score_pm_drogen, "Drogen",          "Pressemeldungen", "darkgreen")
plot_sentiment(score_tw_drogen, "Drogen",          "Tweets",          "darkgreen")
plot_sentiment(score_pm_rass,   "Rassismus",       "Pressemeldungen", "purple")
plot_sentiment(score_tw_rass,   "Rassismus",       "Tweets",          "purple")
|
||||
```
|
||||
|
||||
```{r}
|
||||
sessionInfo()
|
||||
```
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: pdfLaTeX
|
||||
Loading…
Add table
Add a link
Reference in a new issue