---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Daten einlesen

```{r, message = FALSE}
library(tidyverse)
library(stringi)

# One scraped press-release CSV exists per day; the file name carries the
# date without zero padding (e.g. "data/2021-1-7_presseportal.csv").
# The second vector gives the last day that is read for each month.
pm_csv <- unlist(
  Map(
    function(month, last_day) {
      str_c("data/", month, "-", seq_len(last_day), "_presseportal.csv")
    },
    c("2020-12", "2021-1", "2021-2", "2021-3", "2021-4", "2021-5"),
    c(26, 31, 28, 31, 30, 21)
  ),
  use.names = FALSE
)

# Read every daily file and stack them into one table of press releases.
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets: keep rows created on or after 2021-04-01 and the first four columns.
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", seq_len(4)]

# User and location lookup tables.
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Press releases (pm_*) and tweets (tw_*) pre-selected per topic.
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")

pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")

pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```

# Scrapen der Pressemeldungen (seit Dezember 2020)

# Zuordnung von Orten der Pressemeldungen und Tweets

```{r}
usersX %>% head()
tweetXstate[, 5:8] %>% head()
# The id is converted to character before printing -- presumably so long
# Twitter ids are not shown in scientific notation; TODO confirm.
blaulicht[["tw_user_id"]] <- as.character(blaulicht[["tw_user_id"]])
blaulicht[, -c(2, 5)] %>% head()
```

# Anzahl Pressemeldungen vs.
Tweets

```{r}
# --- Tweets per federal state -------------------------------------------
# Columns 1 and 4 of usersX are presumably `user_id` and `bundesland`
# (the join key and the column used below) -- TODO confirm against the table.
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
# "-" is recoded to NA so these rows fall into the NA group dropped below.
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
# count(x, col) returns an ungrouped tibble, unlike group_by() %>% count().
land_tw <- count(land_tw, bundesland)
land_tw$bundesland <- as.factor(land_tw$bundesland)

# --- Press releases per federal state -----------------------------------
land_pm <- count(pm, bundesland)
# Harmonise the press-portal spelling with the spelling used for the tweets:
# lower case with "ue" -> title case with "ü".
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
# Re-aggregate after relabelling: two source labels can now share one name
# (e.g. "berlin" and "berlin-brandenburg"). Without this the join below would
# repeat the tweet count for that state and inflate its stacked bar.
land_pm <- count(land_pm, bundesland, wt = n, name = "n")
# Names that do not occur among the tweet states become NA and are dropped.
land_pm$bundesland <- factor(
  land_pm$bundesland,
  levels = levels(land_tw$bundesland)
)

# --- Combine both sources -----------------------------------------------
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
# Logical indexing instead of `-which(...)`: when no row is NA, `-which()`
# yields `-integer(0)`, which selects zero rows and empties the table.
land_pm_tw <- land_pm_tw[!is.na(land_pm_tw$bundesland), ]
# States without any press release get an explicit zero.
land_pm_tw$Pressemeldung[is.na(land_pm_tw$Pressemeldung)] <- 0
# Long format: one row per state and platform.
land_pm_tw <- pivot_longer(
  land_pm_tw,
  cols = -bundesland,
  names_to = "Plattform",
  values_to = "count"
)

# NOTE(review): `pm` is not filtered by date in this chunk, while the tweets
# are restricted to dates from 2021-04-01 onwards. Confirm that the subtitle
# "April bis Mai 2021" matches the period covered by the press releases.

# Absolute counts, one panel per platform.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  # `guides(fill = FALSE)` is deprecated; "none" hides the redundant legend.
  guides(fill = "none") +
  labs(
    title = "Anzahl der Pressemeldungen und Tweets",
    subtitle = "Im Zeitraum April bis Mai 2021"
  ) +
  theme_minimal()

# Share of each platform per state (bars normalised to 1).
ggplot(land_pm_tw) +
  geom_col(
    aes(x = bundesland, y = count, fill = Plattform),
    position = "fill"
  ) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(
    title = "Anzahl der Pressemeldungen und Tweets",
    subtitle = "Im Zeitraum April bis Mai 2021"
  ) +
  theme_minimal()
```

# Topic modelling

```{r, message=FALSE}
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
#
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
#
# anzahl.themen <- 10
# anzahl.woerter
#   <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
#
# write_csv(topmod, "data/topicmodel.csv")
```

### Auswahl der Keywords

`topic_1 = ['demonstr', 'kundgeb']`

`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`

`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`

`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`

# Sentiment Analyse

```{r}
# Read a sentiment lexicon file and return all contained word forms.
#
# The regular expression below targets lines of the shape
# `word|POS<TAB>weight<TAB>form1,form2,...`: the `|POS<TAB>weight<TAB>` part
# is replaced by a comma so that the base form and its inflections can be
# split on ",".
#
# filename: path of the lexicon file; strings are marked as UTF-8.
# Returns:  character vector of all word forms, lower-cased.
readAndflattenSentiWS <- function(filename) {
  words <- readLines(filename, encoding = "UTF-8")
  words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
  words <- unlist(strsplit(words, ","))
  tolower(words)
}

# NOTE(review): each lexicon file is read twice -- token-wise via scan() and
# parsed via readAndflattenSentiWS(). Duplicates are harmless because the
# lists are only used for membership tests, but confirm whether scan() was
# meant to read a different (plain word list) file.
pos.words <- c(
  scan("SentiWS/positive-words.txt", what = "character", comment.char = ";",
       quiet = TRUE),
  readAndflattenSentiWS("SentiWS/positive-words.txt")
)
neg.words <- c(
  scan("SentiWS/negative-words.txt", what = "character", comment.char = ";",
       quiet = TRUE),
  readAndflattenSentiWS("SentiWS/negative-words.txt")
)

# Score each text as (#positive words) - (#negative words).
#
# sentences: character vector of texts to score.
# pos.words: character vector of positive lexicon entries.
# neg.words: character vector of negative lexicon entries.
# .progress: name of a plyr progress bar. With the default "none" the scores
#            are computed with base R only; any other value is forwarded to
#            plyr::laply() when plyr is installed.
# Returns:   data.frame with columns `score` and `text`, one row per text.
score.sentiment <- function(sentences, pos.words, neg.words,
                            .progress = "none") {
  score_one <- function(sentence) {
    sentence <- gsub("[[:punct:]]", "", sentence)
    # Control characters include "\n" and "\t". Replace them with a space
    # instead of deleting them, otherwise words separated only by a line
    # break are glued into one token that can never match the lexicon.
    sentence <- gsub("[[:cntrl:]]", " ", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)

    words <- unlist(strsplit(sentence, "\\s+"))

    # %in% yields TRUE/FALSE per token; sum() counts the TRUEs.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # plyr is only called via `::` and never attached: attaching it after dplyr
  # would mask dplyr verbs such as count() and summarise() for the rest of
  # the session.
  use_plyr <- !identical(.progress, "none") &&
    requireNamespace("plyr", quietly = TRUE)
  if (use_plyr) {
    scores <- plyr::laply(sentences, score_one, .progress = .progress)
  } else {
    scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)
  }

  data.frame(score = scores, text = sentences)
}

# Bar chart of the score distribution for one topic and one text source.
#
# scores: data.frame returned by score.sentiment().
# topic:  topic name shown in the title.
# source: text source shown in the subtitle ("Pressemeldungen" or "Tweets").
# fill:   bar colour.
plot_sentiment <- function(scores, topic, source, fill) {
  ggplot(scores) +
    geom_bar(aes(x = score), fill = fill) +
    labs(
      title = str_c("Topic: ", topic),
      subtitle = str_c("Sentiment-Analyse der ", source)
    ) +
    theme_minimal()
}

# Topic: demonstrations
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)

plot_sentiment(score_pm_demo, "Demonstrationen", "Pressemeldungen", "blue")
plot_sentiment(score_tw_demo, "Demonstrationen", "Tweets", "blue")

# Topic: drugs. These scores were previously computed from the demonstration
# tables (copy-paste slip), so the plots repeated the demonstration results.
# Assumes the drogen tables have the same `content` / `tweet_text` columns
# as the other topic tables.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

plot_sentiment(score_pm_drogen, "Drogen", "Pressemeldungen", "darkgreen")
plot_sentiment(score_tw_drogen, "Drogen", "Tweets", "darkgreen")

# Topic: racism
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)

plot_sentiment(score_pm_rass, "Rassismus", "Pressemeldungen", "purple")
plot_sentiment(score_tw_rass, "Rassismus", "Tweets", "purple")
```

```{r}
sessionInfo()
```