---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Reading in the data
```{r, message = FALSE}
library(tidyverse)
library(stringi)

# Daily CSVs of police press releases scraped from presseportal.de
# (scrape period: December 2020 to 21 May 2021)
pm_csv <- str_c("data/2020-12-", 1:26, "_presseportal.csv")
pm_csv <- c(pm_csv, str_c("data/2021-1-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-2-", 1:28, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-3-", 1:31, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-4-", 1:30, "_presseportal.csv"))
pm_csv <- c(pm_csv, str_c("data/2021-5-", 1:21, "_presseportal.csv"))
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets from the CopBird export: keep tweets from 1 April 2021 onward, first four columns
tweets <- read_csv("data/copbird_table_tweet.csv")
tweets <- tweets[tweets$created_at >= "2021-04-01", 1:4]

# Extended user and tweet tables plus the combined press-release table ("blaulicht")
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Topic-filtered subsets (demonstrations, drugs, racism) of press releases and tweets
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")
pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")
pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```
# Scraping the press releases (since December 2020)
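The scraping step itself is not part of this notebook; only its output (the daily `*_presseportal.csv` files) is read in above. For orientation, a minimal, hypothetical sketch with `rvest` is shown below. The listing URL of the presseportal "Blaulicht" section and the CSS selectors are assumptions and almost certainly differ from the pipeline that actually produced the CSVs.

```{r, eval=FALSE}
# Sketch only: scraping police press releases ("Blaulicht") from presseportal.de.
# URL and CSS selectors are assumptions, not the scraper actually used for the data.
library(rvest)

scrape_blaulicht_page <- function(url) {
  page  <- read_html(url)
  items <- html_elements(page, "article")   # one node per press release (selector assumed)
  tibble(
    headline = html_text2(html_element(items, "h3")),
    teaser   = html_text2(html_element(items, "p"))
  )
}

# Hypothetical call; the real pipeline wrote one CSV per day into data/
# pm_raw <- scrape_blaulicht_page("https://www.presseportal.de/blaulicht")
# write_csv(pm_raw, "data/2021-05-21_presseportal.csv")
```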
# Matching press releases and tweets to locations
```{r}
# Extended user table (includes a bundesland column per account, used below)
head(usersX)
# Tweet extension table with state assignment (showing columns 5 to 8)
head(tweetXstate[, 5:8])
# Combined press-release table; treat the linked Twitter user id as character rather than numeric
blaulicht$tw_user_id <- as.character(blaulicht$tw_user_id)
head(blaulicht[, -c(2, 5)])
```
# Number of press releases vs. tweets
```{r}
# Tweets per bundesland: join tweets to the extended user table and count
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
land_tw$bundesland[land_tw$bundesland == "-"] <- NA_character_
land_tw <- land_tw %>% group_by(bundesland) %>% count()
land_tw$bundesland <- as.factor(land_tw$bundesland)

# Press releases per bundesland; harmonise the spelling with the tweet table
# (title case, "ue" -> "ü", "berlin-brandenburg" counted as "berlin")
land_pm <- pm %>% group_by(bundesland) %>% count()
land_pm$bundesland[land_pm$bundesland == "berlin-brandenburg"] <- "berlin"
land_pm$bundesland <- stri_trans_totitle(land_pm$bundesland)
land_pm$bundesland <- gsub("ue", "ü", land_pm$bundesland)
land_pm$bundesland <- factor(land_pm$bundesland, levels = levels(land_tw$bundesland))

# Combine both counts, drop rows without a bundesland, and reshape to long format
land_pm_tw <- full_join(land_pm, land_tw, by = "bundesland")
names(land_pm_tw)[2:3] <- c("Pressemeldung", "Twitter")
land_pm_tw <- land_pm_tw[-which(is.na(land_pm_tw$bundesland)), ]
land_pm_tw$Pressemeldung[which(is.na(land_pm_tw$Pressemeldung))] <- 0
land_pm_tw <- gather(land_pm_tw, key = "Plattform", value = "count", -bundesland)
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  guides(fill = "none") +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets",
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```
# Topic modelling
```{r, message=FALSE}
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
#
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <- dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
#
# anzahl.themen <- 10
# anzahl.woerter <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
#
# write_csv(topmod, "data/topicmodel.csv")
```
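The chunk above is kept commented out; the LDA fit was apparently run once and its top terms written to `data/topicmodel.csv`. It also loads `ldatuning` without using it. A minimal sketch of how that package could be used to justify the choice of `anzahl.themen = 10` is given below; it assumes the document-term matrix `dfm2topicmodels` from the commented-out chunk exists.

```{r, eval=FALSE}
# Sketch only: choosing the number of topics with ldatuning, assuming the
# document-term matrix dfm2topicmodels from the chunk above has been built.
library(ldatuning)

tuning <- FindTopicsNumber(
  dfm2topicmodels,
  topics  = seq(5, 30, by = 5),
  metrics = c("CaoJuan2009", "Deveaud2014"),
  method  = "Gibbs",
  control = list(seed = 1234)
)
FindTopicsNumber_plot(tuning)
```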
### Keyword selection
`topic_1 = ['demonstr', 'kundgeb']`
`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`
`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`
`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`
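The topic-specific tables loaded in the first chunk (`copbird_table_pm_topiced_*` / `copbird_table_tweet_topiced_*`) were presumably built by matching these stems against the texts. A minimal sketch of such a filter in R follows; the actual filtering was done upstream, and the column names `content` and `tweet_text` are assumptions taken from how those tables are used later in this document.

```{r, eval=FALSE}
# Sketch only: reproducing a topic subset from the keyword stems above.
# Assumes pm$content holds the press-release text and the tweet text column
# is called tweet_text, as in the pre-filtered tables.
topic_demonstr <- c("demonstr", "kundgeb")

matches_topic <- function(text, stems) {
  str_detect(str_to_lower(text), str_c(stems, collapse = "|"))
}

pm_demo_check <- pm %>% filter(matches_topic(content, topic_demonstr))
tw_demo_check <- tweets %>% filter(matches_topic(tweet_text, topic_demonstr))
```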
# Sentiment analysis
```{r}
# SentiWS entries look like "Wort|NN<TAB>0.0040<TAB>Flexionsformen,...";
# strip the POS tag and weight, keep the word and its inflected forms
readAndflattenSentiWS <- function(filename) {
  words <- readLines(filename, encoding = "UTF-8")
  words <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", words)
  words <- unlist(strsplit(words, ","))
  words <- tolower(words)
  return(words)
}

pos.words <- c(scan("SentiWS/positive-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/positive-words.txt"))
neg.words <- c(scan("SentiWS/negative-words.txt", what = 'character', comment.char = ';', quiet = TRUE),
               readAndflattenSentiWS("SentiWS/negative-words.txt"))
# Dictionary-based scorer: number of positive minus number of negative words per text
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  require(plyr)
  require(stringr)
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence <- tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list <- str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words <- unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    pos.matches <- match(words, pos.words)
    neg.matches <- match(words, neg.words)
    # match() returns the position of the matched term or NA;
    # we only need TRUE/FALSE for whether a word matched
    pos.matches <- !is.na(pos.matches)
    neg.matches <- !is.na(neg.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score <- sum(pos.matches) - sum(neg.matches)
    return(score)
  },
  pos.words, neg.words, .progress = .progress)
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
# Sentiment scores per topic, separately for press releases and tweets
score_pm_demo <- score.sentiment(pm_demo$content, pos.words, neg.words)
score_tw_demo <- score.sentiment(tw_demo$tweet_text, pos.words, neg.words)

ggplot(score_pm_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_demo) +
  geom_bar(aes(x = score), fill = "blue") +
  labs(title = "Topic: Demonstrationen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

ggplot(score_pm_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
score_pm_rass <- score.sentiment(pm_rass$content, pos.words, neg.words)
score_tw_rass <- score.sentiment(tw_rass$tweet_text, pos.words, neg.words)

ggplot(score_pm_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()
ggplot(score_tw_rass) +
  geom_bar(aes(x = score), fill = "purple") +
  labs(title = "Topic: Rassismus", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()
```
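The histograms above show the raw score distributions. As an optional addition (not part of the original analysis), the per-text scores computed above could be condensed into one summary table to compare press releases and tweets across topics:

```{r, eval=FALSE}
# Optional summary sketch: mean sentiment score per topic and platform,
# built from the score data frames computed in the previous chunk.
bind_rows(
  tibble(topic = "Demonstrationen", platform = "Pressemeldung", score = score_pm_demo$score),
  tibble(topic = "Demonstrationen", platform = "Twitter",       score = score_tw_demo$score),
  tibble(topic = "Drogen",          platform = "Pressemeldung", score = score_pm_drogen$score),
  tibble(topic = "Drogen",          platform = "Twitter",       score = score_tw_drogen$score),
  tibble(topic = "Rassismus",       platform = "Pressemeldung", score = score_pm_rass$score),
  tibble(topic = "Rassismus",       platform = "Twitter",       score = score_tw_rass$score)
) %>%
  group_by(topic, platform) %>%
  summarise(mean_score = mean(score), n = n(), .groups = "drop")
```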
```{r}
sessionInfo()
```