---
title: "Team 16"
author: "Christian, Simon und Cuca"
date: "23 5 2021"
output: pdf_document
---

```{r setup, include=FALSE}
# Default for all following chunks: echo the chunk's source code in the
# knitted document.
knitr::opts_chunk$set(echo = TRUE)
```

# Daten einlesen
```{r, message = FALSE}
library(tidyverse)
library(stringi)

# Build one CSV path per scraped day. Each month prefix is paired with the
# last day number that is read for it; day numbers are not zero-padded.
pm_csv <- unlist(
  Map(
    function(month, last_day) {
      str_c("data/", month, "-", seq_len(last_day), "_presseportal.csv")
    },
    c("2020-12", "2021-1", "2021-2", "2021-3", "2021-4", "2021-5"),
    c(26, 31, 28, 31, 30, 21)
  ),
  use.names = FALSE
)

# Read every daily file and stack them into a single table.
pm_list <- lapply(pm_csv, read_csv)
pm <- do.call(rbind, pm_list)

# Tweets: keep rows with created_at on or after 2021-04-01 and only the
# first four columns.
tweets <- local({
  all_tweets <- read_csv("data/copbird_table_tweet.csv")
  all_tweets[all_tweets$created_at >= "2021-04-01", 1:4]
})

# Lookup tables used for assigning locations to tweets and press releases.
usersX <- read_csv("data/copbird_table_user_ext.csv")
tweetXstate <- read_csv("data/copbird_table_tweet_ext_state.csv")
blaulicht <- read_csv("data/2020-12_2021-05_presseportal.csv")

# Topic-filtered press releases (pm_*) and tweets (tw_*), one pair per topic.
pm_demo <- read_csv("data/copbird_table_pm_topiced_demonstr.csv")
tw_demo <- read_csv("data/copbird_table_tweet_topiced_demonstr.csv")

pm_drogen <- read_csv("data/copbird_table_pm_topiced_drogen.csv")
tw_drogen <- read_csv("data/copbird_table_tweet_topiced_drogen.csv")

pm_rass <- read_csv("data/copbird_table_pm_topiced_rassis.csv")
tw_rass <- read_csv("data/copbird_table_tweet_topiced_rassis.csv")
```


# Scrapen der Pressemeldungen (seit Dezember 2020)

# Zuordnung von Orten der Pressemeldungen und Tweets
```{r}
# Show the first rows of each lookup table.
usersX %>% head()
tweetXstate %>% select(5:8) %>% head()

# Store the Twitter user id as text before previewing; columns 2 and 5 are
# left out of the preview.
blaulicht <- blaulicht %>% mutate(tw_user_id = as.character(tw_user_id))
blaulicht %>% select(-c(2, 5)) %>% head()
```

# Anzahl Pressemeldungen vs. Tweets
```{r}
# --- Tweets per federal state ------------------------------------------------
# Attach columns 1 and 4 of usersX (presumably user_id and bundesland) to the
# tweets and count rows per state. "-" marks an unknown state and becomes NA.
# NOTE(review): full_join also keeps accounts without any tweet in `tweets`;
# each such account contributes one row and is therefore counted as 1.
# Confirm this is intended.
land_tw <- full_join(tweets, usersX[c(1, 4)], by = "user_id")
land_tw$bundesland[land_tw$bundesland %in% "-"] <- NA_character_
land_tw <- count(land_tw, bundesland)
land_tw$bundesland <- as.factor(land_tw$bundesland)

# --- Press releases per federal state ----------------------------------------
# Normalise the state names BEFORE counting so that "berlin-brandenburg" and
# "berlin" end up in one row instead of two rows carrying the same label.
# Names are title-cased and "ue" is turned into "ü" to match the spelling
# used in the Twitter table.
land_pm <- pm %>%
  mutate(
    bundesland = if_else(
      bundesland == "berlin-brandenburg", "berlin", bundesland
    ),
    bundesland = gsub("ue", "ü", stri_trans_totitle(bundesland))
  ) %>%
  count(bundesland)

# Restrict to the states known from Twitter; any other name becomes NA and is
# dropped below.
land_pm$bundesland <- factor(
  land_pm$bundesland,
  levels = levels(land_tw$bundesland)
)

# --- Combine both sources into long format -----------------------------------
# filter(!is.na(.)) is used instead of `[-which(is.na(.)), ]`, because the
# latter removes every row when there is no NA at all.
land_pm_tw <- land_pm %>%
  full_join(land_tw, by = "bundesland") %>%
  rename(Pressemeldung = n.x, Twitter = n.y) %>%
  filter(!is.na(bundesland)) %>%
  mutate(Pressemeldung = coalesce(Pressemeldung, 0L)) %>%
  pivot_longer(
    cols = -bundesland,
    names_to = "Plattform",
    values_to = "count"
  )

# NOTE(review): `pm` is read from files starting in December 2020, while
# `tweets` is limited to April 2021 onwards, so the subtitle below only
# describes the Twitter side. Confirm whether `pm` should be filtered too.

# Absolute counts, one panel per platform.
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform)) +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  facet_wrap(~Plattform) +
  coord_flip() +
  guides(fill = "none") +
  labs(title = "Anzahl der Pressemeldungen und Tweets", 
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()

# Share of each platform per state (bars scaled to 1).
ggplot(land_pm_tw) +
  geom_col(aes(x = bundesland, y = count, fill = Plattform), position = "fill") +
  scale_fill_manual(values = c("#CC6699", "#0099CC")) +
  coord_flip() +
  labs(title = "Anzahl der Pressemeldungen und Tweets", 
       subtitle = "Im Zeitraum April bis Mai 2021") +
  theme_minimal()
```

# Topic modelling
```{r, message=FALSE}
# NOTE: Every line of this chunk is commented out, so nothing here runs when
# the document is knitted. If re-enabled, the code fits an LDA topic model on
# the press releases and its last line writes the top terms per topic to
# data/topicmodel.csv.
# NOTE(review): rows are filtered on `pm$content`, but the tokens are built
# from `pm$content_ber_satzzeichen` -- confirm that column exists before
# re-enabling.
# library(quanteda)
# library(tidyverse)
# library(topicmodels)
# library(ldatuning)
# library(stm)
# library(wordcloud)
# 
# pm <- pm[!is.na(pm$content), ]
# tok <- tokens(pm$content_ber_satzzeichen)
# mydfm <- dfm(tok, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
# mydfm.trim <-  dfm_trim(mydfm, min_docfreq = 3, max_docfreq = 65)
# # mydfm.trim
# 
# anzahl.themen <- 10
# anzahl.woerter <- 10
# dfm2topicmodels <- convert(mydfm.trim, to = "topicmodels")
# lda.modell <- LDA(dfm2topicmodels, anzahl.themen)
# lda.modell
# topmod <- as.data.frame(terms(lda.modell, anzahl.woerter))
# topmod
# 
# write_csv(topmod, "data/topicmodel.csv")
```

### Auswahl der Keywords
`topic_1 = ['demonstr', 'kundgeb']`

`topic_2 = ['drogen', 'weed', 'graas', 'lsd', 'cannabis', 'ecstasy', 'kokain', 'meth', 'crystal']`

`topic_3 = ['rassis', 'diskriminier', 'ausländerfeindlich', 'fremdenfeindlich', 'fremdenhass']`

`topic_4 = ['antisem', 'juden', 'synagoge', 'judenhass', 'judenfeindlich', 'holocaust']`


# Sentiment-Analyse
```{r}
# Read a lexicon file and return all of its words as one lower-case vector.
#
# In each line the first occurrence of "|<UPPERCASE TAG><TAB><number>" (plus
# an optional trailing tab) is replaced by a comma; the line is then split at
# commas. Presumably the lines follow the SentiWS layout
# "word|POS<TAB>weight<TAB>inflection1,inflection2,..." -- TODO confirm.
#
# filename: path of the lexicon file (read as UTF-8 text).
# Returns a character vector of lower-cased words.
readAndflattenSentiWS <- function(filename) {
  raw_lines <- readLines(filename, encoding = "UTF-8")
  comma_separated <- sub("\\|[A-Z]+\t[0-9.-]+\t?", ",", raw_lines)
  tolower(unlist(strsplit(comma_separated, ",")))
}

# Positive and negative dictionaries. Each combines the whitespace-separated
# raw tokens of the lexicon file (lines starting with ";" are skipped as
# comments) with the flattened, lower-cased word list from
# readAndflattenSentiWS().
pos.words <- c(
  scan(
    "SentiWS/positive-words.txt",
    what = "character", comment.char = ";", quiet = TRUE
  ),
  readAndflattenSentiWS("SentiWS/positive-words.txt")
)
neg.words <- c(
  scan(
    "SentiWS/negative-words.txt",
    what = "character", comment.char = ";", quiet = TRUE
  ),
  readAndflattenSentiWS("SentiWS/negative-words.txt")
)

# Score texts by counting dictionary hits.
#
# Each text is cleaned (punctuation and digits removed, control characters
# such as newlines and tabs turned into spaces, everything lower-cased), split
# into words at whitespace, and scored as
#   (number of words found in pos.words) - (number found in neg.words).
#
# Only base R is used. In particular plyr is not attached, so calling this
# function does not mask dplyr verbs such as count() or mutate() for the rest
# of the session.
#
# sentences: character vector of texts to score.
# pos.words: character vector of positive terms (expected in lower case).
# neg.words: character vector of negative terms (expected in lower case).
# .progress: accepted for backward compatibility; currently ignored.
#
# Returns a data.frame with one row per text: `score` (integer) and `text`
# (the original input). NA or empty texts get a score of 0.
score.sentiment <- function(sentences, pos.words, neg.words,
                            .progress = "none") {
  score_one <- function(sentence) {
    cleaned <- gsub("[[:punct:]]", "", sentence)
    # Newlines/tabs separate words, so they must become spaces; deleting
    # them would glue the neighbouring words together.
    cleaned <- gsub("[[:cntrl:]]", " ", cleaned)
    cleaned <- gsub("\\d+", "", cleaned)
    cleaned <- tolower(cleaned)

    words <- unlist(strsplit(cleaned, "\\s+"))
    # Drop NA and the empty tokens produced by leading whitespace.
    words <- words[!is.na(words) & nzchar(words)]

    # sum() over logicals counts the TRUE values.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}

# Sentiment scores for the demonstration topic: press releases and tweets.
score_pm_demo <- score.sentiment(
  sentences = pm_demo$content,
  pos.words = pos.words,
  neg.words = neg.words
)
score_tw_demo <- score.sentiment(
  sentences = tw_demo$tweet_text,
  pos.words = pos.words,
  neg.words = neg.words
)

# Distribution of the scores, one bar per score value.
ggplot(score_pm_demo, aes(x = score)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Topic: Demonstrationen",
    subtitle = "Sentiment-Analyse der Pressemeldungen"
  ) +
  theme_minimal()

ggplot(score_tw_demo, aes(x = score)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Topic: Demonstrationen",
    subtitle = "Sentiment-Analyse der Tweets"
  ) +
  theme_minimal()

# Sentiment scores for the drug topic. The inputs must be the drug-topic
# tables (pm_drogen / tw_drogen), not the demonstration tables; otherwise
# the plots below would just repeat the demonstration results.
# NOTE(review): assumes pm_drogen has a `content` column and tw_drogen a
# `tweet_text` column, like the other topic tables -- confirm.
score_pm_drogen <- score.sentiment(pm_drogen$content, pos.words, neg.words)
score_tw_drogen <- score.sentiment(tw_drogen$tweet_text, pos.words, neg.words)

# Distribution of the scores, one bar per score value.
ggplot(score_pm_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Pressemeldungen") +
  theme_minimal()

ggplot(score_tw_drogen) +
  geom_bar(aes(x = score), fill = "darkgreen") +
  labs(title = "Topic: Drogen", subtitle = "Sentiment-Analyse der Tweets") +
  theme_minimal()

# Sentiment scores for the racism topic: press releases and tweets.
score_pm_rass <- score.sentiment(
  sentences = pm_rass$content,
  pos.words = pos.words,
  neg.words = neg.words
)
score_tw_rass <- score.sentiment(
  sentences = tw_rass$tweet_text,
  pos.words = pos.words,
  neg.words = neg.words
)

# Distribution of the scores, one bar per score value.
ggplot(score_pm_rass, aes(x = score)) +
  geom_bar(fill = "purple") +
  labs(
    title = "Topic: Rassismus",
    subtitle = "Sentiment-Analyse der Pressemeldungen"
  ) +
  theme_minimal()

ggplot(score_tw_rass, aes(x = score)) +
  geom_bar(fill = "purple") +
  labs(
    title = "Topic: Rassismus",
    subtitle = "Sentiment-Analyse der Tweets"
  ) +
  theme_minimal()
```

```{r}
# Print the R version, platform and attached/loaded packages so the knitted
# report documents the environment it was produced in.
sessionInfo()
```