## In this assignment we will use R to create Word Clouds based on two speeches.
## The first is Reagan's 1981 inaugural address and the second is Trump's 2017 inaugural address.

# Each inaugural address will be treated as a corpus; together they form our `corpora'.

## First the required packages: 
## Install each package only when it is not already available, so that
## re-running the script does not download and reinstall it every time.
## Packages are fetched over HTTPS rather than plain HTTP.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm", repos = "https://cran.us.r-project.org") # text mining package for prepping the corpus
}
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "https://cran.us.r-project.org") # the wordcloud package
}
library(tm)
library(wordcloud)

## Reagan's 1981 inaugural address is from the American Presidency Project: 
## http://www.presidency.ucsb.edu/ws/index.php?pid=43130

## The speech is in a textfile. Make sure you have 'reaganspeech.txt' downloaded to your working directory.
### If you haven't already, save this Assignment2.R file --- and the text files --- to the same directory.
### Then on the RStudio drop-down menu, go to Session --> Set Working Directory --> To Source File Location

# Show the current working directory
getwd()

# Show the files found in the current working directory
list.files()

## readLines() reads the speech from the text file, one element per line.
##    The result is stored in an R object called Reagan.

Reagan <- readLines("reaganspeech.txt")
## A warning about an incomplete final line (no end-of-line delimiter) may appear here; it is safe to ignore.

## Note: Run the name of the Reagan object in R to observe what it looks like:
Reagan

## We use the wordcloud() function to draw the word cloud and observe the results!
## wordcloud() uses random numbers when laying out the words, so we fix the
## random seed first; the same cloud is then drawn on every run.
set.seed(369)
wordcloud(Reagan, scale = c(3, 0.5), max.words = 100, random.order = FALSE,
          rot.per = .20)

## Click `Zoom' to see the cloud a bit bigger!

## A closer look at the options. Change the values to observe the results.
## scale=c(largest, smallest) specifies the range of font sizes, from the largest text to the smallest.
## max.words= specifies the maximum number of words to display
## random.order=FALSE plots the words in decreasing order of frequency. Try TRUE
## rot.per=.20 specifies the proportion of words that are rotated.  Range is 0 to 1.

## The help page for wordcloud explains every option in more depth:
help("wordcloud")


## We'll try different colors: 

## We will alter the colors with another package, RColorBrewer
## You should have already seen this message: 
## 'Loading required package: RColorBrewer'
## This means it is already installed. If it isn't, use:
## install.packages("RColorBrewer", repos='http://cran.us.r-project.org')
## library(RColorBrewer)
## color cheat sheet: 
## https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf

## brewer.pal() function sets number of colors and palette
## I like first 5 colors from Set1.
## The random seed is fixed first so the layout is the same on every run.
set.seed(369)
wordcloud(Reagan, min.freq = 2, max.words = 100, random.order = FALSE,
          rot.per = .20, scale = c(2, 0.5), colors = brewer.pal(5, 'Set1'))

## With the colors=brewer.pal(5, 'Set1') option, we select 5 colors to plot, from 'Set1'. See the cheat sheet
## for different colors, or just use Set1 as I did. 

## There's also one additional option set, the minimum frequency of words to plot
## min.freq=2 
## This means that only words mentioned at least two times will be included in the word cloud
## min.freq=5 would mean only words mentioned at least five times would be included

## Now on to Trump
Trump <- readLines(con = "trumpspeech.txt")
## Again, ignore the Warning message.

# Same wordcloud settings as Reagan, including scale=c(2,0.5), so that the two
# clouds are drawn with the same range of font sizes and can be compared.
# The random seed is fixed first so the layout is the same on every run.
set.seed(369)
wordcloud(Trump, min.freq = 2, max.words = 100, random.order = FALSE,
          rot.per = .20, scale = c(2, 0.5), colors = brewer.pal(5, 'Set1'))

## These word clouds are generated automatically.

### Notice how often the word ''will'' appears in the word clouds. Below we remove it, together with the standard stopwords.

## Below we clean the corpus with the tm package. We want to learn how to prepare a `corpus' for analysis. 
## When we study the Federalist Papers later in the semester, these skills will be important to understand.

## With more data cleaning prior to plotting, we can get better results for the wordcloud.

# Build a corpus for the tm package from the lines of the Reagan speech.
# Corpus() reads in the text in the format tm requires.
Reagancorp <- Corpus(VectorSource(Reagan))

## The corpus is then passed through a series of cleaning steps.

# Step 1: convert every character to lower case
Reagancorp <- tm_map(Reagancorp, content_transformer(tolower))

## Step 2: remove all `stopwords' from the corpus. Stopwords are the most commonly used words, but less illustrative of
## themes in text.  A list of stopwords is here: http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop
## `Stopwords' are also referred to as 'function words'.
## A short explanation is here:
## http://www.myenglishpages.com/site_php_files/vocabulary-lesson-function-words.php

# We use the English stopword list that ships with tm, plus the word "will"
Reagancorp <- tm_map(
  Reagancorp,
  removeWords,
  c(stopwords("english"), "will")
)

# Step 3: remove punctuation marks
Reagancorp <- tm_map(Reagancorp, removePunctuation)

# Step 4: collapse runs of whitespace
Reagancorp <- tm_map(Reagancorp, stripWhitespace)

# Step 5: build the term-document matrix from the cleaned corpus
ReagancorpTD <- TermDocumentMatrix(Reagancorp)

ReagancorpTD

## Before plotting the new word cloud, we turn the term-document matrix into
## a table of words and their total frequencies.

matrix <- as.matrix(ReagancorpTD)
v <- sort(rowSums(matrix), decreasing = TRUE) # total count per word, most frequent first
d <- data.frame(word = names(v), freq = v)

# Inspect the table -- two columns, 'word' and 'freq' (frequency)
head(d, n = 10) # first 10 rows

head(d, n = 100) # first 100 rows

## Reagan's word cloud below. Notice any differences from the prior cloud?
## The random seed is fixed first so the layout is the same on every run.
set.seed(369)
wordcloud(words = d$word, freq = d$freq, min.freq = 2,
          max.words = 100, random.order = FALSE, rot.per = 0.20,
          scale = c(2, 0.5), colors = brewer.pal(5, "Set1"))

#### The same cleaning steps are now repeated for Trump.

## Trump's speech with the tm package
Trumpcorp <- Corpus(VectorSource(Trump))

# convert every character to lower case
Trumpcorp <- tm_map(Trumpcorp, content_transformer(tolower))

# remove the English stopwords from tm, plus the word "will"
Trumpcorp <- tm_map(
  Trumpcorp,
  removeWords,
  c(stopwords("english"), "will")
)

# remove punctuation marks, then collapse runs of whitespace
Trumpcorp <- tm_map(Trumpcorp, removePunctuation)
Trumpcorp <- tm_map(Trumpcorp, stripWhitespace)

# build the term-document matrix from the cleaned corpus
TrumpcorpTD <- TermDocumentMatrix(Trumpcorp)

# table of words and their total frequencies, most frequent first
matrix2 <- as.matrix(TrumpcorpTD)
v2 <- sort(rowSums(matrix2), decreasing = TRUE)
d2 <- data.frame(word = names(v2), freq = v2)
head(d2, n = 10) # first 10 rows

# Trump's word cloud from the cleaned frequency table; the seed is set
# immediately before the wordcloud() call.
set.seed(369)
wordcloud(
  words = d2$word,
  freq = d2$freq,
  min.freq = 2,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.20,
  scale = c(2, 0.5),
  colors = brewer.pal(5, "Set1")
)
###########################################################################