## In this assignment we will use R to create word clouds based on two speeches.
## The first is Reagan's 1981 inaugural address and the second is Trump's 2017
## inaugural address.
# Each inaugural address will be treated as a corpus; together they are our
# `corpora'.

## First the required packages:
##   tm        -- text mining package for prepping the corpus
##   wordcloud -- the wordcloud package
## Each package is installed only when it is missing, so re-running this script
## does not download and reinstall it every time.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm", repos = "http://cran.us.r-project.org")
}
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "http://cran.us.r-project.org")
}

library(tm)
library(wordcloud)

## Reagan's 1981 inaugural address is from the American Presidency Project:
## http://www.presidency.ucsb.edu/ws/index.php?pid=43130
## The speech is in a text file. Make sure you have 'reaganspeech.txt'
## downloaded to your working directory.
### If you haven't already, save this Assignment2.R file --- and the text
### files --- to the same directory.
### Then on the RStudio drop-down menu, go to
### Session --> Set Working Directory --> To Source File Location

getwd() # list the working directory
dir()   # list contents of the current working directory

## Below we use the readLines() function to read the lines of the speech and
## assign the contents of the speech to an R object, called Reagan.
Reagan <- readLines(con = "reaganspeech.txt")
## You might see a warning message about an incomplete final line or the
## absence of an end of line delimiter, but that's OK.

## Note: Run the name of the Reagan object in R to observe what it looks like:
Reagan

## We use the wordcloud() function to run the word cloud and observe the
## results!
wordcloud(Reagan,
          scale = c(3, 0.5),
          max.words = 100,
          random.order = FALSE,
          rot.per = 0.20)
## Click `Zoom' to see the cloud a bit bigger!

## A closer look at the options. Change the values to observe the results.
##   scale=c()           sets the range of font sizes: the first value is the
##                       largest text, the second the smallest.
##   max.words=          specifies the number of words to display.
##   random.order=FALSE  specifies an ordered appearance. Try TRUE.
##   rot.per=.20         specifies the proportion of the words that are
##                       rotated. Range is 0 to 1.
## See the helpfile for wordcloud for a more in-depth explanation:
help(wordcloud)

## We'll try different colors:
## We will alter the colors with another package, RColorBrewer.
## You should have already seen this message:
##   'Loading required package: RColorBrewer'
## This means it is already installed. If it isn't, use:
##   install.packages("RColorBrewer", repos='http://cran.us.r-project.org')
##   library(RColorBrewer)
## color cheat sheet:
## https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf
## The brewer.pal() function sets the number of colors and the palette.
## I like the first 5 colors from Set1:
wordcloud(Reagan,
          min.freq = 2,
          max.words = 100,
          random.order = FALSE,
          rot.per = 0.20,
          scale = c(2, 0.5),
          colors = brewer.pal(5, "Set1"))
## With the colors=brewer.pal(5, 'Set1') option, we select 5 colors to plot,
## from 'Set1'. See the cheat sheet for different colors, or just use Set1 as
## I did.
## There's also one additional option set, the minimum frequency of words to
## plot:
##   min.freq=2
## This means that only words mentioned at least two times will be included in
## the word cloud.
##   min.freq=5 would mean only words mentioned at least five times would be
##   included.

## Now on to Trump
Trump <- readLines(con = "trumpspeech.txt")
## Again, ignore the Warning message.

# Same wordcloud settings as Reagan. scale is passed explicitly so that the two
# clouds really are drawn with the same font-size range and can be compared.
wordcloud(Trump,
          min.freq = 2,
          max.words = 100,
          random.order = FALSE,
          rot.per = 0.20,
          scale = c(2, 0.5),
          colors = brewer.pal(5, "Set1"))

## These word clouds are generated automatically.
### Notice that the word ''will'' appears in the clouds; the cleaning steps
### that follow remove it along with the other stopwords.

## Below we clean the corpus with the tm package. We want to learn how to
## prepare a `corpus' for analysis.
## When we study the Federalist Papers later in the semester, these skills will
## be important to understand.
## With more data cleaning prior to plotting, we can get better results for the
## wordcloud.
# Using the Corpus() function to create the Reagan speech corpus for the tm
# package. Corpus() reads in the text in the required format.
Reagancorp <- Corpus(VectorSource(Reagan))

## We perform a series of functions on the corpus to prepare it for analysis.

# converting all characters to lower case
Reagancorp <- tm_map(Reagancorp, content_transformer(tolower))

## Next we will remove all `stopwords' from the corpus. Stopwords are the most
## commonly used words, but less illustrative of themes in text.
## A list of stopwords is here:
## http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop
## `Stopwords' are also referred to as 'function words'.
## A short explanation is here:
## http://www.myenglishpages.com/site_php_files/vocabulary-lesson-function-words.php

# Using the list of English stopwords from the tm package, plus the word "will"
Reagancorp <- tm_map(Reagancorp, removeWords, c(stopwords("english"), "will"))

# taking out punctuation marks
Reagancorp <- tm_map(Reagancorp, removePunctuation)

# collapsing the extra whitespace left behind by the removals above
Reagancorp <- tm_map(Reagancorp, stripWhitespace)

# Build the term-document matrix and print its summary.
ReagancorpTD <- TermDocumentMatrix(Reagancorp)
ReagancorpTD

## Now, prior to plotting the new word cloud, we construct a data matrix.
## Note: the object below shares its name with base R's matrix() function.
## Calls such as matrix(1:4, 2) still work, because R looks for a function
## when a name is called.
matrix <- as.matrix(ReagancorpTD)
v <- sort(rowSums(matrix), decreasing = TRUE) # total count per word, largest first
d <- data.frame(word = names(v), freq = v)

# Look at the contents -- two columns, one titled 'word' the other 'freq' for
# frequency
head(d, 10)  # first 10 rows
head(d, 100) # first 100 rows

## Reagan's word cloud below. Notice any differences from the prior cloud?
# Fix the random seed first so the layout is the same on every run, matching
# what is done for the Trump cloud later in this file.
set.seed(369)
wordcloud(words = d$word,
          freq = d$freq,
          min.freq = 2,
          max.words = 100,
          random.order = FALSE,
          rot.per = 0.20,
          scale = c(2, 0.5),
          colors = brewer.pal(5, "Set1"))

#### Now we repeat the same steps, but with Trump.
## Now on to Trump with the tm package.
## The cleaning steps are applied in this order: lower-casing, stopword
## removal, punctuation removal, whitespace clean-up.
Trumpcorp <- Corpus(VectorSource(Trump))

# converting to lower case
Trumpcorp <- tm_map(
  Trumpcorp,
  content_transformer(tolower)
)

# dropping the English stopwords from tm, plus the word "will"
Trumpcorp <- tm_map(
  Trumpcorp,
  removeWords,
  c(stopwords("english"), "will")
)

# stripping punctuation, then the extra whitespace that is left over
Trumpcorp <- tm_map(Trumpcorp, removePunctuation)
Trumpcorp <- tm_map(Trumpcorp, stripWhitespace)

# Term-document matrix -> plain matrix -> per-word totals, largest first
TrumpcorpTD <- TermDocumentMatrix(Trumpcorp)
matrix2 <- as.matrix(TrumpcorpTD)
v2 <- sort(rowSums(matrix2), decreasing = TRUE)

# Two columns: 'word' and its frequency 'freq'
d2 <- data.frame(word = names(v2), freq = v2)
head(d2, 10) # first 10 rows

# Seed the random number generator so the cloud layout is repeatable.
set.seed(369)
wordcloud(
  words = d2$word,
  freq = d2$freq,
  min.freq = 2,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.20,
  scale = c(2, 0.5),
  colors = brewer.pal(5, "Set1")
)
###########################################################################