## This script makes a word cloud.
## Steps are adapted from:
## https://georeferenced.wordpress.com/2013/01/15/rwordcloud/
## http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know

# We'll use the packages "tm" and "wordcloud"
# If they're not already installed, be sure to install them
library(tm)
library(wordcloud)

# Load the documents ----

# I have all the text I'm interested in in a folder called "WordCloudText"
# (Word clouds can be created for as many or as few documents as you need)
# Use the command below to load everything in the directory
# into a "corpus" (collection of documents)
# Note that this assumes I am already in the directory one above
# the folder called "WordCloudText" and am now using relative paths
dat_orig <- Corpus(DirSource("WordCloudText/"))

# What does dat_orig look like?
str(dat_orig)

# To check out the corpus, use inspect()
inspect(dat_orig)

# Clean up the text ----

# Work on a copy so the untouched corpus stays available in dat_orig
dat <- dat_orig

# Strip whitespace
dat <- tm_map(dat, stripWhitespace)

# Convert words to lowercase
# tolower() is a base R function, not a tm transformation, so it is wrapped
# in content_transformer(); that keeps the documents as tm text documents
# and lets TermDocumentMatrix() below work for any corpus class
dat <- tm_map(dat, content_transformer(tolower))

# Remove stop words (i.e., "the", "a", "an", "yet", "so")
# (done before punctuation removal so that stop words containing
# apostrophes can still match)
dat <- tm_map(dat, removeWords, stopwords("english"))

# Text stemming - converts derived words to their base
# (e.g., fishing/fished/fisher --> fish)
# has some hiccups (argues/arguing --> argu, which isn't what we want)
# dat <- tm_map(dat, stemDocument)

# Remove punctuation
dat <- tm_map(dat, removePunctuation)

# Remove numbers
dat <- tm_map(dat, removeNumbers)

# Count how often each term is used ----

# Turn the dat corpus into a term document matrix
dat_tdm <- TermDocumentMatrix(dat)
dat_tdm
str(dat_tdm)

# Turn term document matrix into true matrix
dat_m <- as.matrix(dat_tdm)
head(dat_m)

# Find the row sums (i.e. total number of times a word is used);
# this will be a vector
dat_vec <- rowSums(dat_m)
head(dat_vec)

# Sort this vector so most abundant terms are first
dat_vec_sort <- sort(dat_vec, decreasing = TRUE)
head(dat_vec_sort)

# Turn this into a data frame, with names of vector as "word"
# and "freq" as value
str(dat_vec_sort)
dat_ready <- data.frame(
  word = names(dat_vec_sort),
  freq = dat_vec_sort,
  stringsAsFactors = FALSE
)
head(dat_ready, 10)

#
# ----> Alternative method using the tidyverse/tidytext <----#
library(dplyr)
library(tidytext)

# First, make the term document matrix "tidy"
dat_tidy <- tidy(dat_tdm)
head(dat_tidy)

# Next, we group by the term and find the sum of count for that term
# We then arrange so that the most abundant terms are first
dat_tidy_sums <- dat_tidy %>%
  group_by(term) %>%
  summarize(word_sums = sum(count, na.rm = TRUE)) %>%
  arrange(desc(word_sums)) %>%
  as.data.frame()
head(dat_tidy_sums)
#
# ----> end alternative tidyverse/tidytext method <----#

# Build the word cloud ----

# Now we build our word cloud with greater control
# First, use set.seed so each time we make the word cloud it is identical
set.seed(1234)

# Make the word cloud, adjusting the defaults for greater control
# Example: set minimum frequency so getting words that are used >= 15 times
# See options using ?wordcloud
wordcloud(
  words = dat_ready$word,
  freq = dat_ready$freq,
  min.freq = 15,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# This is looking pretty good but I see some words I don't want in the cloud
# These slipped through earlier, but I can manually remove them
# Going back to dat and using removeWords
# (Note: if you're comfortable with the tidyverse, you could use filter() to
# remove these words from dat_ready)
extra_stop_words <- c(
  "included", "provided", "many", "higher", "used", "using", "also",
  "within", "across", "can", "will", "may", "one", "including", "doc",
  "multiple", "unique", "example", "comid", "need", "among", "lower",
  "density", "primary", "however", "set", "overall", "tdn", "low",
  "cover", "sampleid", "hampton", "tblresultscsv", "nhdplusv", "well"
)
dat <- tm_map(dat, removeWords, extra_stop_words)

# Remake the term document matrix, real matrix, vector, etc. to get dataframe
dat_tdm <- TermDocumentMatrix(dat)
dat_m <- as.matrix(dat_tdm)
dat_vec <- rowSums(dat_m)
dat_vec_sort <- sort(dat_vec, decreasing = TRUE)
dat_ready <- data.frame(
  word = names(dat_vec_sort),
  freq = dat_vec_sort,
  stringsAsFactors = FALSE
)
head(dat_ready, 10)

# Try making the word cloud again, adjusting a few settings
set.seed(20)
wordcloud(
  words = dat_ready$word,
  freq = dat_ready$freq,
  min.freq = 12,
  max.words = 125,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)

# Play with settings until get a cloud you like
# Can adjust colors, min.freq, max.words, rotation percentage, etc.

# Save the final cloud ----

# Finally, save to png
# Everything drawn between png() and dev.off() goes to the file
png("recent_ms_word_cloud.png", width = 4, height = 4, units = "in", res = 400)
set.seed(20)
wordcloud(
  words = dat_ready$word,
  freq = dat_ready$freq,
  min.freq = 12,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  # family = "serif", font = 3,
  colors = brewer.pal(8, "Dark2")
)
dev.off()

# I can change the font using the family and font
# within the wordcloud command

# Ideas for next steps: fit a word cloud to a specified shape
# https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html