2020-11-11 01:02:22 +03:00
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
|
2021-03-31 16:05:23 +03:00
|
|
|
#' @title Generate a wordcloud with meeting subject lines
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
|
|
|
#' @description
|
|
|
|
#' Generate a wordcloud with the meeting query.
|
|
|
|
#' This is a sub-function that feeds into `meeting_tm_report()`.
|
|
|
|
#'
|
|
|
|
#' @details
|
2021-03-31 16:05:23 +03:00
|
|
|
#' Uses the 'ggwordcloud' package for the underlying implementation, thus
|
|
|
|
#' returning a 'ggplot' object. Additional layers can be added onto the plot
|
|
|
|
#' using a ggplot `+` syntax.
|
2020-10-27 00:21:24 +03:00
|
|
|
#' It is recommended not to display more than 100 words in a word cloud.
|
|
|
|
#'
|
|
|
|
#' @details
|
|
|
|
#' This function uses `tm_clean()` as the underlying data wrangling function.
|
2021-03-05 17:05:00 +03:00
|
|
|
#' There is an option to remove stopwords by passing a data frame into the
|
|
|
|
#' `stopwords` argument.
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
|
|
|
#' @param data A Meeting Query dataset in the form of a data frame.
|
2021-03-05 17:05:00 +03:00
|
|
|
#' @param stopwords A single-column data frame labelled `'word'` containing
|
|
|
|
#' custom stopwords to remove.
|
2020-10-27 00:21:24 +03:00
|
|
|
#' @param seed A numeric value used to set the seed for random number generation.
|
|
|
|
#' @param keep A numeric value specifying the maximum number of words to keep.
|
2021-03-31 16:05:23 +03:00
|
|
|
#' @param return String specifying what to return. This must be one of the
|
|
|
|
#' following strings:
|
|
|
|
#' - `"plot"`
|
|
|
|
#' - `"table"`
|
|
|
|
#'
|
|
|
|
#' See `Value` for more information.
|
2021-03-05 17:05:00 +03:00
|
|
|
#' @param ... Additional parameters to be passed to
|
|
|
|
#' `ggwordcloud::geom_text_wordcloud()`
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
2021-03-31 16:05:23 +03:00
|
|
|
#' @return
|
|
|
|
#' A different output is returned depending on the value passed to the `return`
|
|
|
|
#' argument:
|
|
|
|
#' - `"plot"`: 'ggplot' object containing a word cloud.
|
|
|
|
#' - `"table"`: data frame returning the data used to generate the word cloud.
|
|
|
|
#'
|
2020-10-27 00:21:24 +03:00
|
|
|
#' @import dplyr
|
|
|
|
#' @examples
|
|
|
|
#' tm_wordcloud(mt_data, keep = 30)
|
|
|
|
#'
|
|
|
|
#' @family Text-mining
|
|
|
|
#'
|
|
|
|
#' @export
|
|
|
|
|
|
|
|
tm_wordcloud <- function(data,
                         stopwords = NULL,
                         seed = 100,
                         keep = 100,
                         return = "plot",
                         ...) {

  # Counts the words produced by `tm_clean()` for `data`, keeps at most `keep`
  # of the most frequent ones, and returns either a word cloud plot
  # (`return = "plot"`) or the frequency table behind it (`return = "table"`).
  # `...` is forwarded to `ggwordcloud::geom_text_wordcloud()`.

  # Validate `return` before doing any work, so an invalid value fails
  # immediately rather than after the data has been cleaned and counted.
  if (!is.character(return) ||
      length(return) != 1L ||
      !(return %in% c("plot", "table"))) {
    stop("Please enter a valid input for `return`.")
  }

  # `keep` is used to build a row index below, so it must be a single,
  # non-missing, non-negative number.
  if (!is.numeric(keep) || length(keep) != 1L || is.na(keep) || keep < 0) {
    stop("`keep` must be a single non-negative number.")
  }

  # Seeds R's global random number generator.
  # NOTE(review): presumably so that the word placement is reproducible when
  # the plot is rendered straight after this call - confirm.
  set.seed(seed)

  # Tokenise into single words; `stopwords` is handed to `tm_clean()`.
  clean_data <- suppressMessages(
    tm_clean(data = data, token = "words", stopwords = stopwords)
  )

  # Word frequencies, most frequent first
  plot_data <-
    clean_data %>%
    count(word, name = "freq") %>%
    arrange(desc(freq))

  # Never ask for more rows than exist. `seq_len()` is used instead of
  # `1:keep` because `1:0` evaluates to `c(1, 0)`, which would keep the first
  # row when zero rows were requested.
  keep <- min(keep, nrow(plot_data))
  plot_data <- plot_data %>% slice(seq_len(keep))

  if (return == "table") {
    return(plot_data)
  }

  # return == "plot"
  plot_data %>%
    ggplot(aes(label = word, size = freq)) +
    ggwordcloud::geom_text_wordcloud(rm_outside = TRUE, area_corr = 12, ...) +
    theme_minimal()
}
|