wpa/R/tm_wordcloud.R

88 строки
2.8 KiB
R
Исходник Обычный вид История

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
2021-03-31 16:05:23 +03:00
#' @title Generate a wordcloud with meeting subject lines
2020-10-27 00:21:24 +03:00
#'
#' @description
#' Generate a wordcloud from the meeting subject lines in a Meeting Query
#' dataset.
#' This is a sub-function that feeds into `meeting_tm_report()`.
#'
#' @details
2021-03-31 16:05:23 +03:00
#' Uses the 'ggwordcloud' package for the underlying implementation, thus
#' returning a 'ggplot' object. Additional layers can be added onto the plot
#' using a ggplot `+` syntax.
2020-10-27 00:21:24 +03:00
#' It is recommended not to display more than 100 words in a word cloud.
#'
#' @details
#' This function uses `tm_clean()` as the underlying data wrangling function.
#' There is an option to remove stopwords by passing a data frame into the
#' `stopwords` argument.
2020-10-27 00:21:24 +03:00
#'
#' @param data A Meeting Query dataset in the form of a data frame.
#' @param stopwords A single-column data frame labelled `'word'` containing
#' custom stopwords to remove.
2020-10-27 00:21:24 +03:00
#' @param seed A single numeric value passed to `set.seed()` to seed random
#'   number generation. Defaults to 100.
#' @param keep A single numeric value specifying the maximum number of words
#'   to keep. Defaults to 100.
2021-03-31 16:05:23 +03:00
#' @param return String specifying what to return. This must be one of the
#' following strings:
#' - `"plot"`
#' - `"table"`
#'
#' See `Value` for more information.
#' @param ... Additional parameters to be passed to
#' `ggwordcloud::geom_text_wordcloud()`
2020-10-27 00:21:24 +03:00
#'
2021-03-31 16:05:23 +03:00
#' @return
#' A different output is returned depending on the value passed to the `return`
#' argument:
#' - `"plot"`: 'ggplot' object containing a word cloud.
#' - `"table"`: data frame returning the data used to generate the word cloud.
#'
2020-10-27 00:21:24 +03:00
#' @import dplyr
#' @examples
#' tm_wordcloud(mt_data, keep = 30)
#'
#' @family Text-mining
#'
#' @export
tm_wordcloud <- function(data,
                         stopwords = NULL,
                         seed = 100,
                         keep = 100,
                         return = "plot",
                         ...){

  # Fail fast on an unsupported `return` value, before any text processing
  # is performed.
  if(!(return %in% c("plot", "table"))){
    stop("Please enter a valid input for `return`.")
  }

  # Seeds R's global random number generator. This is a deliberate side
  # effect that persists after the function returns.
  set.seed(seed)

  # Tokenise into single words via `tm_clean()`, forwarding any custom
  # stopwords; messages emitted by `tm_clean()` are silenced.
  clean_data <- suppressMessages(
    tm_clean(data = data, token = "words", stopwords = stopwords)
  )

  # Word frequency table, most frequent words first
  plot_data <-
    clean_data %>%
    count(word, name = "freq") %>%
    arrange(desc(freq))

  # Never request more rows than are available
  if(nrow(plot_data) < keep){
    keep <- nrow(plot_data)
  }

  # `seq_len()` rather than `1:keep`: when `keep` is 0, `1:0` evaluates to
  # `c(1, 0)` and would keep the first row instead of returning no rows.
  plot_data <- plot_data %>% slice(seq_len(keep))

  if(return == "table"){
    return(plot_data)
  }

  # return == "plot": word cloud with text size mapped to word frequency.
  # `...` is forwarded to `ggwordcloud::geom_text_wordcloud()`.
  plot_data %>%
    ggplot(aes(label = word, size = freq)) +
    ggwordcloud::geom_text_wordcloud(rm_outside = TRUE, ...) +
    scale_size_area(max_size = 15) +
    theme_minimal()
}