2021-07-09 13:55:12 +03:00
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
#' @title
|
|
|
|
#' Count top words in subject lines grouped by a custom attribute
|
|
|
|
#'
|
|
|
|
#' @description `r lifecycle::badge('experimental')`
|
|
|
|
#'
|
|
|
|
#' This function generates a matrix of the top occurring words in meetings,
|
|
|
|
#' grouped by a specified attribute such as organisational attribute, day of the
|
|
|
|
#' week, or hours of the day.
|
|
|
|
#'
|
|
|
|
#' @param data A Meeting Query dataset in the form of a data frame.
|
|
|
|
#' @param hrvar String containing the name of the HR Variable by which to split
|
2021-07-28 16:14:30 +03:00
|
|
|
#' metrics. Note that the prefix `'Organizer_'` or equivalent will be
|
|
|
|
#' required.
|
2021-07-13 15:19:57 +03:00
|
|
|
#' @param mode String specifying what variable to use for grouping subject
|
|
|
|
#' words. Valid values include:
|
|
|
|
#' - `"hours"`
|
|
|
|
#' - `"days"`
|
|
|
|
#' - `NULL` (defaults to `hrvar`)
|
|
|
|
#' When the value passed to `mode` is not `NULL`, the value passed to `hrvar`
|
|
|
|
#' will be discarded and instead be over-written by setting specified in `mode`.
|
2021-07-13 13:57:40 +03:00
|
|
|
#' @param top_n Numeric value specifying the top number of words to show.
|
|
|
|
#' @inheritParams tm_clean
|
2021-07-13 12:27:32 +03:00
|
|
|
#' @param return String specifying what to return. This must be one of the
|
|
|
|
#' following strings:
|
|
|
|
#' - `"plot"`
|
|
|
|
#' - `"table"`
|
|
|
|
#' - `"data"`
|
|
|
|
#'
|
|
|
|
#' See `Value` for more information.
|
2021-07-13 13:57:40 +03:00
|
|
|
#' @param weight String specifying the column name of a numeric variable for
|
|
|
|
#' weighting data, such as `"Invitees"`. The column must contain positive
|
|
|
|
#' integers. Defaults to `NULL`, where no weighting is applied.
|
2021-08-27 16:41:26 +03:00
|
|
|
#' @param stopwords A single-column data frame labelled `'word'` containing
|
|
|
|
#' custom stopwords to remove.
|
2021-07-13 13:57:40 +03:00
|
|
|
#' @param ... Additional parameters to pass to `tm_clean()`.
|
2021-07-13 12:27:32 +03:00
|
|
|
#'
|
|
|
|
#' @return
|
|
|
|
#' A different output is returned depending on the value passed to the `return`
|
|
|
|
#' argument:
|
|
|
|
#' - `"plot"`: 'ggplot' object. A heatmapped grid.
|
|
|
|
#' - `"table"`: data frame. A summary table for the metric.
|
|
|
|
#' - `"data"`: data frame.
|
2021-07-09 13:55:12 +03:00
|
|
|
#'
|
2021-07-13 11:52:27 +03:00
|
|
|
#' @import dplyr
|
2021-07-13 16:56:59 +03:00
|
|
|
#' @import ggplot2
|
2021-07-13 11:52:27 +03:00
|
|
|
#'
|
2021-07-09 13:55:12 +03:00
|
|
|
#' @examples
|
2021-07-20 13:16:13 +03:00
|
|
|
#' # return a heatmap table for words
|
2021-07-13 16:15:52 +03:00
|
|
|
#' mt_data %>% subject_scan(hrvar = "Organizer_Organization")
|
2021-07-09 13:55:12 +03:00
|
|
|
#'
|
2021-07-20 13:16:13 +03:00
|
|
|
#' # return a heatmap table for ngrams
|
|
|
|
#' mt_data %>%
|
|
|
|
#' subject_scan(
|
|
|
|
#' hrvar = "Organizer_Organization",
|
|
|
|
#' token = "ngrams",
|
|
|
|
#' n = 2)
|
|
|
|
#'
|
|
|
|
#' # return raw table format
|
|
|
|
#' mt_data %>% subject_scan(hrvar = "Organizer_Organization", return = "table")
|
|
|
|
#'
|
|
|
|
#' # grouped by hours
|
2021-07-30 11:23:20 +03:00
|
|
|
#' mt_data %>% subject_scan(mode = "hours")
|
2021-07-20 13:16:13 +03:00
|
|
|
#'
|
|
|
|
#' # grouped by days
|
2021-07-30 11:23:20 +03:00
|
|
|
#' mt_data %>% subject_scan(mode = "days")
|
2021-07-20 13:16:13 +03:00
|
|
|
#'
|
2021-07-09 13:55:12 +03:00
|
|
|
#' @export
|
2021-07-13 12:27:32 +03:00
|
|
|
subject_scan <- function(data,
|
|
|
|
hrvar,
|
2021-07-13 15:19:57 +03:00
|
|
|
mode = NULL,
|
2021-07-13 13:57:40 +03:00
|
|
|
top_n = 10,
|
|
|
|
token = "words",
|
|
|
|
return = "plot",
|
|
|
|
weight = NULL,
|
2021-08-27 16:41:26 +03:00
|
|
|
stopwords = NULL,
|
2021-07-13 13:57:40 +03:00
|
|
|
...){
|
|
|
|
|
|
|
|
# weighting -------------------------------------------------------
|
|
|
|
|
|
|
|
if(!is.null(weight)){
|
|
|
|
|
|
|
|
d_weight <- data[[weight]]
|
|
|
|
|
|
|
|
if(any(is.na(d_weight) | d_weight <= 0 | d_weight %% 1 != 0)){
|
|
|
|
|
|
|
|
stop("Please check 'weight' variable.")
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
# duplicate rows according to numeric weight
|
|
|
|
# numeric weight must be an integer
|
|
|
|
data_w <- data[rep(seq_len(nrow(data)), d_weight),]
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
data_w <- data
|
|
|
|
|
|
|
|
}
|
2021-07-09 13:55:12 +03:00
|
|
|
|
2021-07-13 15:19:57 +03:00
|
|
|
# modes -----------------------------------------------------------
|
|
|
|
|
|
|
|
if(is.null(mode)){
|
|
|
|
|
|
|
|
# do nothing
|
|
|
|
|
|
|
|
} else if(mode == "hours"){
|
|
|
|
|
2021-07-28 16:07:42 +03:00
|
|
|
# Default variable in meeting query
|
|
|
|
StartTimeUTC <- NULL
|
|
|
|
|
2021-07-13 15:19:57 +03:00
|
|
|
data_w <-
|
|
|
|
data_w %>%
|
|
|
|
mutate(HourOfDay = substr(StartTimeUTC, start = 1, stop = 2) %>%
|
|
|
|
as.numeric()) %>%
|
|
|
|
mutate(HourOfDay =
|
|
|
|
case_when(HourOfDay > 19 ~ "After 7PM",
|
|
|
|
HourOfDay >= 17 ~ "5 - 7 PM",
|
|
|
|
HourOfDay >= 14 ~ "2 - 5 PM",
|
|
|
|
HourOfDay >= 11 ~ "11AM - 2 PM",
|
|
|
|
HourOfDay >= 9 ~ "9 - 11 AM",
|
|
|
|
TRUE ~ "Before 9 AM"
|
|
|
|
) %>%
|
|
|
|
factor(
|
|
|
|
levels = c(
|
|
|
|
"Before 9 AM",
|
|
|
|
"9 - 11 AM",
|
|
|
|
"11AM - 2 PM",
|
|
|
|
"2 - 5 PM",
|
|
|
|
"5 - 7 PM",
|
|
|
|
"After 7PM"
|
|
|
|
)
|
|
|
|
))
|
|
|
|
|
|
|
|
hrvar <- "HourOfDay"
|
|
|
|
|
|
|
|
} else if(mode == "days"){
|
|
|
|
|
2021-07-28 16:07:42 +03:00
|
|
|
# Variable in meeting data
|
|
|
|
StartDate <- NULL
|
|
|
|
|
2021-07-13 15:19:57 +03:00
|
|
|
data_w <-
|
|
|
|
data_w %>%
|
2021-07-20 13:16:13 +03:00
|
|
|
mutate(DayOfWeek = weekdays(StartDate) %>%
|
|
|
|
factor(
|
|
|
|
levels = c(
|
|
|
|
"Sunday",
|
|
|
|
"Monday",
|
|
|
|
"Tuesday",
|
|
|
|
"Wednesday",
|
|
|
|
"Thursday",
|
|
|
|
"Friday",
|
|
|
|
"Saturday"
|
|
|
|
)
|
|
|
|
))
|
|
|
|
|
2021-07-13 15:19:57 +03:00
|
|
|
|
|
|
|
hrvar <- "DayOfWeek"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2021-07-13 11:52:27 +03:00
|
|
|
# long table -------------------------------------------------------
|
|
|
|
|
|
|
|
out_tb_long <-
|
2021-07-13 13:57:40 +03:00
|
|
|
data_w %>%
|
2021-07-09 13:55:12 +03:00
|
|
|
group_split(!!sym(hrvar)) %>%
|
|
|
|
purrr::map(function(x){
|
|
|
|
|
|
|
|
dow <- x[[hrvar]][1]
|
|
|
|
|
2021-08-27 16:41:26 +03:00
|
|
|
long_t <- tm_clean(
|
|
|
|
x,
|
|
|
|
token = token,
|
|
|
|
stopwords = stopwords,
|
|
|
|
...) %>%
|
2021-07-13 13:57:40 +03:00
|
|
|
filter(!is.na(word))
|
2021-07-09 13:55:12 +03:00
|
|
|
|
|
|
|
long_t %>%
|
|
|
|
count(word) %>%
|
|
|
|
arrange(desc(n)) %>%
|
2021-07-28 16:07:42 +03:00
|
|
|
utils::head(top_n) %>%
|
2021-07-13 11:52:27 +03:00
|
|
|
mutate(group = dow)
|
|
|
|
}) %>%
|
|
|
|
bind_rows()
|
|
|
|
|
|
|
|
# wide table -------------------------------------------------------
|
|
|
|
|
|
|
|
out_tb_wide <-
|
|
|
|
out_tb_long %>%
|
|
|
|
group_split(group) %>%
|
|
|
|
purrr::map(function(x){
|
|
|
|
|
|
|
|
dow <- x[["group"]][1]
|
|
|
|
|
|
|
|
x %>%
|
2021-07-09 13:55:12 +03:00
|
|
|
rename(
|
|
|
|
!!sym(paste0(dow, "_word")) := "word",
|
|
|
|
!!sym(paste0(dow, "_n")) := "n"
|
2021-07-13 11:52:27 +03:00
|
|
|
) %>%
|
|
|
|
select(-group)
|
2021-07-09 13:55:12 +03:00
|
|
|
}) %>%
|
2021-07-09 14:40:09 +03:00
|
|
|
bind_cols()
|
2021-07-09 13:55:12 +03:00
|
|
|
|
2021-07-13 12:27:32 +03:00
|
|
|
# return simple table -----------------------------------------------
|
|
|
|
|
2021-07-09 13:55:12 +03:00
|
|
|
out_simple <-
|
2021-07-13 12:27:32 +03:00
|
|
|
out_tb_wide %>%
|
2021-07-09 13:55:12 +03:00
|
|
|
select(-ends_with("_n")) %>%
|
2021-07-13 16:56:59 +03:00
|
|
|
purrr::set_names(nm = gsub(pattern = "_word", replacement = "",
|
2021-07-09 13:55:12 +03:00
|
|
|
x = names(.)))
|
|
|
|
|
2021-07-13 12:27:32 +03:00
|
|
|
# return chunk -------------------------------------------------------
|
|
|
|
|
|
|
|
if(return == "plot"){
|
|
|
|
|
|
|
|
out_tb_long %>%
|
|
|
|
mutate(n = maxmin(n)) %>%
|
|
|
|
arrange(desc(n)) %>%
|
|
|
|
group_by(group) %>%
|
|
|
|
mutate(id = 1:n()) %>%
|
|
|
|
ungroup() %>%
|
|
|
|
ggplot(aes(x = group, y = id)) +
|
|
|
|
geom_tile(aes(fill = n)) +
|
2021-07-13 15:19:57 +03:00
|
|
|
geom_text(aes(label = word), size = 3) +
|
2021-07-13 12:27:32 +03:00
|
|
|
scale_fill_gradient2(low = rgb2hex(7, 111, 161),
|
|
|
|
mid = rgb2hex(241, 204, 158),
|
|
|
|
high = rgb2hex(216, 24, 42),
|
|
|
|
midpoint = 0.5,
|
|
|
|
breaks = c(0, 0.5, 1),
|
2021-07-20 13:16:13 +03:00
|
|
|
labels = c("Low", "", "High"),
|
|
|
|
limits = c(0, 1),
|
|
|
|
name = "Frequency") +
|
2021-07-13 12:27:32 +03:00
|
|
|
scale_x_discrete(position = "top") +
|
|
|
|
scale_y_reverse() +
|
|
|
|
theme_wpa_basic() +
|
|
|
|
theme(axis.text.x = element_text(angle = 45, hjust = 0),
|
|
|
|
plot.title = element_text(color="grey40", face="bold", size=20),
|
|
|
|
axis.text.y = element_blank()) +
|
|
|
|
labs(
|
2021-07-28 13:44:59 +03:00
|
|
|
title = "Top terms",
|
|
|
|
subtitle = paste("By", camel_clean(hrvar)),
|
|
|
|
y = "Top terms by frequency in Subject",
|
|
|
|
x = ""
|
2021-07-13 12:27:32 +03:00
|
|
|
)
|
|
|
|
|
|
|
|
} else if(return == "table"){
|
|
|
|
|
|
|
|
out_simple
|
|
|
|
|
2021-07-28 16:14:30 +03:00
|
|
|
} else if(return == "data"){
|
|
|
|
|
|
|
|
out_tb_wide
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
stop("Invalid input to return.")
|
|
|
|
|
2021-07-13 12:27:32 +03:00
|
|
|
}
|
2021-07-09 13:55:12 +03:00
|
|
|
}
|
2021-07-20 13:03:59 +03:00
|
|
|
|
|
|
|
#' @rdname subject_scan
|
|
|
|
#' @export
|
|
|
|
tm_scan <- subject_scan
|