wpa/R/create_boxplot.R

160 строки
5.1 KiB
R

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
#' @title Box Plot for any metric
#'
#' @description
#' Analyzes a selected metric and returns a a 'fizzy' scatter plot by default.
#' Additional options available to return a table with distribution elements.
#'
#' @details
#' This is a general purpose function that powers all the functions
#' in the package that produce box plots.
#'
#' @param data A Standard Person Query dataset in the form of a data frame.
#' @param metric Character string containing the name of the metric,
#' e.g. "Collaboration_hours"
#' @param hrvar HR Variable by which to split metrics. Accepts a character vector, defaults to "Organization" but accepts any character vector, e.g. "LevelDesignation"
#' @param mingroup Numeric value setting the privacy threshold / minimum group size, defaults to 5.
#' @param return Character vector specifying what to return, defaults to "plot".
#' Valid inputs are "plot" and "table".
#'
#' @import dplyr
#' @import ggplot2
#' @import reshape2
#' @import scales
#' @importFrom stats median
#' @importFrom stats sd
#'
#' @family General
#'
#' @examples
#' ## Create a fizzy plot for Work Week Span by Level Designation
#' create_boxplot(sq_data, metric = "Workweek_span", hrvar = "LevelDesignation", return = "plot")
#'
#' ## Create a summary statistics table for Work Week Span by Organization
#' create_boxplot(sq_data, metric = "Workweek_span", hrvar = "Organization", return = "table")
#'
#' ## Create a fizzy plot for Collaboration Hours by Level Designation
#' create_boxplot(sq_data, metric = "Collaboration_hours", hrvar = "LevelDesignation", return = "plot")
#' @export
create_boxplot <- function(data,
metric,
hrvar = "Organization",
mingroup = 5,
return = "plot") {
## Check inputs
required_variables <- c("Date",
metric,
"PersonId")
## Error message if variables are not present
## Nothing happens if all present
data %>%
check_inputs(requirements = required_variables)
## Handling NULL values passed to hrvar
if(is.null(hrvar)){
data <- totals_col(data)
hrvar <- "Total"
}
## Clean metric name
clean_nm <- us_to_space(metric)
plot_data <-
data %>%
rename(group = !!sym(hrvar)) %>% # Rename HRvar to `group`
group_by(PersonId, group) %>%
summarise(!!sym(metric) := mean(!!sym(metric))) %>%
ungroup() %>%
left_join(data %>%
rename(group = !!sym(hrvar)) %>%
group_by(group) %>%
summarise(Employee_Count = n_distinct(PersonId)),
by = "group") %>%
filter(Employee_Count >= mingroup)
## Get max value
max_point <- max(plot_data[[metric]]) * 1.2
plot_legend <-
plot_data %>%
group_by(group) %>%
summarize(Employee_Count = first(Employee_Count)) %>%
mutate(Employee_Count = paste("n=",Employee_Count))
## summary table
summary_table <-
plot_data %>%
select(group, tidyselect::all_of(metric)) %>%
group_by(group) %>%
summarise(mean = mean(!!sym(metric)),
median = median(!!sym(metric)),
sd = sd(!!sym(metric)),
min = min(!!sym(metric)),
max = max(!!sym(metric)),
range = max - min,
n = n())
## group order
group_ord <-
summary_table %>%
arrange(desc(mean)) %>%
pull(group)
plot_object <-
plot_data %>%
mutate(group = factor(group, levels = group_ord)) %>%
ggplot(aes(x = group, y = !!sym(metric))) +
geom_boxplot(color = "#578DB8") +
ylim(0, max_point) +
annotate("text", x = plot_legend$group, y = 0, label = plot_legend$Employee_Count) +
scale_x_discrete(labels = scales::wrap_format(10)) +
theme_classic() +
theme(axis.text=element_text(size=12),
axis.text.x = element_text(angle = 30, hjust = 1),
plot.title = element_text(color="grey40", face="bold", size=18),
plot.subtitle = element_text(size=14),
legend.position = "top",
legend.justification = "right",
legend.title=element_text(size=14),
legend.text=element_text(size=14)) +
labs(title = clean_nm,
subtitle = paste("Distribution of",
tolower(clean_nm),
"by",
camel_clean(hrvar))) +
xlab(hrvar) +
ylab(paste("Average", clean_nm)) +
labs(caption = extract_date_range(data, return = "text"))
if(return == "table"){
summary_table %>%
as_tibble() %>%
return()
} else if(return == "plot"){
return(plot_object)
} else if(return == "data"){
plot_data %>%
mutate(group = factor(group, levels = group_ord)) %>%
arrange(desc(group))
} else {
stop("Please enter a valid input for `return`.")
}
}