wpa/R/personas_hclust.R

151 строка
4.0 KiB
R

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
#' @title Create hierarchical clusters of selected metrics using a Person query
#'
#' @description
#' `r lifecycle::badge('questioning')`
#' Apply hierarchical clustering to selected metrics. Person averages are computed prior to clustering.
#' The hierarchical clustering uses cosine distance and the ward.D method
#' of agglomeration.
#'
#'
#' @param data A data frame containing `PersonId` and selected metrics for clustering.
#' @param metrics Character vector containing names of metrics to use for clustering. See examples section.
#' @param k Numeric vector to specify the `k` number of clusters to cut by.
#' @param return Character vector to specify what to return.
#' Valid options include:
#' - "plot": return
#' - "data": return raw data with clusters appended
#' - "table": return summary table for identified clusters
#' - "hclust": return hierarchical model (an `hclust` object) generated by the function.
#'
#' @import dplyr
#' @import tidyselect
#' @import ggplot2
#' @importFrom proxy dist
#' @importFrom stats hclust
#' @importFrom stats rect.hclust
#' @importFrom stats cutree
#' @importFrom tidyr replace_na
#'
#' @examples
#' # Return plot
#' personas_hclust(sq_data,
#' metrics = c("Collaboration_hours", "Workweek_span"),
#' k = 4)
#'
#' # Return summary table
#' personas_hclust(sq_data,
#' metrics = c("Collaboration_hours", "Workweek_span"),
#' k = 4,
#' return = "table")
#'
#' \donttest{
#' # Return data with clusters appended
#' personas_hclust(sq_data,
#' metrics = c("Collaboration_hours", "Workweek_span"),
#' k = 4,
#' return = "data")
#' }
#'
#' @family Work Patterns
#'
#' @export
personas_hclust <- function(data,
metrics,
k = 4,
return = "plot"){
## Use names for matching
input_var <- metrics
## transform the data for clusters
data_cluster <-
data %>%
select(PersonId, input_var) %>%
group_by(PersonId) %>%
summarise_at(vars(input_var), ~mean(., na.rm = TRUE), .groups = "drop")
## Run hclust
h_clust <-
data_cluster %>%
select(input_var) %>%
proxy::dist(method = "cosine") %>%
stats::hclust(method = "ward.D")
## Cut tree
cuts <- stats::cutree(h_clust, k = k)
## Bind cut tree to data frame
data_final <-
data_cluster%>%
select(PersonId) %>%
cbind("cluster" = cuts) %>%
left_join(data, by = "PersonId")
## Return
if(return == "data"){
return(data_final)
} else if(return == "table"){
## Count table
count_tb <-
data_final %>%
group_by(cluster) %>%
summarise(n = n()) %>%
mutate(prop = n / sum(n))
## Summary statistics
sums_tb <-
data_final %>%
group_by(cluster) %>%
summarise_if(is.numeric,function(x) round(mean(x),1))
count_tb %>%
left_join(sums_tb, by = "cluster") %>%
return()
} else if(return =="plot"){
## Unique person count
## Print count string
count_tb_p <-
data_final %>%
hrvar_count(hrvar = "cluster", return = "table") %>%
arrange(cluster) %>%
mutate(print_str = paste0("cluster ", cluster, " = ", n)) %>%
pull(print_str) %>%
paste(collapse = "; ")
## Use keymetrics_scan() to visualize clusters
data_final %>%
mutate(cluster = factor(cluster)) %>%
keymetrics_scan(hrvar = "cluster") +
labs(title = "Key metrics by personas clusters",
caption = paste(count_tb_p, "\n",
extract_date_range(data, return = "text")))
} else if(return == "hclust"){
return(h_clust)
} else {
stop("Invalid input for `return`.")
}
}