2020-11-11 01:02:22 +03:00
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
|
|
|
# --------------------------------------------------------------------------------------------
|
|
|
|
|
2020-10-27 00:21:24 +03:00
|
|
|
#' @title Convert a numeric variable for hours into categorical
|
|
|
|
#'
|
|
|
|
#' @description
|
2021-03-31 13:12:39 +03:00
|
|
|
#' Supply a numeric variable, e.g. `Collaboration_hours`, and return a character
|
|
|
|
#' vector.
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
2021-03-04 21:33:28 +03:00
|
|
|
#' @details
|
|
|
|
#' This is used within `create_dist()` for numeric to categorical conversion.
|
|
|
|
#'
|
2020-10-27 00:21:24 +03:00
|
|
|
#' @param metric A numeric variable representing hours.
|
2021-03-31 13:12:39 +03:00
|
|
|
#' @param cuts A numeric vector of minimum length 3 to represent the
|
|
|
|
#' cut points required. The minimum and maximum values provided in the vector
|
|
|
|
#' are inclusive.
|
2020-11-18 16:59:41 +03:00
|
|
|
#' @param unit String to specify the unit of the labels. Defaults to "hours".
|
2021-04-20 18:16:44 +03:00
|
|
|
#' @param lbound Numeric. Specifies the lower bound (inclusive) value for the
|
|
|
|
#' minimum label. Defaults to 0.
|
|
|
|
#' @param ubound Numeric. Specifies the upper bound (inclusive) value for the
|
|
|
|
#' maximum label. Defaults to 100.
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
2021-03-04 21:33:28 +03:00
|
|
|
#' @family Support
|
2020-10-27 00:21:24 +03:00
|
|
|
#'
|
2021-03-31 13:12:39 +03:00
|
|
|
#' @return
|
|
|
|
#' Character vector representing a converted categorical variable, appended
|
|
|
|
#' with the label of the unit. See `examples` for more information.
|
|
|
|
#'
|
2020-10-27 00:21:24 +03:00
|
|
|
#' @examples
|
2021-02-06 16:49:14 +03:00
|
|
|
#' # Direct use
|
2020-10-27 00:21:24 +03:00
|
|
|
#' cut_hour(1:30, cuts = c(15, 20, 25))
|
|
|
|
#'
|
2021-02-06 16:49:14 +03:00
|
|
|
#' # Use on a query
|
2020-10-27 00:21:24 +03:00
|
|
|
#' cut_hour(sq_data$Collaboration_hours, cuts = c(10, 15, 20))
|
|
|
|
#'
|
|
|
|
#' @export
|
|
|
|
|
2021-04-20 18:16:44 +03:00
|
|
|
cut_hour <- function(metric,
|
|
|
|
cuts,
|
|
|
|
unit = "hours",
|
|
|
|
lbound = 0,
|
|
|
|
ubound = 100){
|
2020-10-27 00:21:24 +03:00
|
|
|
|
|
|
|
|
2020-11-18 12:57:19 +03:00
|
|
|
cuts <- unique(cuts) # No duplicates allowed
|
|
|
|
ncuts <- length(cuts)
|
2020-10-27 00:21:24 +03:00
|
|
|
|
2021-04-20 13:59:51 +03:00
|
|
|
if(ncuts < 2){
|
|
|
|
stop("Please provide a numeric vector of at least length 2 to `cuts`")
|
2020-11-18 12:57:19 +03:00
|
|
|
}
|
2020-10-27 00:21:24 +03:00
|
|
|
|
2020-11-18 12:57:19 +03:00
|
|
|
# Extract min, max, and middle values
|
|
|
|
mincut <- min(cuts, na.rm = TRUE)
|
|
|
|
maxcut <- max(cuts, na.rm = TRUE)
|
2021-04-20 18:16:44 +03:00
|
|
|
|
2020-11-18 12:57:19 +03:00
|
|
|
midcut <- cuts[!cuts %in% mincut] # Excludes mincut only
|
|
|
|
midcut_min_1 <- cuts[match(midcut, cuts) - 1] # one value smaller
|
2021-04-20 18:16:44 +03:00
|
|
|
mincut_2 <- midcut_min_1[[1]] # second smallest cut
|
|
|
|
|
|
|
|
# Min and max values of `metric`
|
|
|
|
minval <- min(metric, na.rm = TRUE)
|
|
|
|
maxval <- max(metric, na.rm = TRUE)
|
|
|
|
|
|
|
|
# Warn if smaller lbound or larger ubound
|
|
|
|
if(minval < lbound){
|
|
|
|
warning("`lbound` does not capture the smallest value in `metric`. ",
|
|
|
|
"Values smaller than `lbound` will be classified as NA. ",
|
|
|
|
"Adjusting `lbound` is recommended.")
|
|
|
|
}
|
|
|
|
|
|
|
|
if(maxval > ubound){
|
|
|
|
warning("`ubound` does not capture the largest value in `metric`. ",
|
|
|
|
"Values larger than `ubound` will be classified as NA. ",
|
|
|
|
"Adjusting `ubound` is recommended.")
|
|
|
|
}
|
|
|
|
|
|
|
|
# Take smallest or largest of both values
|
|
|
|
lbound <- min(c(mincut, lbound), na.rm = TRUE)
|
|
|
|
ubound <- max(c(maxcut, ubound), na.rm = TRUE)
|
2020-11-18 12:57:19 +03:00
|
|
|
|
|
|
|
# Individual labels
|
2020-11-18 16:59:41 +03:00
|
|
|
label_mincut <- paste0("< ", mincut, " ", unit)
|
|
|
|
label_maxcut <- paste0(maxcut, "+ ", unit)
|
|
|
|
label_midcut <- paste0(midcut_min_1, " - ", midcut, " ", unit)
|
2020-11-18 12:57:19 +03:00
|
|
|
|
|
|
|
# All labels
|
2021-04-20 18:16:44 +03:00
|
|
|
all_labels <- unique(c(label_mincut, label_midcut, label_maxcut))
|
|
|
|
|
|
|
|
# If `lbound` or `ubound` conflict with cuts
|
|
|
|
if(lbound == mincut){
|
|
|
|
|
|
|
|
all_labels <- all_labels[all_labels != label_mincut]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if(ubound == maxcut){
|
|
|
|
|
|
|
|
all_labels <- all_labels[all_labels != label_maxcut]
|
|
|
|
|
|
|
|
}
|
2020-11-18 12:57:19 +03:00
|
|
|
|
2021-04-20 18:16:44 +03:00
|
|
|
# Debugging chunk ---------------------------------------------------------
|
|
|
|
# list(
|
|
|
|
# breaks = unique(c(lbound, cuts, ubound)),
|
|
|
|
# lbound,
|
|
|
|
# ubound,
|
|
|
|
# all_labels
|
|
|
|
# )
|
2020-11-18 12:57:19 +03:00
|
|
|
|
2021-04-20 18:16:44 +03:00
|
|
|
# Return result
|
2020-11-18 12:57:19 +03:00
|
|
|
cut(metric,
|
2021-04-20 18:16:44 +03:00
|
|
|
breaks = unique(c(lbound, cuts, ubound)),
|
2020-11-18 12:57:19 +03:00
|
|
|
include.lowest = TRUE,
|
|
|
|
labels = all_labels)
|
2020-10-27 00:21:24 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|