# wpa/R/hrvar_count_all.R
#
# 111 lines
# 4.2 KiB
# R

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
#' @title Create count of distinct fields and percentage of employees with NAs for all HR variables
#'
#' @description
#' This function enables you to create a summary table to validate organizational data. This table will provide a summary of the data found
#' in the WpA Sources page.
#' This function will return a summary table with the count of distinct fields per HR attribute and the percentage of
#' employees with NAs for that attribute.
#' See hrvar_count function for more detail on the specific HR attribute of interest.
#'
#' @param data A Standard Query dataset in the form of a data frame.
#' @param n_var number of HR variables to include in report as rows. Default is set to 50 HR variables.
#' Only applies when `return` is 'table'.
#' @param return String to specify what to return. Valid values are 'message' (default), 'text' and 'table'.
#' @param threshold The max number of unique values allowed for any attribute. Default is 100.
#' Attributes with a unique count greater than or equal to this value are flagged.
#' @param maxna The max percentage of NAs allowable for any column. Default is 20.
#' Attributes with a percentage of NAs greater than or equal to this value are flagged.
#'
#' @import dplyr
#'
#' @family Data Validation
#'
#' @return
#' When 'message' is passed in `return` (the default), outputs a message indicating which values are beyond the specified thresholds.
#' When 'text' is passed, the same diagnostic text is returned as a single string, with one finding per line.
#' When 'table' is passed, a summary table listing the number of distinct fields and percentage of NAs for the specified number of HR attributes will be returned.
#' Any other value of `return` raises an error.
#'
#' @export
hrvar_count_all <- function(data,
                            n_var = 50,
                            return = "message",
                            threshold = 100,
                            maxna = 20
                            ){

  ## Character vector of HR attributes
  extracted_chr <- extract_hr(data, return = "names")

  ## One row per HR attribute, one column per calculation.
  ## Note: WPA here is used for a matching separator, i.e. the summarised
  ## columns are named `<attribute>_WPA<calculation>` and then split on "_WPA".
  results <-
    data %>%
    select(PersonId, extracted_chr) %>%
    summarise_at(vars(extracted_chr),
                 list(`WPAn_unique` = ~n_distinct(., na.rm = TRUE), # Excludes NAs from unique count
                      `WPAper_na` = ~(sum(is.na(.)) / nrow(data) * 100), # % of missing values
                      `WPAsum_na` = ~sum(is.na(.)) # Number of missing values
                      )) %>%
    tidyr::gather(attribute, values) %>%
    tidyr::separate(col = attribute, into = c("attribute", "calculation"), sep = "_WPA") %>%
    tidyr::spread(calculation, values)

  ## Flag attributes at or beyond each limit. Missing comparisons (e.g. a NaN
  ## percentage) are treated as "not flagged" so the `if` conditions below never
  ## receive NA.
  over_unique <- !is.na(results$n_unique) & results$n_unique >= threshold
  over_na <- !is.na(results$per_na) & results$per_na >= maxna

  ## Collect the message line by line. Starting from an empty vector means the
  ## text is always defined, whichever combination of limits is exceeded.
  msg_lines <- character(0)

  if (!any(over_unique)) {
    msg_lines <- c(msg_lines,
                   paste("No attributes have greater than", threshold, "unique values."))
  }

  if (!any(over_na)) {
    msg_lines <- c(msg_lines,
                   paste("No attributes have more than", maxna, "percent NA values."))
  }

  ## seq_len() keeps the loop empty when `results` has zero rows
  for (i in seq_len(nrow(results))) {

    if (over_unique[i]) {
      msg_lines <- c(msg_lines,
                     paste0("The attribute '", results$attribute[i], "' has a large amount of unique values. Please check."))
    }

    if (over_na[i]) {
      msg_lines <- c(msg_lines,
                     paste0("The attribute '", results$attribute[i], "' has a large amount of NA values. Please check."))
    }
  }

  ## Single print message, one finding per line
  printMessage <- paste(msg_lines, collapse = "\n")

  if (return == "table") {

    results <-
      results %>%
      select(Attributes = "attribute",
             `Unique values` = "n_unique",
             `Total missing values` = "sum_na",
             `% missing values` = "per_na")

    return(utils::head(results, n_var))

  } else if (return == "text") {

    printMessage

  } else if (return == "message") {

    message(printMessage)

  } else {

    stop("Error: please check inputs for `return`")

  }
}