# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

#' @title Create count of distinct fields and percentage of employees with NAs for all HR variables
#'
#' @description
#' This function enables you to create a summary table to validate organizational data. This table will provide a summary of the data found
#' in the WpA Sources page.
#' This function will return a summary table with the count of distinct fields per HR attribute and the percentage of
#' employees with NAs for that attribute.
#' See the `hrvar_count()` function for more detail on a specific HR attribute of interest.
#'
#' @param data A Standard Query dataset in the form of a data frame.
#' @param n_var Maximum number of HR variables to include in the report as rows. Default is set to 50 HR variables.
#' @param return String to specify what to return. Valid values are "message" (default), "text", and "table".
#' @param threshold The max number of unique values allowed for any attribute. Default is 100.
#' @param maxna The max percentage of NAs allowable for any column. Default is 20.
#'
#' @import dplyr
#'
#' @family Data Validation
#'
#' @return
#' The output depends on the value passed in `return`.
#' When 'text' is passed, the validation message is returned as a character string.
#' When 'table' is passed, a summary table listing the number of distinct fields and the number and percentage of NAs for the specified number of HR attributes will be returned.
#' When 'message' is passed (the default), outputs a message indicating which values are beyond the specified thresholds.
#'
#' @export
hrvar_count_all <- function(data,
                            n_var = 50,
                            return = "message",
                            threshold = 100,
                            maxna = 20) {
  # Summarises every HR attribute reported by `extract_hr()` (number of
  # distinct values, number and percentage of NAs) and builds a validation
  # report naming the attributes that reach `threshold` unique values or
  # `maxna` percent missing values.
  #
  # Depending on `return` the function yields:
  #   "table"   - the first `n_var` rows of the summary table,
  #   "text"    - the report as a single newline-separated string,
  #   "message" - nothing visible; the report is emitted via `message()`.
  # Any other value raises an error.

  ## Character vector of HR attributes
  extracted_chr <- extract_hr(data, return = "names")

  ## One row per attribute with the columns `n_unique`, `per_na`, `sum_na`.
  ## Note: WPA here is used for a matching separator between the attribute
  ## name and the name of the calculation in the summarised column names.
  ## NOTE(review): with a single HR attribute `summarise_at()` may name the
  ## columns without the attribute prefix, which would defeat `separate()`
  ## - confirm against the dplyr version in use.
  results <-
    data %>%
    select(PersonId, extracted_chr) %>%
    summarise_at(
      vars(extracted_chr),
      list(
        `WPAn_unique` = ~n_distinct(., na.rm = TRUE), # Excludes NAs from unique count
        `WPAper_na` = ~(sum(is.na(.)) / nrow(data) * 100), # % of missing values
        `WPAsum_na` = ~sum(is.na(.)) # Number of missing values
      )
    ) %>%
    tidyr::gather(attribute, values) %>%
    tidyr::separate(
      col = attribute,
      into = c("attribute", "calculation"),
      sep = "_WPA"
    ) %>%
    tidyr::spread(calculation, values)

  ## Flag the attributes at or above each limit. A missing statistic (e.g.
  ## NaN from 0 / 0 on empty data) counts as "not flagged" rather than
  ## propagating NA into the conditions below.
  too_many_unique <- !is.na(results$n_unique) & results$n_unique >= threshold
  too_many_na <- !is.na(results$per_na) & results$per_na >= maxna

  ## Headline lines, present only when nothing breaches the given limit.
  ## A skipped `if` contributes NULL, which `c()` drops.
  summary_lines <- c(
    if (!any(too_many_unique)) {
      paste("No attributes have greater than", threshold, "unique values.")
    },
    if (!any(too_many_na)) {
      paste("No attributes have more than", maxna, "percent NA values.")
    }
  )

  ## Per-attribute warnings; NA marks "no warning for this attribute".
  unique_lines <- ifelse(
    too_many_unique,
    paste0(
      "The attribute '", results$attribute,
      "' has a large amount of unique values. Please check."
    ),
    NA_character_
  )
  na_lines <- ifelse(
    too_many_na,
    paste0(
      "The attribute '", results$attribute,
      "' has a large amount of NA values. Please check."
    ),
    NA_character_
  )

  ## `rbind()` + `c()` interleaves the two vectors so that, per attribute,
  ## the unique-values warning precedes the NA warning.
  attribute_lines <- c(rbind(unique_lines, na_lines))
  attribute_lines <- attribute_lines[!is.na(attribute_lines)]

  ## Single print message, one statement per line. Building it from a vector
  ## means it is always defined, even when the first headline is skipped.
  printMessage <- paste(c(summary_lines, attribute_lines), collapse = "\n")

  if (return == "table") {
    results <-
      results %>%
      select(
        Attributes = "attribute",
        `Unique values` = "n_unique",
        `Total missing values` = "sum_na",
        `% missing values` = "per_na"
      )

    utils::head(results, n_var)
  } else if (return == "text") {
    printMessage
  } else if (return == "message") {
    message(printMessage)
  } else {
    stop("Error: please check inputs for `return`")
  }
}