wpa/R/hrvar_count_all.R

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

#' @title Create count of distinct fields and percentage of employees with NAs for all HR variables
#'
#' @description
#' This function enables you to create a summary table to validate organizational data. This table will provide a summary of the data found
#' in the WpA Sources page.
#' This function will return a summary table with the count of distinct fields per HR attribute and the percentage of
#' missing (NA) values for that attribute.
#' See hrvar_count function for more detail on the specific HR attribute of interest.
#'
#' @param data A Standard Query dataset in the form of a data frame.
#' @param n_var Number of HR variables to include in the table as rows. Only used when `return = "table"`.
#' Default is set to 50 HR variables.
#' @param return String to specify what to return. Valid values are "message" (default), "text" and "table".
#' @param threshold The max number of unique values allowed for any attribute. Attributes with this many
#' unique values or more are flagged. Default is 100.
#' @param maxna The max percentage of NAs allowable for any column, computed over the rows of `data`.
#' Attributes with this percentage of NAs or more are flagged. Default is 20.
#'
#' @import dplyr
#'
#' @family Data Validation
#'
#' @return
#' By default (`return = "message"`), prints a diagnostic message indicating which attributes are beyond the specified thresholds.
#' When 'text' is passed, the same diagnostic is returned as a single string, with one line per finding.
#' When 'table' is passed, a summary table listing the number of distinct fields, the number of NAs and the percentage of NAs
#' for the specified number of HR attributes will be returned.
#' Any other value passed to `return` results in an error.
#'
#' @export
hrvar_count_all <- function(data,
                            n_var = 50,
                            return = "message",
                            threshold = 100,
                            maxna = 20
                            ){

  ## Character vector of HR attributes
  extracted_chr <- extract_hr(data, return = "names")

  ## One row per attribute, with columns `n_unique`, `per_na` and `sum_na`
  ## Note: WPA here is used for a matching separator
  results <-
    data %>%
    select(PersonId, extracted_chr) %>%
    summarise_at(vars(extracted_chr),
                 list(`WPAn_unique` = ~n_distinct(., na.rm = TRUE), # Excludes NAs from unique count
                      `WPAper_na` = ~(sum(is.na(.)) / nrow(data) * 100), # % of missing values
                      `WPAsum_na` = ~sum(is.na(.)) # Number of missing values
                      )) %>%
    tidyr::gather(attribute, values) %>%
    tidyr::separate(col = attribute, into = c("attribute", "calculation"), sep = "_WPA") %>%
    tidyr::spread(calculation, values)

  ## Collect every diagnostic line in a character vector and join once at the
  ## end. Starting from an empty vector guarantees the message is always
  ## defined, including when some attribute exceeds `threshold`.
  msg_lines <- character(0)

  if (!any(results$n_unique >= threshold)) {
    msg_lines <- c(
      msg_lines,
      paste("No attributes have greater than", threshold, "unique values.")
    )
  }

  if (!any(results$per_na >= maxna)) {
    msg_lines <- c(
      msg_lines,
      paste("No attributes have more than", maxna, "percent NA values.")
    )
  }

  ## Per-attribute warnings, in row order; `seq_len()` is safe for zero rows.
  ## An `if` without `else` yields NULL, which `c()` and `unlist()` drop.
  flagged <- lapply(seq_len(nrow(results)), function(i) {
    c(
      if (results$n_unique[i] >= threshold) {
        paste0("The attribute '", results$attribute[i], "' has a large amount of unique values. Please check.")
      },
      if (results$per_na[i] >= maxna) {
        paste0("The attribute '", results$attribute[i], "' has a large amount of NA values. Please check.")
      }
    )
  })

  ## `collapse` (not `sep`) is what joins the elements of a vector
  printMessage <- paste(c(msg_lines, unlist(flagged)), collapse = "\n")

  if (return == "table") {

    results <-
      results %>%
      select(Attributes = "attribute",
             `Unique values` = "n_unique",
             `Total missing values` = "sum_na",
             `% missing values` = "per_na")

    return(utils::head(results, n_var))

  } else if (return == "text") {

    printMessage

  } else if (return == "message") {

    message(printMessage)

  } else {

    stop("Error: please check inputs for `return`")

  }
}