P_test_integreation

Updating p_test to include T/F for paired Updating period_change to use p_test() to calculate p-values
2021-01-08 08:56:42 -08:00 · 2021-01-08 08:56:42 -08:00 · 661a1e0a6e
--- a/R/p_test.R
+++ b/R/p_test.R
@ -13,6 +13,7 @@
 #' @param outcome A string specifying a binary variable, i.e. can only contain
 #' the values 1 or 0. Used to group the two distributions.
 #' @param behavior A character vector specifying the column to be used as the behavior to test.
+#' @param paired Specify whether the dataset is paired or not. Defaults to TRUE.
 #'
 #' @import dplyr
 #' @import stats
@ -27,7 +28,8 @@

 p_test <- function(data, 
                   outcome, 
-                   behavior){
+                   behavior,
+                   paired = FALSE){
  train <- data %>% 
    filter(!!sym(outcome) == 1 | !!sym(outcome) == 0) %>%
    select(!!sym(outcome), !!sym(behavior)) %>%
@ -37,6 +39,6 @@ p_test <- function(data,
  pos <- train %>% filter(outcome == 1, na.rm=TRUE) %>% select(behavior) 
  neg <- train %>% filter(outcome == 0, na.rm=TRUE) %>% select(behavior) 
  
-  s <- stats::wilcox.test(unlist(pos), unlist(neg), paired = FALSE)
+  s <- stats::wilcox.test(unlist(pos), unlist(neg), paired = paired)
  return(s$p.value)
 }
--- a/period_change.R
+++ b/period_change.R
@ -0,0 +1,297 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+#' @title Plot the distribution of percentage change between periods
+#' of a WpA metric by the number of employees.
+#'
+#' @description
+#' This function also presents the p-value for the null hypothesis
+#'  that the variable has not changed, using a Wilcox signed-rank test.
+#'
+#' @param data Person Query as a dataframe including date column named "Date"
+#' This function assumes the data format is MM/DD/YYYY as is standard in a WpA query output.
+#' @param compvar WpA comparison variable to compare person change before and after
+#'  For example, "Collaboration_hours"
+#' @param before_start Start date of "before" time period in YYYY-MM-DD
+#' @param before_end End date of "before" time period in YYYY-MM-DD
+#' @param after_start Start date of "after" time period in YYYY-MM-DD
+#' @param after_end End date of "after" time period in YYYY-MM-DD
+#' @param return Character vector specifying whether to return plot as Count or Percentage of Employees
+#' Valid inputs include "count" (default), "percentage", and "table".
+#'
+#' @import dplyr
+#' @import reshape2
+#' @import ggplot2
+#' @import scales
+#' @importFrom stats wilcox.test
+#'
+#' @examples
+#' ## Run plot
+#' period_change(sq_data, compvar = "Workweek_span", before_end = "2019-11-16")
+#'
+#' ## Run plot with more specific arguments
+#'  @example
+#' period_change(sq_data,
+#'               compvar = "Workweek_span",
+#'               before_start = "2019-11-03",
+#'               before_end = "2019-11-16",
+#'               after_start = "2019-12-03",
+#'               after_end = "2019-12-16",
+#'               return = "percentage")
+#'
+#' @family Flexible Input
+#' @export
+
+period_change <-
+  function(data,
+           compvar,
+           before_start = min(as.Date(data$Date, "%m/%d/%Y")),
+           before_end,
+           after_start = as.Date(before_end) + 1,
+           after_end = max(as.Date(data$Date, "%m/%d/%Y")),
+           return = "count") {
+
+    ## Check inputs
+    ## Update these column names as per appropriate
+    required_variables <- c("Date",
+                            compvar,
+                            "PersonId")
+    ## Error message if variables are not present
+    ## Nothing happens if all present
+    data %>%
+      check_inputs(requirements = required_variables)
+    
+    daterange_1_start <- as.Date(before_start)
+    daterange_1_end <- as.Date(before_end)
+    daterange_2_start <- as.Date(after_start)
+    daterange_2_end <- as.Date(after_end)
+
+    # Fix dates format for WpA Queries
+     WpA_dataset <- data %>% mutate(Date = as.Date(Date, "%m/%d/%Y"))
+
+
+
+    # Check for dates in data file
+     # if (daterange_1_start < min(WpA_dataset$Date) |
+     #     daterange_1_start > max(WpA_dataset$Date) |
+     #     daterange_1_end < min(WpA_dataset$Date) |
+     #     daterange_1_end > max(WpA_dataset$Date) |
+     #     daterange_2_start < min(WpA_dataset$Date) |
+     #     daterange_2_start > max(WpA_dataset$Date) |
+     #     daterange_2_end < min(WpA_dataset$Date) |
+     #     daterange_2_end > max(WpA_dataset$Date)) {
+     #   stop('Dates not found in dataset')
+     #   geterrmessage()
+     # }
+
+    # Create variable => Period
+    WpA_dataset_table <-
+      WpA_dataset %>% mutate(
+        Period = case_when(
+          Date >= daterange_1_start &
+            Date <= daterange_1_end ~ "Before",
+          Date >= daterange_2_start &
+            Date <= daterange_2_end ~ "After"
+        )
+      ) %>% filter(Period == "Before" | Period == "After")
+
+    
+    # Group data by
+    mydata_table <-
+      WpA_dataset_table %>%
+      select(PersonId, Period, all_of(compvar)) %>%
+      group_by(Period, PersonId) %>%
+      summarise_if(is.numeric, mean, na.rm = TRUE)
+    
+    # Turn to Long
+    data_wide <- tidyr::gather(mydata_table, KPI, value,-Period,-PersonId)
+
+    # Turn to Wide
+    data_final <- tidyr::spread(data_wide, Period, value)
+
+    # Drop nas
+    data_final <- data_final %>% tidyr::drop_na()
+    data_wide <- data_final %>% tidyr::pivot_longer(c(After, Before), names_to = "Period", values_to = "value")
+    
+    # Calculate total means and differences
+    means_table <- mydata_table %>%
+      group_by(Period) %>% 
+      summarize(mean = mean(!!sym(compvar))) %>%
+      spread(., Period, mean) %>% 
+      mutate(diff = After - Before) %>% 
+      mutate(perc_diff = diff/Before)
+      
+
+    # run Wilcox Signed-Rank test
+    test_set <- data_wide %>% mutate(X = ifelse(Period == "Before", 0, 1))
+    res <- p_test(test_set, outcome = "X", behavior = "value", paired = TRUE)
+    p_val <- signif(res, digits = 3)
+
+    data_summary <- means_table %>%
+      mutate(pval = p_val)
+    
+    # Calculate change between periods
+    data_final <- data_final %>% mutate(delta = After - Before)
+
+    # Calculate percent change between periods
+    data_final <-
+      data_final %>% mutate(perc_diff = (After - Before) / Before)
+
+    # Drop NA and Errors
+    data_final <- data_final %>% tidyr::drop_na(delta)
+    data_final <-
+      data_final %>% filter(Before > 0) #filters out people who joined the organization after the 'Before' period
+
+    # Group 100%+ together so you can see how many are over 100% in a plot data set
+    data_plot <- data_final
+    data_plot$perc_diff[which(data_plot$perc_diff > 1)] <- 1.01
+
+    # Categorize perc_diff by percentages
+    data_plot <-
+      data_plot %>% mutate(Mybins = ggplot2::cut_width(
+        perc_diff,
+        width = 0.1,
+        boundary = 0,
+        labels = F
+      ))
+
+    # create x-axis labels and replace Mybins with label
+    labels = c(paste(seq(-100, 100, 10), "%", sep = ""), "100%+")
+    x_axis <- vector("character", 21)
+    for (i in 1:20) {
+      x_axis[i] <- paste(labels[i], "-", labels[i + 1])
+    }
+    x_axis[21] <- labels[21]
+    data_plot <- data_plot %>% mutate(Mybins = x_axis[Mybins])
+    data_plot <-
+      data_plot %>%
+      mutate(Mybins = factor(Mybins, levels = x_axis))
+
+    #calculate percentage of employees in each bin
+    data_legend <-
+      data_plot %>%
+      group_by(Mybins) %>%
+      summarize(Employee_Count = n_distinct(PersonId))
+
+    data_legend <- data.frame(data_legend)
+
+    data_legend <-
+      data_legend %>%
+      mutate(Employee_Counts = paste("n=", Employee_Count))
+
+    data_legend <-
+      data_legend %>%
+      mutate(Employee_perc = Employee_Count / sum(data_legend$Employee_Count))
+
+    # create summary table for table output
+    summary_table <-
+      data_legend %>%
+      select(Mybins, Employee_Count, Employee_perc)
+
+    if (return == "count") {
+      # Plot barplot of Count of Employees with % change
+      data_plot %>%
+        ggplot(aes(x = Mybins)) +
+        geom_bar(fill = "#203864") +
+        scale_x_discrete(name = "Percent change") +
+        scale_y_continuous(name = "Employee count") +
+        theme_wpa_basic() +
+        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+        labs(
+          title = paste("Number of employees with change in", compvar),
+          subtitle = paste(
+            "Period 1:",
+            daterange_1_start,
+            "to",
+            daterange_1_end,
+            "and Period 2:",
+            daterange_2_start,
+            "to",
+            daterange_2_end
+          )
+        ) +
+        labs(
+          caption = paste(
+            "Total employees =",
+            sum(data_legend$Employee_Count),
+            "| Data from",
+            daterange_1_start,
+            "to",
+            daterange_1_end,
+            "and",
+            daterange_2_start,
+            "to",
+            daterange_2_end,
+            "| p =",
+            p_val
+          )
+        )
+
+    } else if (return == "percentage") {
+      # Create histogram by % of employees changing
+      data_legend %>%
+        ggplot(aes(x = Mybins, y = Employee_perc)) +
+        geom_col( fill = "#203864") +
+        scale_x_discrete(name = "Percent change") +
+        scale_y_continuous(name = "Percentage of measured employees",
+                           labels = scales::percent_format(accuracy = 1)) +
+        annotate(
+          "text",
+          x = data_legend$Mybins,
+          y = -0.005,
+          label = data_legend$Employee_Counts,
+          size = 3
+        ) +
+        theme_wpa_basic() +
+        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+        labs(
+          title = paste("Percentage of employees with change in", compvar),
+          subtitle = paste(
+            "Period 1:",
+            daterange_1_start,
+            "to",
+            daterange_1_end,
+            "and Period 2:",
+            daterange_2_start,
+            "to",
+            daterange_2_end
+          )
+        ) +
+        labs(
+          caption = paste(
+            "Total employees =",
+            sum(data_legend$Employee_Count),
+            "| Data from",
+            daterange_1_start,
+            "to",
+            daterange_1_end,
+            "and",
+            daterange_2_start,
+            "to",
+            daterange_2_end,
+            "| p =",
+            p_val
+          )
+        )
+    } else if(return == "table"){
+      summary_table %>%
+        mutate_at("Employee_perc", ~round(. * 100, 1)) %>%
+        as_tibble() %>%
+        return()
+
+    } else if(return == "summary"){
+      data_summary %>% 
+        rename(mean_after = After, mean_before = Before) %>%
+        select(mean_before, mean_after, diff, perc_diff, pval) %>%
+        return()
+      # data_wide %>% return()
+      # as_tibble(mean_before, mean_after, total_diff, total_perc_diff, p_val) %>% return()
+        
+    } else {
+
+      stop("Please enter a valid input for `return`, either count, percentage, or table.")
+
+    }
+  }