Updating p_test to include T/F for paired
Updating period_change to use p_test() to calculate p-values
This commit is contained in:
m-m-powers 2021-01-08 08:56:42 -08:00
Родитель c409acf02f
Коммит 661a1e0a6e
2 изменённых файлов: 301 добавлений и 2 удалений

Просмотреть файл

@ -13,6 +13,7 @@
#' @param outcome A string specifying a binary variable, i.e. can only contain
#' the values 1 or 0. Used to group the two distributions.
#' @param behavior A character vector specifying the column to be used as the behavior to test.
#' @param paired Specify whether the dataset is paired or not. Defaults to TRUE.
#'
#' @import dplyr
#' @import stats
@ -27,7 +28,8 @@
p_test <- function(data,
outcome,
behavior){
behavior,
paired = FALSE){
train <- data %>%
filter(!!sym(outcome) == 1 | !!sym(outcome) == 0) %>%
select(!!sym(outcome), !!sym(behavior)) %>%
@ -37,6 +39,6 @@ p_test <- function(data,
pos <- train %>% filter(outcome == 1, na.rm=TRUE) %>% select(behavior)
neg <- train %>% filter(outcome == 0, na.rm=TRUE) %>% select(behavior)
s <- stats::wilcox.test(unlist(pos), unlist(neg), paired = FALSE)
s <- stats::wilcox.test(unlist(pos), unlist(neg), paired = paired)
return(s$p.value)
}

297
period_change.R Normal file
Просмотреть файл

@ -0,0 +1,297 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
#' @title Plot the distribution of percentage change between periods
#' of a WpA metric by the number of employees.
#'
#' @description
#' This function also presents the p-value for the null hypothesis
#' that the variable has not changed, using a Wilcox signed-rank test.
#'
#' @param data Person Query as a dataframe including date column named "Date"
#' This function assumes the data format is MM/DD/YYYY as is standard in a WpA query output.
#' @param compvar WpA comparison variable to compare person change before and after
#' For example, "Collaboration_hours"
#' @param before_start Start date of "before" time period in YYYY-MM-DD
#' @param before_end End date of "before" time period in YYYY-MM-DD
#' @param after_start Start date of "after" time period in YYYY-MM-DD
#' @param after_end End date of "after" time period in YYYY-MM-DD
#' @param return Character vector specifying whether to return plot as Count or Percentage of Employees
#' Valid inputs include "count" (default), "percentage", and "table".
#'
#' @import dplyr
#' @import reshape2
#' @import ggplot2
#' @import scales
#' @importFrom stats wilcox.test
#'
#' @examples
#' ## Run plot
#' period_change(sq_data, compvar = "Workweek_span", before_end = "2019-11-16")
#'
#' ## Run plot with more specific arguments
#' @example
#' period_change(sq_data,
#' compvar = "Workweek_span",
#' before_start = "2019-11-03",
#' before_end = "2019-11-16",
#' after_start = "2019-12-03",
#' after_end = "2019-12-16",
#' return = "percentage")
#'
#' @family Flexible Input
#' @export
period_change <-
function(data,
compvar,
before_start = min(as.Date(data$Date, "%m/%d/%Y")),
before_end,
after_start = as.Date(before_end) + 1,
after_end = max(as.Date(data$Date, "%m/%d/%Y")),
return = "count") {
## Check inputs
## Update these column names as per appropriate
required_variables <- c("Date",
compvar,
"PersonId")
## Error message if variables are not present
## Nothing happens if all present
data %>%
check_inputs(requirements = required_variables)
daterange_1_start <- as.Date(before_start)
daterange_1_end <- as.Date(before_end)
daterange_2_start <- as.Date(after_start)
daterange_2_end <- as.Date(after_end)
# Fix dates format for WpA Queries
WpA_dataset <- data %>% mutate(Date = as.Date(Date, "%m/%d/%Y"))
# Check for dates in data file
# if (daterange_1_start < min(WpA_dataset$Date) |
# daterange_1_start > max(WpA_dataset$Date) |
# daterange_1_end < min(WpA_dataset$Date) |
# daterange_1_end > max(WpA_dataset$Date) |
# daterange_2_start < min(WpA_dataset$Date) |
# daterange_2_start > max(WpA_dataset$Date) |
# daterange_2_end < min(WpA_dataset$Date) |
# daterange_2_end > max(WpA_dataset$Date)) {
# stop('Dates not found in dataset')
# geterrmessage()
# }
# Create variable => Period
WpA_dataset_table <-
WpA_dataset %>% mutate(
Period = case_when(
Date >= daterange_1_start &
Date <= daterange_1_end ~ "Before",
Date >= daterange_2_start &
Date <= daterange_2_end ~ "After"
)
) %>% filter(Period == "Before" | Period == "After")
# Group data by
mydata_table <-
WpA_dataset_table %>%
select(PersonId, Period, all_of(compvar)) %>%
group_by(Period, PersonId) %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
# Turn to Long
data_wide <- tidyr::gather(mydata_table, KPI, value,-Period,-PersonId)
# Turn to Wide
data_final <- tidyr::spread(data_wide, Period, value)
# Drop nas
data_final <- data_final %>% tidyr::drop_na()
data_wide <- data_final %>% tidyr::pivot_longer(c(After, Before), names_to = "Period", values_to = "value")
# Calculate total means and differences
means_table <- mydata_table %>%
group_by(Period) %>%
summarize(mean = mean(!!sym(compvar))) %>%
spread(., Period, mean) %>%
mutate(diff = After - Before) %>%
mutate(perc_diff = diff/Before)
# run Wilcox Signed-Rank test
test_set <- data_wide %>% mutate(X = ifelse(Period == "Before", 0, 1))
res <- p_test(test_set, outcome = "X", behavior = "value", paired = TRUE)
p_val <- signif(res, digits = 3)
data_summary <- means_table %>%
mutate(pval = p_val)
# Calculate change between periods
data_final <- data_final %>% mutate(delta = After - Before)
# Calculate percent change between periods
data_final <-
data_final %>% mutate(perc_diff = (After - Before) / Before)
# Drop NA and Errors
data_final <- data_final %>% tidyr::drop_na(delta)
data_final <-
data_final %>% filter(Before > 0) #filters out people who joined the organization after the 'Before' period
# Group 100%+ together so you can see how many are over 100% in a plot data set
data_plot <- data_final
data_plot$perc_diff[which(data_plot$perc_diff > 1)] <- 1.01
# Categorize perc_diff by percentages
data_plot <-
data_plot %>% mutate(Mybins = ggplot2::cut_width(
perc_diff,
width = 0.1,
boundary = 0,
labels = F
))
# create x-axis labels and replace Mybins with label
labels = c(paste(seq(-100, 100, 10), "%", sep = ""), "100%+")
x_axis <- vector("character", 21)
for (i in 1:20) {
x_axis[i] <- paste(labels[i], "-", labels[i + 1])
}
x_axis[21] <- labels[21]
data_plot <- data_plot %>% mutate(Mybins = x_axis[Mybins])
data_plot <-
data_plot %>%
mutate(Mybins = factor(Mybins, levels = x_axis))
#calculate percentage of employees in each bin
data_legend <-
data_plot %>%
group_by(Mybins) %>%
summarize(Employee_Count = n_distinct(PersonId))
data_legend <- data.frame(data_legend)
data_legend <-
data_legend %>%
mutate(Employee_Counts = paste("n=", Employee_Count))
data_legend <-
data_legend %>%
mutate(Employee_perc = Employee_Count / sum(data_legend$Employee_Count))
# create summary table for table output
summary_table <-
data_legend %>%
select(Mybins, Employee_Count, Employee_perc)
if (return == "count") {
# Plot barplot of Count of Employees with % change
data_plot %>%
ggplot(aes(x = Mybins)) +
geom_bar(fill = "#203864") +
scale_x_discrete(name = "Percent change") +
scale_y_continuous(name = "Employee count") +
theme_wpa_basic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(
title = paste("Number of employees with change in", compvar),
subtitle = paste(
"Period 1:",
daterange_1_start,
"to",
daterange_1_end,
"and Period 2:",
daterange_2_start,
"to",
daterange_2_end
)
) +
labs(
caption = paste(
"Total employees =",
sum(data_legend$Employee_Count),
"| Data from",
daterange_1_start,
"to",
daterange_1_end,
"and",
daterange_2_start,
"to",
daterange_2_end,
"| p =",
p_val
)
)
} else if (return == "percentage") {
# Create histogram by % of employees changing
data_legend %>%
ggplot(aes(x = Mybins, y = Employee_perc)) +
geom_col( fill = "#203864") +
scale_x_discrete(name = "Percent change") +
scale_y_continuous(name = "Percentage of measured employees",
labels = scales::percent_format(accuracy = 1)) +
annotate(
"text",
x = data_legend$Mybins,
y = -0.005,
label = data_legend$Employee_Counts,
size = 3
) +
theme_wpa_basic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(
title = paste("Percentage of employees with change in", compvar),
subtitle = paste(
"Period 1:",
daterange_1_start,
"to",
daterange_1_end,
"and Period 2:",
daterange_2_start,
"to",
daterange_2_end
)
) +
labs(
caption = paste(
"Total employees =",
sum(data_legend$Employee_Count),
"| Data from",
daterange_1_start,
"to",
daterange_1_end,
"and",
daterange_2_start,
"to",
daterange_2_end,
"| p =",
p_val
)
)
} else if(return == "table"){
summary_table %>%
mutate_at("Employee_perc", ~round(. * 100, 1)) %>%
as_tibble() %>%
return()
} else if(return == "summary"){
data_summary %>%
rename(mean_after = After, mean_before = Before) %>%
select(mean_before, mean_after, diff, perc_diff, pval) %>%
return()
# data_wide %>% return()
# as_tibble(mean_before, mean_after, total_diff, total_perc_diff, p_val) %>% return()
} else {
stop("Please enter a valid input for `return`, either count, percentage, or table.")
}
}