This commit is contained in:
Parent 6ff9d9a9d7
Commit 7c6b465042

@@ -1,27 +1,38 @@
#' CausalGrid: A package for subgroup effects
#'
#' This package helps you identify and analyze subgroups within a sample.
#' Tools for finding heterogeneous treatment effects (and means) based on
#' partitioning the covariate/feature space via full cross-cuts and solved via
#' greedy search. A typical usage would be analyzing an experiment to find the
#' high-level subgroups (a coarse partition that is useful to humans) that
#' differ in their estimated treatment effects.
#'
#' This package is inspired by, and uses ideas from, \code{Causal
#' Tree} but aims to have the
#' partition be more interpretable and have better accuracy. It is slower,
#' though for high-level partitions this is usually not an issue.
#'
#' Subgroups are constructed as a grid over the features/covariates \code{X}.
#' For example, with 1 feature going from 0 to 1 it may split at values c1, c2,
#' resulting in segments [0,c1], (c1, c2], (c2,1]. A split at value c means it
#' splits <= and >. The segments may be of uneven sizes. Splits along several
#' features result in a grid by constructing the Cartesian product of the
#' feature-specific splits. Not all features will necessarily be split or split
#' the same number of times.
#' resulting in segments \code{[0,c1], (c1, c2], (c2,1]}. A split at value
#' \code{c} means it splits <= and >. The segments may be of uneven sizes.
#' Splits along several features result in a grid by constructing the Cartesian
#' product of the feature-specific splits. Not all features will necessarily be
#' split or split the same number of times.
#'
#' The main entry point is \code{\link{fit_estimate_partition}}.
#'
#' Randomization: This package should be able to be run with no randomness. With
#' default/simple parameters the following places randomize but can be
#' overridden. \itemize{
#' \item Generating train/est splits. Can be overridden by providing
#' \code{tr_split}
#' \item Generating trtr/trcv splits. Can be overridden by providing
#' \code{cv_folds}
#' \item Bumping samples. Can be overridden by providing a list of samples for
#' \code{bump_samples}
#' \item Estimation plans: Provided ones (\code{lm_est(lasso=TRUE,...)} and
#' \code{grid_rf}) use \code{cv_folds}. User-made ones should too.
#' overridden. \itemize{
#' \item Generating train/est splits. Can be overridden
#' by providing \code{tr_split}
#' \item Generating trtr/trcv splits. Can be
#' overridden by providing \code{cv_folds}
#' \item Bumping samples. Can be
#' overridden by providing a list of samples for \code{bump_samples}
#' \item
#' Estimation plans: Provided ones (\code{lm_est(lasso=TRUE,...)} and
#' \code{grid_rf}) use \code{cv_folds}. User-made ones should too.
#' }
#'
#' @useDynLib CausalGrid, .registration = TRUE
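As a hedged illustration of the randomization notes above (the argument names are the documented ones; the data and the explicit split are invented for the sketch):

```r
library(CausalGrid)

set.seed(0)                             # only for the fake data below
N <- 200
X <- data.frame(x1 = runif(N))
d <- rep(c(0, 1), N / 2)                # treatment indicator
y <- d * (X$x1 > 0.5) + rnorm(N)

tr_split <- rep(c(TRUE, FALSE), N / 2)  # explicit train/est split: no RNG used for it
fit <- fit_estimate_partition(y, X, d = d, tr_split = tr_split)
```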
@@ -120,7 +131,8 @@ NULL
#'
#' Usability:
#'
#' - Documentation descriptions (separate from titles)
#' - Documentation descriptions (separate from titles). Define the bump samples
#' explicit case.
#'
#' - Have nicer factor labels (especially if split at bottom point, make [T,T]
#' rather than [T,T+1], and redo top (to not have -1))
@@ -1,5 +1,7 @@
# Project

Tools for finding heterogeneous treatment effects (and means) based on partitioning the covariate/feature space via full cross-cuts and solved via greedy search. These splits go across the entire sample rather than being done on increasingly smaller blocks (as [Causal Tree](https://github.com/susanathey/causalTree) does).

Tools for finding heterogeneous treatment effects (and means) based on partitioning the covariate/feature space via full cross-cuts and solved via greedy search. A typical usage would be analyzing an experiment to find the high-level subgroups (a coarse partition that is useful to humans) that differ in their estimated treatment effects.

This package is inspired by, and uses ideas from, [Causal Tree](https://github.com/susanathey/causalTree) but aims to have the partition be more interpretable and have better accuracy. It is slower, though for high-level partitions this is usually not an issue.

This project is currently in an advanced prototype stage. Issues may still be found in common usage. Please create issues for these!
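A minimal quick-start sketch (simulated data; `fit_estimate_partition` is the entry point named in the package documentation, and `num_cells` appears in the repo's scripts):

```r
library(CausalGrid)

set.seed(1337)
N <- 500
X <- data.frame(x1 = runif(N), x2 = runif(N))
d <- rep(c(0, 1), N / 2)            # treatment assignment
y <- (X$x1 > 0.5) * d + rnorm(N)    # a single true split on x1

fit <- fit_estimate_partition(y, X, d = d)
num_cells(fit)                      # how many subgroups the fitted grid has
```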
@@ -34,8 +34,8 @@ Some common parameters
set.seed(1337) # data differs across sims, but the bump sample indices are the same across sims
N = 500
n_bump = 20
S = 100
cores_to_use = getOption("cl.cores", default=1)
S = 200
cores_to_use = getOption("cl.cores", default=5)
```
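The `getOption("cl.cores", default=...)` lookups above mean the core count can be changed without editing the chunks; this is base R's options mechanism:

```r
options(cl.cores = 1)               # e.g. force single-threaded runs
getOption("cl.cores", default = 5)  # now returns 1
```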
@@ -57,6 +57,8 @@ if(cores_to_use>1){
}

# using doSNOW instead of the generic doParallel as it has a nice progress bar option.
t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb <- txtProgressBar(max = S, style = 3)
progress_fn <- function(n) setTxtProgressBar(pb, n)
n_methods = 1 + 1 + 1 + 4
@@ -97,7 +99,10 @@ sim_rets = foreach(s=1:S, .packages=c("CausalGrid"), .options.snow = list(progre
list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells)
#sim_rets[[s]] = list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells)
}
t2 = Sys.time() # can use as.numeric(t1) to convert to seconds
td = t2-t1
close(pb)
cat(paste("Total time: ",format(as.numeric(td))," ", attr(td,"units"),"\n"))
if(cores_to_use>1) stopCluster(pr_cl)
```
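A self-contained sketch of the doSNOW progress-bar pattern used above (cluster size and iteration count are arbitrary here):

```r
library(doSNOW)   # loads snow, which provides makeCluster/stopCluster
library(foreach)

cl <- makeCluster(2)
registerDoSNOW(cl)
pb <- txtProgressBar(max = 10, style = 3)
opts <- list(progress = function(n) setTxtProgressBar(pb, n))
res <- foreach(s = 1:10, .options.snow = opts) %dopar% sqrt(s)  # bar advances per task
close(pb)
stopCluster(cl)
```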
@@ -128,5 +133,5 @@ print(colMeans(n_cells_mat[,1:2]))

print("CV Depth")
print(paste("n_cells mean (4 is perfect):", paste(cv_names, collapse=", ")))
print(colMeans(n_cells_mat[,3:n_methods]))
print(colMeans(n_cells_mat[,3:n_methods])) # Including complexity limits the number. Using bumping in CV means we can tolerate more complexity when picking the optimal lambda. And moving from no bumping to bumping, the benefit of complexity is greater, so we get more cells.
```
@@ -52,8 +52,8 @@ breaks_k = seq(nbreaks_k)/(nbreaks_k+1)
breaks = list(breaks_k, breaks_k, breaks_k)
tau = tau_fn(X)
d = rep(c(0,1), N/2) # treatment assignment
S = 100
err_sds = c(1,0.1, 0.01, 0.001) #c(1,0.1, 0.01, 0.001)
S = 200
err_sds = c(1, 0.3, 0.1, 0.03) #c(1,0.1, 0.01, 0.001)
Ks = c(2,5,10) #c(2,5,10)
cores_to_use = getOption("cl.cores", default=5)
```
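For concreteness, the `breaks_k` construction in the hunk header places candidate split points evenly in (0,1); e.g. with `nbreaks_k = 3`:

```r
nbreaks_k <- 3
seq(nbreaks_k) / (nbreaks_k + 1)  # 0.25 0.50 0.75
```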
@@ -85,13 +85,15 @@ if(cores_to_use>1){

# using foreach structure so I can export some globals (couldn't see how to do it with parLapply).
# using doSNOW instead of the generic doParallel as it has a nice progress bar option.
t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb <- txtProgressBar(max = S, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
sim_rets = foreach(s=1:S, .packages=c("gsubfn","CausalGrid"), .options.snow = opts) %doChange% { #.errorhandling = "pass", .export=c("tau_fn", "eval_mse")
n_cell_cv_s = n_cell_cv_tilde_s = n_cell_cv_rest_s = n_cell_cv_1se_s = matrix(NA, nrow=length(err_sds), ncol=length(Ks))
for(err_sd_i in 1:length(err_sds)){
y = y_mat[,err_sd_i]
y = y_mats[[s]][,err_sd_i]
for(k_i in 1:length(Ks)) {
set.seed(1337)
k = Ks[k_i]
@@ -116,7 +118,10 @@ sim_rets = foreach(s=1:S, .packages=c("gsubfn","CausalGrid"), .options.snow = op
}
list(n_cell_cv_s=n_cell_cv_s, n_cell_cv_tilde_s=n_cell_cv_tilde_s, n_cell_cv_rest_s=n_cell_cv_rest_s, n_cell_cv_1se_s=n_cell_cv_1se_s)
}
t2 = Sys.time() # can use as.numeric(t1) to convert to seconds
td = t2-t1
close(pb)
cat(paste("Total time: ",format(as.numeric(td))," ", attr(td,"units"),"\n"))
if(cores_to_use>1) stopCluster(pr_cl)
```
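The `%doChange%` operator used in the chunk above is defined in the accompanying batch script (shown later in this diff); a minimal sketch of the same idea, reusing this chunk's `cores_to_use`:

```r
library(foreach)
# Choose the foreach operator once, so the loop body never changes
# between parallel and serial runs:
`%doChange%` <- if (cores_to_use > 1) `%dopar%` else `%do%`
```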
@@ -141,13 +146,16 @@ n_cell_cv_1se = apply(n_cell_cv_1se, c(2,3), mean)

```{r}
print("CV n_cell mean")
print(n_cell_cv_mean)
print(n_cell_cv_mean) # Generally increases with k and as err_sd falls. With k because we can grow deeper partitions (more chances for anomalously good results) on the majority of the data, and mostly because it's now harder to estimate the test effect (see the next set, where we do this perfectly). With this type of setup we'd always expect overly complex models (even if the true DGP is more complicated than one split) as going below it has a true cost.
#print("num sims with n_cell_cv>2")
#print(n_cell_cv_over)
print("CV (w/ oracle test effect) n_cell mean")
print(n_cell_cv_tilde_mean)
print("CV (rest to max_split=3) n_cell mean")
print("CV (rest to max_split=3) n_cell mean") # fewer chances to get anomalously good results
print(n_cell_cv_rest)
print("CV (w/ lambda_1se) n_cell mean")
print("CV (w/ lambda_1se) n_cell mean") # Does much better given the true objective function is flat.
print(n_cell_cv_1se)
```
```{r}
n_cell_cv_1se
```
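A small base-R reminder of what the `apply(..., c(2,3), mean)` aggregation in the hunk header does (toy dimensions assumed):

```r
arr <- array(rnorm(2 * 3 * 4), dim = c(2, 3, 4))  # sims x err_sds x Ks (toy)
dim(apply(arr, c(2, 3), mean))                    # averages over sims -> 3 x 4
```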
@@ -2,15 +2,12 @@
# - To change parallel/single-threaded, search for PARALLEL comments
# - There is a "suffix" for output files so that one can test w/o overwriting

# TODO:
# - Switch to using my_apply from the package so that I don't need the separate doParallel package

library(gsubfn)
library(devtools)
suppressPackageStartupMessages(library(data.table)) # suppressPackageStartupMessages(library(assertthat))
library(CausalGrid)
library(causalTree)
library(doParallel)
library(doSNOW)
library(foreach)
library(xtable)
library(stringr)
@@ -38,6 +35,7 @@ minsize=25
S = 100 #3 100, TODO: Is this just 1!?
Ns = c(500, 1000) #c(500, 1000)
N_test = 8000
N_bumps = 20
D = 5 #3
Ks = c(2, 2, 10, 20)
good_features = list(c(T, F), c(T, T, rep(F, 8)), c(rep(T, 4), rep(F, 16)), c(T,T), c(T,T))
@@ -47,7 +45,7 @@ NIters = NN*D*S


# Execution config
n_parallel = 5 #1 5; PARALLEL
n_parallel = getOption("cl.cores", default=5) #1 5; PARALLEL
my_seed = 1337
set.seed(my_seed)
rf.num.threads = 1 # NULL will multi-thread; doesn't seem to help much with small data
@@ -103,24 +101,24 @@ sim_eval_ct <- function(data1, data2, good_mask, honest=FALSE) {
return(list(ct_fit, nl, mse, ngood, ntot))
}

sim_cg_fit <- function(y, X, w, tr_sample, verbosity=0, honest=FALSE, do_rf=FALSE, num.threads=rf.num.threads, num.trees=100, ...) {
sim_cg_fit <- function(y, X, w, tr_sample, verbosity=0, do_rf=FALSE, num.threads=rf.num.threads, num.trees=100, ...) {
set.seed(my_seed)
if(do_rf) {
return(fit_estimate_partition(y, X, d=w, tr_split=tr_sample, cv_folds=nfolds, verbosity=verbosity,
min_size=2*minsize, max_splits=10, bucket_min_d_var=TRUE, bucket_min_n=2*b,
honest=honest, ctrl_method=grid_rf(num.threads=num.threads, num.trees=num.trees),
ctrl_method=grid_rf(num.threads=num.threads, num.trees=num.trees),
nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
}
else {
return(fit_estimate_partition(y, X, d=w, tr_split=tr_sample, cv_folds=nfolds, verbosity=verbosity,
min_size=2*minsize, max_splits=10, bucket_min_d_var=TRUE, bucket_min_n=2*b,
honest=honest, nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
}
}

sim_cg_vectors <- function(grid_fit, mask, X_te, tau_te) {
nsplits = grid_fit$partition$nsplits_by_dim
preds = predict.estimated_partition(grid_fit, new_X=X_te)
preds = predict(grid_fit, new_X=X_te)
cg_mse = mean((preds - tau_te)^2)
return(c(num_cells(grid_fit), cg_mse, sum(nsplits[mask]), sum(nsplits)))
}
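A hypothetical call pattern for the two helpers above (`X_te`/`tau_te` stand for the test grid and true effects used elsewhere in this script; the result names follow `sim_cg_vectors`'s return):

```r
# Fit on simulated data, then score against the oracle effects on test points.
fit <- sim_cg_fit(y, X, w, tr_sample)
res <- sim_cg_vectors(fit, good_features[[d]], X_te, tau_te)
# res: c(n_cells, mse, n_good_splits, n_total_splits)
```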
@@ -175,11 +173,12 @@ ct_h_nl = results_ct_h[,4]
# Eval CG -----
cat("Eval CG\n")


`%doChange%` = if(n_parallel>1) `%dopar%` else `%do%` # so I can just change the variable without changing other code for single-threaded
if(n_parallel>1) {
if(file.exists(log_file)) file.remove(log_file)
cl <- makeCluster(n_parallel, outfile=log_file)
registerDoParallel(cl)
#cl <- makeCluster(n_parallel)
registerDoSNOW(cl)
}

bar_length = if(n_parallel>1) NN*D else S*NN*D
@@ -187,7 +186,6 @@ t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb = utils::txtProgressBar(0, bar_length, style = 3)
run=1
N_bumps = 20
outer_results = list()
for(d in 1:D) {
for(N_i in 1:NN) {
@@ -195,9 +193,9 @@ for(d in 1:D) {
utils::setTxtProgressBar(pb, run)
run = run+1
}
results_s = list() #PARALLEL: comment-out
#results_s = list() #PARALLEL: comment-out
#for(s in 1:S){ #PARALLEL: comment-out, uncomment next
results_s = foreach(s=1:S, .packages=c("proto","gsubfn","rpart", "rpart.plot", "data.table","causalTree", "ranger", "lattice", "ggplot2", "caret", "Matrix", "foreach", "CausalGrid"), .errorhandling = "pass") %dopar% { #, .combine=rbind
results_s = foreach(s=1:S, .packages=c("proto","gsubfn","rpart", "rpart.plot", "data.table","causalTree", "ranger", "lattice", "ggplot2", "caret", "Matrix", "foreach", "CausalGrid"), .errorhandling = "pass") %doChange% { #, .combine=rbind
if(n_parallel==1) {
utils::setTxtProgressBar(pb, run)
run = run+1
@@ -210,11 +208,11 @@
tr_sample = c(rep(TRUE, N), rep(FALSE, N))


grid_a_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE)
grid_a_LassoCV_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, ctrl_method="LassoCV")
grid_a_RF_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, do_rf=TRUE)
grid_a_LassoCV_b_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, ctrl_method="LassoCV", bump_samples=N_bumps)
grid_a_RF_b_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, do_rf=TRUE, bump_samples=N_bumps)
grid_a_fit <- sim_cg_fit(y, X, w, tr_sample)
grid_a_LassoCV_fit <- sim_cg_fit(y, X, w, tr_sample, ctrl_method="LassoCV")
grid_a_RF_fit <- sim_cg_fit(y, X, w, tr_sample, do_rf=TRUE)
grid_a_LassoCV_b_fit <- sim_cg_fit(y, X, w, tr_sample, ctrl_method="LassoCV", bump_samples=N_bumps)
grid_a_RF_b_fit <- sim_cg_fit(y, X, w, tr_sample, do_rf=TRUE, bump_samples=N_bumps)

grid_a_am_fit <- change_complexity(grid_a_fit, y, X, d=w, which.min(abs(ct_a_nl[iter] - (grid_a_fit$complexity_seq + 1))))
grid_a_LassoCV_b_hm_fit <- change_complexity(grid_a_LassoCV_b_fit, y, X, d=w, which.min(abs(ct_h_nl[iter] - (grid_a_LassoCV_b_fit$complexity_seq + 1))))
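For reference, a toy trace of the `which.min` complexity-matching above: it picks the entry in the fit's complexity sequence whose implied cell count (assumed here to be `complexity_seq + 1`, as the expression suggests) is closest to the causal tree's leaf count.

```r
complexity_seq <- c(0, 1, 2, 4, 7)               # toy values; cells = complexity_seq + 1
ct_leaves <- 4                                   # causal tree leaf count (toy)
which.min(abs(ct_leaves - (complexity_seq + 1))) # -> 3: 3 cells; ties go to the simpler entry
```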