This commit is contained in:
Parent 6ff9d9a9d7
Commit 7c6b465042

@@ -1,27 +1,38 @@
#' CausalGrid: A package for subgroup effects
#'
#' This package helps you identify and analyze subgroups within a sample.
#' Tools for finding heterogeneous treatment effects (and means) based on
#' partitioning the covariate/feature space via full cross-cuts and solved via
#' greedy search. A typical usage would be analyzing an experiment to find the
#' high-level subgroups (a coarse partition that is useful to humans) that
#' differ in their estimated treatment effects.
#'
#' This package is inspired by, and uses ideas from, \code{Causal
#' Tree} but aims to have the
#' partition be more interpretable and have better accuracy. It is slower,
#' though for high-level partitions this is usually not an issue.
#'
#' Subgroups are constructed as a grid over the features/covariates \code{X}.
#' For example, with 1 feature going from 0 to 1 it may split at values c1, c2,
#' resulting in segments [0,c1], (c1, c2], (c2,1]. A split at value c means it
#' splits <= and >. The segments may be of uneven sizes. Splits along several
#' features result in a grid by constructing the Cartesian product of the
#' feature-specific splits. Not all features will necessarily be split or split
#' the same number of times.
#' resulting in segments \code{[0,c1], (c1, c2], (c2,1]}. A split at value
#' \code{c} means it splits <= and >. The segments may be of uneven sizes.
#' Splits along several features result in a grid by constructing the Cartesian
#' product of the feature-specific splits. Not all features will necessarily be
#' split or split the same number of times.
#'
#' The main entry point is \code{\link{fit_estimate_partition}}.
#'
#' Randomization: This package should be able to be run with no randomness. With
#' default/simple parameters the following places randomize but can be
#' overridden. \itemize{
#' \item Generating train/est splits. Can be overridden by providing
#' \code{tr_split}
#' \item Generating trtr/trcv splits. Can be overridden by providing
#' \code{cv_folds}
#' \item Bumping samples. Can be overridden by providing a list of samples for
#' \code{bump_samples}
#' \item Estimation plans: Provided ones (\code{lm_est(lasso=TRUE,...)} and
#' \code{grid_rf}) use \code{cv_folds}. User-made ones should too.
#' overridden. \itemize{
#' \item Generating train/est splits. Can be overridden
#' by providing \code{tr_split}
#' \item Generating trtr/trcv splits. Can be
#' overridden by providing \code{cv_folds}
#' \item Bumping samples. Can be
#' overridden by providing a list of samples for \code{bump_samples}
#' \item
#' Estimation plans: Provided ones (\code{lm_est(lasso=TRUE,...)} and
#' \code{grid_rf}) use \code{cv_folds}. User-made ones should too.
#' }
#'
#' @useDynLib CausalGrid, .registration = TRUE
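As a hedged illustration of the randomization notes above (the argument names are the documented ones; the data and the explicit split are invented for the sketch):

```r
library(CausalGrid)

set.seed(0)                             # only for the fake data below
N <- 200
X <- data.frame(x1 = runif(N))
d <- rep(c(0, 1), N / 2)                # treatment indicator
y <- d * (X$x1 > 0.5) + rnorm(N)

tr_split <- rep(c(TRUE, FALSE), N / 2)  # explicit train/est split: no RNG used for it
fit <- fit_estimate_partition(y, X, d = d, tr_split = tr_split)
```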
@@ -120,7 +131,8 @@ NULL
#'
#' Usability:
#'
#' - Documentation descriptions (separate from titles)
#' - Documentation descriptions (separate from titles). Define the bump samples
#' explicit case.
#'
#' - Have nicer factor labels (especially if split at bottom point, make [T,T]
#' rather than [T,T+1], and redo top (to not have -1))
@@ -1,5 +1,7 @@
# Project

Tools for finding heterogeneous treatment effects (and means) based on partitioning the covariate/feature space via full cross-cuts and solved via greedy search. These splits go across the entire sample rather than being done on increasingly smaller blocks (as [Causal Tree](https://github.com/susanathey/causalTree) does).

Tools for finding heterogeneous treatment effects (and means) based on partitioning the covariate/feature space via full cross-cuts and solved via greedy search. A typical usage would be analyzing an experiment to find the high-level subgroups (a coarse partition that is useful to humans) that differ in their estimated treatment effects.

This package is inspired by, and uses ideas from, [Causal Tree](https://github.com/susanathey/causalTree) but aims to have the partition be more interpretable and have better accuracy. It is slower, though for high-level partitions this is usually not an issue.

This project is currently in an advanced prototype stage. Issues may still be found in common usage. Please create issues for these!
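A minimal quick-start sketch (simulated data; `fit_estimate_partition` is the entry point named in the package documentation, and `num_cells` appears in the repo's scripts):

```r
library(CausalGrid)

set.seed(1337)
N <- 500
X <- data.frame(x1 = runif(N), x2 = runif(N))
d <- rep(c(0, 1), N / 2)            # treatment assignment
y <- (X$x1 > 0.5) * d + rnorm(N)    # a single true split on x1

fit <- fit_estimate_partition(y, X, d = d)
num_cells(fit)                      # how many subgroups the fitted grid has
```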
@@ -34,8 +34,8 @@ Some common parameters
set.seed(1337) # data differs across sims, but the bump sample indices are the same across sims
N = 500
n_bump = 20
S = 100
cores_to_use = getOption("cl.cores", default=1)
S = 200
cores_to_use = getOption("cl.cores", default=5)
```
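The `getOption("cl.cores", default=...)` lookups above mean the core count can be changed without editing the chunks; this is base R's options mechanism:

```r
options(cl.cores = 1)               # e.g. force single-threaded runs
getOption("cl.cores", default = 5)  # now returns 1
```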
@@ -57,6 +57,8 @@ if(cores_to_use>1){
}

# using doSNOW instead of the generic doParallel as it has a nice progress bar option.
t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb <- txtProgressBar(max = S, style = 3)
progress_fn <- function(n) setTxtProgressBar(pb, n)
n_methods = 1 + 1 + 1 + 4
@@ -97,7 +99,10 @@ sim_rets = foreach(s=1:S, .packages=c("CausalGrid"), .options.snow = list(progre
list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells)
#sim_rets[[s]] = list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells)
}
t2 = Sys.time() # can use as.numeric(t1) to convert to seconds
td = t2-t1
close(pb)
cat(paste("Total time: ",format(as.numeric(td))," ", attr(td,"units"),"\n"))
if(cores_to_use>1) stopCluster(pr_cl)
```
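A self-contained sketch of the doSNOW progress-bar pattern used above (cluster size and iteration count are arbitrary here):

```r
library(doSNOW)   # loads snow, which provides makeCluster/stopCluster
library(foreach)

cl <- makeCluster(2)
registerDoSNOW(cl)
pb <- txtProgressBar(max = 10, style = 3)
opts <- list(progress = function(n) setTxtProgressBar(pb, n))
res <- foreach(s = 1:10, .options.snow = opts) %dopar% sqrt(s)  # bar advances per task
close(pb)
stopCluster(cl)
```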
@@ -128,5 +133,5 @@ print(colMeans(n_cells_mat[,1:2]))

print("CV Depth")
print(paste("n_cells mean (4 is perfect):", paste(cv_names, collapse=", ")))
print(colMeans(n_cells_mat[,3:n_methods]))
print(colMeans(n_cells_mat[,3:n_methods])) # Including complexity limits the number. Using bumping in CV means we can tolerate more complexity when picking the optimal lambda. And moving from no bumping to bumping, the benefit of complexity is greater, so we get more cells.
```
@@ -52,8 +52,8 @@ breaks_k = seq(nbreaks_k)/(nbreaks_k+1)
breaks = list(breaks_k, breaks_k, breaks_k)
tau = tau_fn(X)
d = rep(c(0,1), N/2) # treatment assignment
S = 100
err_sds = c(1,0.1, 0.01, 0.001) #c(1,0.1, 0.01, 0.001)
S = 200
err_sds = c(1, 0.3, 0.1, 0.03) #c(1,0.1, 0.01, 0.001)
Ks = c(2,5,10) #c(2,5,10)
cores_to_use = getOption("cl.cores", default=5)
```
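For concreteness, the `breaks_k` construction in the hunk header places candidate split points evenly in (0,1); e.g. with `nbreaks_k = 3`:

```r
nbreaks_k <- 3
seq(nbreaks_k) / (nbreaks_k + 1)  # 0.25 0.50 0.75
```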
@@ -85,13 +85,15 @@ if(cores_to_use>1){

# using foreach structure so I can export some globals (couldn't see how to do it with parLapply).
# using doSNOW instead of the generic doParallel as it has a nice progress bar option.
t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb <- txtProgressBar(max = S, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
sim_rets = foreach(s=1:S, .packages=c("gsubfn","CausalGrid"), .options.snow = opts) %doChange% { #.errorhandling = "pass", .export=c("tau_fn", "eval_mse")
n_cell_cv_s = n_cell_cv_tilde_s = n_cell_cv_rest_s = n_cell_cv_1se_s = matrix(NA, nrow=length(err_sds), ncol=length(Ks))
for(err_sd_i in 1:length(err_sds)){
y = y_mat[,err_sd_i]
y = y_mats[[s]][,err_sd_i]
for(k_i in 1:length(Ks)) {
set.seed(1337)
k = Ks[k_i]
@@ -116,7 +118,10 @@ sim_rets = foreach(s=1:S, .packages=c("gsubfn","CausalGrid"), .options.snow = op
}
list(n_cell_cv_s=n_cell_cv_s, n_cell_cv_tilde_s=n_cell_cv_tilde_s, n_cell_cv_rest_s=n_cell_cv_rest_s, n_cell_cv_1se_s=n_cell_cv_1se_s)
}
t2 = Sys.time() # can use as.numeric(t1) to convert to seconds
td = t2-t1
close(pb)
cat(paste("Total time: ",format(as.numeric(td))," ", attr(td,"units"),"\n"))
if(cores_to_use>1) stopCluster(pr_cl)
```
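The `%doChange%` operator used in the chunk above is defined in the accompanying batch script (shown later in this diff); a minimal sketch of the same idea, reusing this chunk's `cores_to_use`:

```r
library(foreach)
# Choose the foreach operator once, so the loop body never changes
# between parallel and serial runs:
`%doChange%` <- if (cores_to_use > 1) `%dopar%` else `%do%`
```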
@@ -141,13 +146,16 @@ n_cell_cv_1se = apply(n_cell_cv_1se, c(2,3), mean)

```{r}
print("CV n_cell mean")
print(n_cell_cv_mean)
print(n_cell_cv_mean) # Generally increases with k and as err_sd falls. With k because we can grow deeper partitions (more chances for anomalously good results) on the majority of the data, and mostly because it's now harder to estimate the test effect (see the next set, where we do this perfectly). With this type of setup we'd always expect overly complex models (even if the true DGP is more complicated than one split) as going below it has a true cost.
#print("num sims with n_cell_cv>2")
#print(n_cell_cv_over)
print("CV (w/ oracle test effect) n_cell mean")
print(n_cell_cv_tilde_mean)
print("CV (rest to max_split=3) n_cell mean")
print("CV (rest to max_split=3) n_cell mean") # fewer chances to get anomalously good results
print(n_cell_cv_rest)
print("CV (w/ lambda_1se) n_cell mean")
print("CV (w/ lambda_1se) n_cell mean") # Does much better given the true objective function is flat.
print(n_cell_cv_1se)
```
```{r}
n_cell_cv_1se
```
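A small base-R reminder of what the `apply(..., c(2,3), mean)` aggregation in the hunk header does (toy dimensions assumed):

```r
arr <- array(rnorm(2 * 3 * 4), dim = c(2, 3, 4))  # sims x err_sds x Ks (toy)
dim(apply(arr, c(2, 3), mean))                    # averages over sims -> 3 x 4
```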
@@ -2,15 +2,12 @@
# - To change parallel/single-threaded, search for PARALLEL comments
# - There is a "suffix" for output files so that one can test w/o overwriting

# TODO:
# - Switch to using my_apply from the package so that I don't need the separate doParallel package

library(gsubfn)
library(devtools)
suppressPackageStartupMessages(library(data.table)) # suppressPackageStartupMessages(library(assertthat))
library(CausalGrid)
library(causalTree)
library(doParallel)
library(doSNOW)
library(foreach)
library(xtable)
library(stringr)
@@ -38,6 +35,7 @@ minsize=25
S = 100 #3 100, TODO: Is this just 1!?
Ns = c(500, 1000) #c(500, 1000)
N_test = 8000
N_bumps = 20
D = 5 #3
Ks = c(2, 2, 10, 20)
good_features = list(c(T, F), c(T, T, rep(F, 8)), c(rep(T, 4), rep(F, 16)), c(T,T), c(T,T))
@@ -47,7 +45,7 @@ NIters = NN*D*S


# Execution config
n_parallel = 5 #1 5; PARALLEL
n_parallel = getOption("cl.cores", default=5) #1 5; PARALLEL
my_seed = 1337
set.seed(my_seed)
rf.num.threads = 1 # NULL will multi-thread; doesn't seem to help much with small data
@@ -103,24 +101,24 @@ sim_eval_ct <- function(data1, data2, good_mask, honest=FALSE) {
return(list(ct_fit, nl, mse, ngood, ntot))
}

sim_cg_fit <- function(y, X, w, tr_sample, verbosity=0, honest=FALSE, do_rf=FALSE, num.threads=rf.num.threads, num.trees=100, ...) {
sim_cg_fit <- function(y, X, w, tr_sample, verbosity=0, do_rf=FALSE, num.threads=rf.num.threads, num.trees=100, ...) {
set.seed(my_seed)
if(do_rf) {
return(fit_estimate_partition(y, X, d=w, tr_split=tr_sample, cv_folds=nfolds, verbosity=verbosity,
min_size=2*minsize, max_splits=10, bucket_min_d_var=TRUE, bucket_min_n=2*b,
honest=honest, ctrl_method=grid_rf(num.threads=num.threads, num.trees=num.trees),
ctrl_method=grid_rf(num.threads=num.threads, num.trees=num.trees),
nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
}
else {
return(fit_estimate_partition(y, X, d=w, tr_split=tr_sample, cv_folds=nfolds, verbosity=verbosity,
min_size=2*minsize, max_splits=10, bucket_min_d_var=TRUE, bucket_min_n=2*b,
honest=honest, nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
nsplits_k_warn_limit=NA, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE), ...))
}
}

sim_cg_vectors <- function(grid_fit, mask, X_te, tau_te) {
nsplits = grid_fit$partition$nsplits_by_dim
preds = predict.estimated_partition(grid_fit, new_X=X_te)
preds = predict(grid_fit, new_X=X_te)
cg_mse = mean((preds - tau_te)^2)
return(c(num_cells(grid_fit), cg_mse, sum(nsplits[mask]), sum(nsplits)))
}
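A hypothetical call pattern for the two helpers above (`X_te`/`tau_te` stand for the test grid and true effects used elsewhere in this script; the result names follow `sim_cg_vectors`'s return):

```r
# Fit on simulated data, then score against the oracle effects on test points.
fit <- sim_cg_fit(y, X, w, tr_sample)
res <- sim_cg_vectors(fit, good_features[[d]], X_te, tau_te)
# res: c(n_cells, mse, n_good_splits, n_total_splits)
```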
@@ -175,11 +173,12 @@ ct_h_nl = results_ct_h[,4]
# Eval CG -----
cat("Eval CG\n")


`%doChange%` = if(n_parallel>1) `%dopar%` else `%do%` # so I can just change the variable without changing other code for single-threaded
if(n_parallel>1) {
if(file.exists(log_file)) file.remove(log_file)
cl <- makeCluster(n_parallel, outfile=log_file)
registerDoParallel(cl)
#cl <- makeCluster(n_parallel)
registerDoSNOW(cl)
}

bar_length = if(n_parallel>1) NN*D else S*NN*D
@@ -187,7 +186,6 @@ t1 = Sys.time()
cat(paste("Start time: ",t1,"\n"))
pb = utils::txtProgressBar(0, bar_length, style = 3)
run=1
N_bumps = 20
outer_results = list()
for(d in 1:D) {
for(N_i in 1:NN) {
@@ -195,9 +193,9 @@ for(d in 1:D) {
utils::setTxtProgressBar(pb, run)
run = run+1
}
results_s = list() #PARALLEL: comment-out
#results_s = list() #PARALLEL: comment-out
#for(s in 1:S){ #PARALLEL: comment-out, uncomment next
results_s = foreach(s=1:S, .packages=c("proto","gsubfn","rpart", "rpart.plot", "data.table","causalTree", "ranger", "lattice", "ggplot2", "caret", "Matrix", "foreach", "CausalGrid"), .errorhandling = "pass") %dopar% { #, .combine=rbind
results_s = foreach(s=1:S, .packages=c("proto","gsubfn","rpart", "rpart.plot", "data.table","causalTree", "ranger", "lattice", "ggplot2", "caret", "Matrix", "foreach", "CausalGrid"), .errorhandling = "pass") %doChange% { #, .combine=rbind
if(n_parallel==1) {
utils::setTxtProgressBar(pb, run)
run = run+1
@@ -210,11 +208,11 @@
tr_sample = c(rep(TRUE, N), rep(FALSE, N))


grid_a_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE)
grid_a_LassoCV_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, ctrl_method="LassoCV")
grid_a_RF_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, do_rf=TRUE)
grid_a_LassoCV_b_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, ctrl_method="LassoCV", bump_samples=N_bumps)
grid_a_RF_b_fit <- sim_cg_fit(y, X, w, tr_sample, honest=FALSE, do_rf=TRUE, bump_samples=N_bumps)
grid_a_fit <- sim_cg_fit(y, X, w, tr_sample)
grid_a_LassoCV_fit <- sim_cg_fit(y, X, w, tr_sample, ctrl_method="LassoCV")
grid_a_RF_fit <- sim_cg_fit(y, X, w, tr_sample, do_rf=TRUE)
grid_a_LassoCV_b_fit <- sim_cg_fit(y, X, w, tr_sample, ctrl_method="LassoCV", bump_samples=N_bumps)
grid_a_RF_b_fit <- sim_cg_fit(y, X, w, tr_sample, do_rf=TRUE, bump_samples=N_bumps)

grid_a_am_fit <- change_complexity(grid_a_fit, y, X, d=w, which.min(abs(ct_a_nl[iter] - (grid_a_fit$complexity_seq + 1))))
grid_a_LassoCV_b_hm_fit <- change_complexity(grid_a_LassoCV_b_fit, y, X, d=w, which.min(abs(ct_h_nl[iter] - (grid_a_LassoCV_b_fit$complexity_seq + 1))))
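For reference, a toy trace of the `which.min` complexity-matching above: it picks the entry in the fit's complexity sequence whose implied cell count (assumed here to be `complexity_seq + 1`, as the expression suggests) is closest to the causal tree's leaf count.

```r
complexity_seq <- c(0, 1, 2, 4, 7)               # toy values; cells = complexity_seq + 1
ct_leaves <- 4                                   # causal tree leaf count (toy)
which.min(abs(ct_leaves - (complexity_seq + 1))) # -> 3: 3 cells; ties go to the simpler entry
```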