Mirror of https://github.com/microsoft/SparseSC.git

Initial Commit

This commit is contained in:
Commit b112ca5318
.gitignore
@@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
README.md
@@ -0,0 +1,15 @@
# Ridge Synthetic Controls

### Setup

This package requires [numpy](http://www.numpy.org/) and
[scipy](https://www.scipy.org/) and has been tested with (Python 2.7.14,
NumPy 1.14.1, and SciPy 1.0.0) and (Python 3.5.5, NumPy 1.13.1, and
SciPy 1.0.1).

### API

For now, refer to the example code in `./example-code.py` for usage
details; a minimal sketch of the workflow is shown below.
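A minimal usage sketch, distilled from `./example-code.py` using the names exported by the package's `__init__` in this commit; the input matrices and penalty grid here are illustrative placeholders:

```python
import numpy as np
import RidgeSC as SC

# X: control-unit covariates, Y: control-unit outcomes (np.matrix, rows = units)
X = np.matrix(np.random.normal(0, 1, (100, 10)))
Y = np.matrix(np.random.normal(0, 1, (100, 4)))

# starting point for the L2 penalty, and the largest useful L1 penalty given it
L2_pen = SC.L2_pen_guestimate(X)
L1_max = SC.get_max_lambda(X, Y, L2_PEN_W=L2_pen)

# cross-validated scores over a log-linear grid of L1 penalties
grid = np.exp(np.linspace(np.log(1e-4), np.log(1e-2), 10)) * L1_max
scores = SC.CV_score(X=X, Y=Y, LAMBDA=grid, L2_PEN_W=L2_pen)
best_L1 = grid[np.argmin(scores)]

# fit the covariate-weight matrix (V), then the synthetic-control weights
V = SC.tensor(X=X, Y=Y, LAMBDA=best_L1, L2_PEN_W=L2_pen)
weights = SC.weights(X=X, V=V, L2_PEN_W=L2_pen)
```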
RidgeSC/__init__.py
@@ -0,0 +1,10 @@
# PRIMARY FITTING FUNCTIONS
# (explicit relative imports; the original implicit form only works on Python 2)
from .fit_loo import loo_v_matrix, loo_weights, loo_score
from .fit_ct import ct_v_matrix, ct_weights, ct_score

# Public API
from .cross_validation import score_train_test, score_train_test_sorted_lambdas, CV_score
from .tensor import tensor
from .weights import weights
from .lambda_utils import get_max_lambda, L2_pen_guestimate
RidgeSC/cross_validation.py
@@ -0,0 +1,299 @@
from .fit_loo import loo_v_matrix, loo_score
from .fit_ct import ct_v_matrix, ct_score
from .optimizers.cd_line_search import cdl_search
import numpy as np
from concurrent import futures

def score_train_test(X,
                     Y,
                     train,
                     test,
                     X_treat=None,
                     Y_treat=None,
                     FoldNumber=None, # for consistency with score_train_test_sorted_lambdas()
                     **kwargs):
    """ Presents a unified API for ct_v_matrix and loo_v_matrix, and returns
        the v_mat, l2_pen_w (possibly calculated, possibly a parameter), and the score.
    """
    # to use `pdb.set_trace()` here, set `parallel = False` above
    if (X_treat is None) != (Y_treat is None):
        raise ValueError("parameters `X_treat` and `Y_treat` must both be Matrices or None")

    if X_treat is not None:
        # K-fold validation on the Treated units; assuming that Y and Y_treat are pre-intervention outcomes

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix): raise TypeError("X_treat is not a matrix")
        if not isinstance(Y_treat, np.matrix): raise TypeError("Y_treat is not a matrix")
        if X_treat.shape[1] == 0: raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0: raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError("X_treat and Y_treat have different number of rows (%s and %s)" % (X_treat.shape[0], Y_treat.shape[0],))

        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
        # note that the weights, score, and loss function value returned here are for the in-sample predictions
        weights, v_mat, ts_score, loss, l2_pen_w, _ = \
                ct_v_matrix(X = np.vstack((X, X_treat[train, :])),
                            Y = np.vstack((Y, Y_treat[train, :])),
                            treated_units = [X.shape[0] + i for i in range(len(train))],
                            method = cdl_search,
                            **kwargs)

        # GET THE OUT-OF-SAMPLE PREDICTION ERROR
        s = ct_score(X = np.vstack((X, X_treat[test, :])),
                     Y = np.vstack((Y, Y_treat[test, :])),
                     treated_units = [X.shape[0] + i for i in range(len(test))],
                     V = v_mat,
                     L2_PEN_W = l2_pen_w)

        print("LAMBDA: %0.1f, zeros %s (of %s), Score: %0.1f / %0.1f " % (kwargs["LAMBDA"],
              sum(np.diag(v_mat == 0)),
              v_mat.shape[0],
              s,
              np.power(Y_treat[test, :] - np.mean(Y_treat[test, :]), 2).sum()))

    else:
        # K-fold validation on only the control units; assuming that Y contains post-intervention outcomes

        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
        # note that the weights, score, and loss function value returned here are for the in-sample predictions
        weights, v_mat, ts_score, loss, l2_pen_w, _ = \
                loo_v_matrix(X = X[train, :],
                             Y = Y[train, :],
                             # treated_units = [X.shape[0] + i for i in range(len(train))],
                             method = cdl_search,
                             **kwargs)

        # GET THE OUT-OF-SAMPLE PREDICTION ERROR
        s = loo_score(X = X, Y = Y,
                      treated_units = test,
                      V = v_mat,
                      L2_PEN_W = l2_pen_w)

    return v_mat, l2_pen_w, s


def score_train_test_sorted_lambdas(LAMBDA,
                                    start=None,
                                    cache=False,
                                    progress=False,
                                    FoldNumber=None,
                                    **kwargs):
    """ A wrapper which calls score_train_test() for each element of an
        array of `LAMBDA`'s, optionally caching the optimized v_mat and using it
        as the start position for the next iteration.
    """
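    # Editorial note (a sketch of the warm-start design, inferred from the loop
    # below): when `cache` is True, the optimized diagonal of v_mat from one
    # LAMBDA becomes the `start` position for the next, so passing a sorted
    # LAMBDA array lets each solve begin near the previous solution, e.g.:
    #
    #   lambdas = sorted(grid * L1_max, reverse=True)  # hypothetical grid
    #   v_mats, pens, scores = score_train_test_sorted_lambdas(
    #       LAMBDA=lambdas, cache=True, X=X, Y=Y, train=train, test=test)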
    # DEFAULTS
    values = [None] * len(LAMBDA)

    if progress > 0:
        import time
        t0 = time.time()

    for i, Lam in enumerate(LAMBDA):
        v_mat, w, s = values[i] = score_train_test(LAMBDA = Lam, start = start, **kwargs)

        if cache:
            start = np.diag(v_mat)
        if progress > 0 and (i % progress) == 0:
            t1 = time.time()
            if FoldNumber is None:
                print("iteration %s of %s, time: %0.4f, lambda: %0.4f" % (i+1, len(LAMBDA), t1 - t0, Lam,))
                #print("iteration %s of %s, time: %0.4f, lambda: %0.4f, diags: %s" % (i+1, len(LAMBDA), t1 - t0, Lam, np.diag(v_mat),))
            else:
                print("Fold %s, iteration %s of %s, time: %0.4f, lambda: %0.4f" % (FoldNumber, i+1, len(LAMBDA), t1 - t0, Lam,))
                #print("Fold %s, iteration %s of %s, time: %0.4f, lambda: %0.4f, diags: %s" % (FoldNumber, i+1, len(LAMBDA), t1 - t0, Lam, np.diag(v_mat),))
            t0 = time.time()

    return list(zip(*values))


def CV_score(X, Y, LAMBDA, X_treat=None, Y_treat=None, splits=5, sub_splits=None, quiet=False, parallel=False, max_workers=None, **kwargs):
    """ Cross fold validation for 1 or more L1 Penalties, holding the L2 penalty fixed.
    """

    # PARAMETER QC
    if not isinstance(X, np.matrix): raise TypeError("X is not a matrix")
    if not isinstance(Y, np.matrix): raise TypeError("Y is not a matrix")
    if (X_treat is None) != (Y_treat is None):
        raise ValueError("parameters `X_treat` and `Y_treat` must both be Matrices or None")
    if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0: raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]: raise ValueError("X and Y have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))

    try:
        iter(LAMBDA)
    except TypeError:
        # LAMBDA is a single value
        multi_lambda = False
        __score_train_test__ = score_train_test
    else:
        # LAMBDA is an iterable of values
        multi_lambda = True
        __score_train_test__ = score_train_test_sorted_lambdas

    if X_treat is not None:

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix): raise TypeError("X_treat is not a matrix")
        if not isinstance(Y_treat, np.matrix): raise TypeError("Y_treat is not a matrix")
        if X_treat.shape[1] == 0: raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0: raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError("X_treat and Y_treat have different number of rows (%s and %s)" % (X_treat.shape[0], Y_treat.shape[0],))

        # MESSAGING
        if not quiet:
            print("K-fold validation with %s control and %s treated units, %s predictors and %s outcomes, holding out one fold among Treated units; assumes that `Y` and `Y_treat` are pre-intervention outcomes" %
                  (X.shape[0], X_treat.shape[0], X.shape[1], Y.shape[1],))

        if isinstance(splits, int) or np.issubdtype(splits, np.integer):
            from sklearn.model_selection import KFold
            splits = KFold(splits).split(np.arange(X_treat.shape[0]))
        train_test_splits = [x for x in splits]
        n_splits = len(train_test_splits)

        if parallel:

            if max_workers is None:
                # CALCULATE A DEFAULT FOR MAX_WORKERS
                import multiprocessing
                if n_splits == 1:
                    print("WARNING: Using parallel options with a single split is expected to reduce performance")
                max_workers = min(max(multiprocessing.cpu_count() - 2, 1), len(train_test_splits))
                if max_workers == 1 and n_splits > 1:
                    print("WARNING: max_workers defaulted to 1 on a machine with %s cores." % multiprocessing.cpu_count())

            _initialize_Global_worker_pool(max_workers)

            try:

                promises = [_worker_pool.submit(__score_train_test__,
                                                X = X,
                                                Y = Y,
                                                LAMBDA = LAMBDA,
                                                X_treat = X_treat,
                                                Y_treat = Y_treat,
                                                train = train,
                                                test = test,
                                                FoldNumber = fold,
                                                **kwargs)
                            for fold, (train, test) in enumerate(train_test_splits)]
                results = [promise.result() for promise in futures.as_completed(promises)]

            finally:

                _clean_up_worker_pool()

        else:

            results = [__score_train_test__(X = X,
                                            Y = Y,
                                            X_treat = X_treat,
                                            Y_treat = Y_treat,
                                            LAMBDA = LAMBDA,
                                            train = train,
                                            test = test,
                                            FoldNumber = fold,
                                            **kwargs)
                       for fold, (train, test) in enumerate(train_test_splits)]

    else: # X_treat *is* None
        # MESSAGING
        if not quiet:
            print("Leave-one-out Cross Validation with %s control units, %s predictors and %s outcomes; Y may contain post-intervention outcomes" %
                  (X.shape[0], X.shape[1], Y.shape[1],))

        if isinstance(splits, int) or np.issubdtype(splits, np.integer):
            from sklearn.model_selection import KFold
            splits = KFold(splits).split(np.arange(X.shape[0]))
        train_test_splits = [x for x in splits]
        n_splits = len(train_test_splits)

        if parallel:

            if max_workers is None:
                # CALCULATE A DEFAULT FOR MAX_WORKERS
                import multiprocessing
                if n_splits == 1:
                    print("WARNING: Using parallel options with a single split is expected to reduce performance")
                max_workers = min(max(multiprocessing.cpu_count() - 2, 1), len(train_test_splits))
                if max_workers == 1 and n_splits > 1:
                    print("WARNING: max_workers defaulted to 1 on a machine with %s cores." % multiprocessing.cpu_count())

            _initialize_Global_worker_pool(max_workers)

            try:

                promises = [_worker_pool.submit(__score_train_test__,
                                                X = X,
                                                Y = Y,
                                                LAMBDA = LAMBDA,
                                                train = train,
                                                test = test,
                                                FoldNumber = fold,
                                                **kwargs)
                            for fold, (train, test) in enumerate(train_test_splits)]

                results = [promise.result() for promise in futures.as_completed(promises)]

            finally:

                _clean_up_worker_pool()

        else:
            results = [__score_train_test__(X = X,
                                            Y = Y,
                                            LAMBDA = LAMBDA,
                                            train = train,
                                            test = test,
                                            FoldNumber = fold,
                                            **kwargs)
                       for fold, (train, test) in enumerate(train_test_splits)]

    # extract the score.
    v_mats, l2_pens, scores = list(zip(*results))

    if multi_lambda:
        total_score = [sum(s) for s in zip(*scores)]
    else:
        total_score = sum(scores)

    return total_score


# ------------------------------------------------------------
# utilities for maintaining a worker pool
# ------------------------------------------------------------

_worker_pool = None

def _initialize_Global_worker_pool(n_workers):
    global _worker_pool

    if _worker_pool is not None:
        return # keep it idempotent, please

    _worker_pool = futures.ProcessPoolExecutor(max_workers=n_workers)

def _clean_up_worker_pool():
    global _worker_pool

    if _worker_pool is not None:
        _worker_pool.shutdown()
        _worker_pool = None

import atexit
atexit.register(_clean_up_worker_pool)
example-code.py
@@ -0,0 +1,436 @@
"""
|
||||
USAGE:
|
||||
|
||||
cd path/to/RidgeSC
|
||||
python example-code.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
sys.path.append(os.path.join(os.getcwd(),".."))
|
||||
|
||||
import time
|
||||
import RidgeSC as SC
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
|
||||
# def setup(C,N,T,K,g,gs,bs ): # controls (C), treated units (N) , time periods (T), Predictors (K), groups (g), g-scale, b-scale
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# CONTROL PARAMETERS
|
||||
# ------------------------------------------------------------
|
||||
|
||||
random.seed(12345)
|
||||
np.random.seed(10101)
|
||||
|
||||
# Controls (per group), Treated (per group), pre-intervention Time points, post intervention time-periods
|
||||
C,N,T0,T1 = 5,5,4,4
|
||||
# Causal Covariates, Confounders , Random Covariates
|
||||
K,S,R = 7,4,5
|
||||
# Number of Groups, Scale of the Group Effect, (groups make the time series correlated for reasons other than the observed covariates...)
|
||||
groups,group_scale = 30,2
|
||||
# Scale of the Correlation of the Causal Covariates
|
||||
beta_scale,confounders_scale = 4,1
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# SIMULATE SOME DATA
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
# COVARIATE EFFECTS
|
||||
X_control = np.matrix(np.random.normal(0,1,((C)*groups, K+S+R)))
|
||||
X_treated = np.matrix(np.random.normal(0,1,((N)*groups, K+S+R)))
|
||||
|
||||
# CAUSAL
|
||||
b_cause = np.random.exponential(1,K)
|
||||
b_cause *= beta_scale / b_cause.max()
|
||||
|
||||
# CONFOUNDERS
|
||||
b_confound = np.random.exponential(1,S)
|
||||
b_confound *= confounders_scale / b_confound.max()
|
||||
|
||||
beta_control = np.matrix(np.concatenate( ( b_cause ,b_confound, np.zeros(R)) ) ).T
|
||||
beta_treated = np.matrix(np.concatenate( ( b_cause ,np.zeros(S), np.zeros(R)) ) ).T
|
||||
|
||||
# GROUP EFFECTS (hidden)
|
||||
|
||||
Y_pre_group_effects = np.random.normal(0,group_scale,(groups,T0))
|
||||
Y_pre_ge_control = Y_pre_group_effects[np.repeat(np.arange(groups),C)]
|
||||
Y_pre_ge_treated = Y_pre_group_effects[np.repeat(np.arange(groups),N)]
|
||||
|
||||
Y_post_group_effects = np.random.normal(0,group_scale,(groups,T1))
|
||||
Y_post_ge_control = Y_post_group_effects[np.repeat(np.arange(groups),C)]
|
||||
Y_post_ge_treated = Y_post_group_effects[np.repeat(np.arange(groups),N)]
|
||||
|
||||
# RANDOM ERRORS
|
||||
Y_pre_err_control = np.matrix(np.random.random( ( C*groups, T0, ) ))
|
||||
Y_pre_err_treated = np.matrix(np.random.random( ( N*groups, T0, ) ))
|
||||
|
||||
Y_post_err_control = np.matrix(np.random.random( ( C*groups, T1, ) ))
|
||||
Y_post_err_treated = np.matrix(np.random.random( ( N*groups, T1, ) ))
|
||||
|
||||
# THE DATA GENERATING PROCESS
|
||||
model = "full"
|
||||
|
||||
if model == "full":
|
||||
""" In the full model, covariates (X) are correlated with pre and post
|
||||
outcomes, and variance of the outcomes pre- and post- outcomes is
|
||||
lower within groups which span both treated and control units.
|
||||
"""
|
||||
Y_pre_control = X_control.dot(beta_control) + Y_pre_ge_control + Y_pre_err_control
|
||||
Y_pre_treated = X_treated.dot(beta_treated) + Y_pre_ge_treated + Y_pre_err_treated
|
||||
|
||||
Y_post_control = X_control.dot(beta_control) + Y_post_ge_control + Y_post_err_control
|
||||
Y_post_treated = X_treated.dot(beta_treated) + Y_post_ge_treated + Y_post_err_treated
|
||||
|
||||
elif model == "hidden":
|
||||
""" In the hidden model outcomes are independent of the covariates, but
|
||||
variance of the outcomes pre- and post- outcomes is lower within
|
||||
groups which span both treated and control units.
|
||||
"""
|
||||
Y_pre_control = Y_pre_ge_control + Y_pre_err_control
|
||||
Y_pre_treated = Y_pre_ge_treated + Y_pre_err_treated
|
||||
|
||||
Y_post_control = Y_post_ge_control + Y_post_err_control
|
||||
Y_post_treated = Y_post_ge_treated + Y_post_err_treated
|
||||
|
||||
elif model == "null":
|
||||
"Purely random data"
|
||||
Y_pre_control = Y_pre_err_control
|
||||
Y_pre_treated = Y_pre_err_treated
|
||||
|
||||
Y_post_control = Y_post_err_control
|
||||
Y_post_treated = Y_post_err_treated
|
||||
|
||||
else:
|
||||
raise ValueError("Unknown model type: "+model)
|
||||
|
||||
# JOIN THE TREAT AND CONTROL DATA
|
||||
X = np.vstack( (X_control, X_treated,) )
|
||||
Y_pre = np.vstack( (Y_pre_control, Y_pre_treated, ) )
|
||||
Y_post = np.vstack( (Y_post_control, Y_post_treated,) )
|
||||
|
||||
# in the leave-one-out scneario, the pre-treatment outcomes will be part of the covariates
|
||||
X_and_Y_pre = np.hstack( ( X, Y_pre,) )
|
||||
|
||||
# IDENTIFIERS FOR TREAT AND CONTROL UNITS
|
||||
# control_units = np.arange( C * groups )
|
||||
# treated_units = np.arange( N * groups ) + C
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# find default penalties
|
||||
# ------------------------------------------------------------
|
||||
|
||||
# get starting point for the L2 penalty
|
||||
L2_pen_start_ct = SC.L2_pen_guestimate(X_control)
|
||||
L2_pen_start_loo = SC.L2_pen_guestimate(X_and_Y_pre)
|
||||
|
||||
# get the maximum value for the L1 Penalty parameter conditional on the guestimate for the L2 penalty
|
||||
L1_max_ct = SC.get_max_lambda(X_control,Y_pre_control,X_treat=X_treated,Y_treat=Y_pre_treated)
|
||||
L1_max_loo = SC.get_max_lambda(X_and_Y_pre,Y_post)
|
||||
print("Max L1")
|
||||
|
||||
if False:
|
||||
"Demonstrate relations between the L1 and L2 penalties"
|
||||
|
||||
# get the maximum value for the L1 Penalty parameter conditional on several L2 penalty parameter values
|
||||
L2_grid = (2.** np.arange(-1,2))
|
||||
|
||||
L1_max_loo_grid = SC.get_max_lambda(X_and_Y_pre,
|
||||
Y_post,
|
||||
L2_PEN_W = L2_pen_start_loo * L2_grid)
|
||||
L1_max_ct_grid = SC.get_max_lambda(X_control,
|
||||
Y_pre_control,
|
||||
X_treat=X_treated,
|
||||
Y_treat=Y_pre_treated,
|
||||
L2_PEN_W = L2_pen_start_ct * L2_grid)
|
||||
assert ( L1_max_loo / L1_max_loo_grid == 1/L2_grid).all()
|
||||
assert ( L1_max_ct / L1_max_ct_grid == 1/L2_grid).all()
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# create a grid of penalties to try
|
||||
# ------------------------------------------------------------
|
||||
n_points = 10 # number of points in the grid
|
||||
|
||||
grid_type = "log-linear" # Usually the optimal penalties are quite Small
|
||||
|
||||
if grid_type == "simple":
|
||||
# An equally spaced linear grid that does not include 0 or 1
|
||||
grid = ( 1 + np.arange(n_points) ) / ( 1 + n_points )
|
||||
elif grid_type == "linear":
|
||||
# Another equally spaced linear grid
|
||||
fmax = 1e-3 # lowest point in the grid relative to the max-lambda
|
||||
fmin = 1e-5 # highest point in the grid relative to the max-lambda
|
||||
grid = np.linspace(fmin,fmax,n_points)
|
||||
elif grid_type == "log-linear":
|
||||
# Another equally spaced linear grid
|
||||
fmax = 1e-2 # lowest point in the grid relative to the max-lambda
|
||||
fmin = 1e-4 # highest point in the grid relative to the max-lambda
|
||||
grid = np.exp(np.linspace(np.log(fmin),np.log(fmax),n_points))
|
||||
else:
|
||||
raise ValueError("Unknown grid type: %s" % grid_type)
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# get the Cross Validation Error over a grid of L1 penalty
|
||||
# parameters using both (a) Treat/Control and (b) the
|
||||
# leave-one-out controls only methods
|
||||
# ------------------------------------------------------------
|
||||
|
||||
if False:
|
||||
print("starting grid scoring for treat / control scenario", grid*L1_max_ct)
|
||||
grid_scores_ct = SC.CV_score(
|
||||
X = X_control,
|
||||
Y = Y_pre_control,
|
||||
|
||||
X_treat = X_treated,
|
||||
Y_treat = Y_pre_treated,
|
||||
|
||||
# if LAMBDA is a single value, we get a single score, If it's an array of values, we get an array of scores.
|
||||
LAMBDA = grid * L1_max_ct,
|
||||
L2_PEN_W = L2_pen_start_ct,
|
||||
|
||||
# CACHE THE V MATRIX BETWEEN LAMBDA PARAMETERS (generally faster, but path dependent)
|
||||
cache = False, # False by Default
|
||||
|
||||
# Run each of the Cross-validation folds in parallel? Often slower
|
||||
# for large sample sizes because numpy.linalg.solve() already runs
|
||||
# in parallel for large matrices
|
||||
parallel=False,
|
||||
|
||||
# ANNOUNCE COMPLETION OF EACH ITERATION
|
||||
progress = True)
|
||||
|
||||
best_L1_penalty_ct = (grid * L1_max_ct)[np.argmin(grid_scores_ct)]
|
||||
|
||||
if False:
|
||||
# even with smaller data, this takes a while.
|
||||
print("starting grid scoring for Controls Only scenario", grid*L1_max_ct)
|
||||
grid_scores_loo = SC.CV_score(
|
||||
X = X_and_Y_pre [np.arange(100)], # limit the amount of time...
|
||||
Y = Y_post [np.arange(100)], # limit the amount of time...
|
||||
|
||||
# L1 Penalty. if LAMBDA is a single value (value), we get a single score, If it's an array of values, we get an array of scores.
|
||||
LAMBDA = grid * L1_max_loo,
|
||||
|
||||
# L2 Penalty (float)
|
||||
L2_PEN_W = L2_pen_start_loo,
|
||||
|
||||
# CACHE THE V MATRIX BETWEEN LAMBDA PARAMETERS (generally faster, but path dependent)
|
||||
#cache = True, # False by Default
|
||||
|
||||
# Run each of the Cross-validation folds in parallel? Often slower
|
||||
# for large sample sizes because numpy.linalg.solve() already runs
|
||||
# in parallel for large matrices
|
||||
parallel=False,
|
||||
|
||||
# ANNOUNCE COMPLETION OF EACH ITERATION
|
||||
progress = True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Calculate Synthetic Control weights for a fixed pair of penalty parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# This is a two-step process because in principle, we can estimate a
|
||||
# tensor matrix (which contains the relative weights of the covariates and
|
||||
# possibly pre-treatment outcomes) in one population, and apply it to
|
||||
# another population.
|
||||
best_L1_penalty_ct = np.float(1908.9329)
|
||||
|
||||
# -----------------------------------
|
||||
# Treat/Control:
|
||||
# -----------------------------------
|
||||
|
||||
V_ct = SC.tensor(X = X_control,
|
||||
Y = Y_pre_control,
|
||||
X_treat = X_treated,
|
||||
Y_treat = Y_pre_treated,
|
||||
LAMBDA = best_L1_penalty_ct,
|
||||
L2_PEN_W = L2_pen_start_ct)
|
||||
|
||||
SC_weights_ct = SC.weights(X = X_control,
|
||||
X_treat = X_treated,
|
||||
V = V_ct,
|
||||
L2_PEN_W = L2_pen_start_ct)
|
||||
|
||||
Y_post_treated_synthetic_conrols_ct = SC_weights_ct.dot(Y_post_control)
|
||||
ct_prediction_error = Y_post_treated_synthetic_conrols_ct - Y_post_treated
|
||||
null_model_error = Y_post_treated - np.mean(Y_pre_treated)
|
||||
R_squared_post_ct = 1 - np.power(ct_prediction_error,2).sum() / np.power(null_model_error,2).sum()
|
||||
|
||||
print( "C/T: Out of Sample post intervention R squared: %0.2f%% " % (100*R_squared_post_ct,))
|
||||
|
||||
|
||||
# -----------------------------------
|
||||
# Leave-One-Out with Control Only:
|
||||
# -----------------------------------
|
||||
|
||||
if False:
|
||||
# this takes a while:
|
||||
V_loo = SC.tensor(
|
||||
X = X_and_Y_pre [np.arange(100)], # limit the amount of time...
|
||||
Y = Y_post [np.arange(100)], # limit the amount of time...
|
||||
LAMBDA = best_L1_penalty_ct,
|
||||
L2_PEN_W = L2_pen_start_loo)
|
||||
|
||||
SC_weights_loo = SC.weights(X = X_control,
|
||||
V = V_ct,
|
||||
L2_PEN_W = L2_pen_start_loo)
|
||||
|
||||
# in progress...
|
||||
import pdb; pdb.set_trace()
|
||||
|
||||
Y_post_treated_synthetic_conrols_loo = SC_weights_ct.dot(Y_post_control)
|
||||
loo_prediction_error = Y_post_treated_synthetic_conrols_loo - Y_post_treated
|
||||
null_model_error = Y_post_treated - np.mean(Y_pre_treated)
|
||||
R_squared_post_ct = 1 - np.power(loo_prediction_error,2).sum() / np.power(null_model_error,2).sum()
|
||||
|
||||
print( "LOO: Out of Sample post intervention R squared: %0.2f%% " % (100*R_squared_post_ct,))
|
||||
|
||||
import pdb; pdb.set_trace()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optimization of the L1 and L2 parameters together (second order)
|
||||
# ---------------------------------------------------------------------------
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from scipy.optimize import fmin_l_bfgs_b, differential_evolution
|
||||
import time
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Optimization of the L2 and L1 Penalties Simultaneously, keeping their product constant
|
||||
# -----------------------------------------------------------------
|
||||
|
||||
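    # Editorial note (inferred from the objective below): the single search
    # variable x[0] moves the two penalties in opposite directions -- LAMBDA is
    # multiplied by exp(x[0]) while L2_PEN_W is divided by exp(x[0]) -- so their
    # product stays constant and differential evolution searches along that
    # one-dimensional direction in (L1, L2) space.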
    # build the objective function to be minimized

    # cache for L2_obj_func
    n_calls = [0,]
    temp_results = []
    SS = np.power(Y_pre_treated - np.mean(Y_pre_treated), 2).sum()
    best_L1_penalty_ct = float(1908.9329)

    def L1_L2_obj_func(x):
        n_calls[0] += 1
        t1 = time.time()
        score = SC.CV_score(X = X_control,
                            Y = Y_pre_control,
                            X_treat = X_treated,
                            Y_treat = Y_pre_treated,
                            # if LAMBDA is a single value, we get a single score; if it's an array of values, we get an array of scores.
                            LAMBDA = best_L1_penalty_ct * np.exp(x[0]),
                            L2_PEN_W = L2_pen_start_ct / np.exp(x[0]),
                            # suppress the analysis type message
                            quiet = True)
        t2 = time.time()
        temp_results.append((n_calls[0], x, score))
        print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-Squared: %s" % (n_calls[0], t2 - t1, x[0], score, 1 - score / SS))
        #print("calls: %s, time: %0.4f, x0: %0.4f, x1: %0.4f, Cross Validation Error: %s, R-Squared: %s" % (n_calls[0], t2 - t1, x[0], x[1], score, 1 - score / SS))
        return score

    # the actual optimization
    print("Starting joint L1/L2 penalty optimization (constant product)")

    results = differential_evolution(L1_L2_obj_func, bounds = ((-6, 6,),))
    #results = differential_evolution(L1_L2_obj_func, bounds = ((-6,6,),)*2 )

    import pdb; pdb.set_trace()
    NEW_best_L1_penalty_ct = best_L1_penalty_ct * np.exp(results.x[0])
    best_L2_penalty = L2_pen_start_ct / np.exp(results.x[0]) # results.x has a single element here; the original indexed x[1]

    print("DE optimized L1 Penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,))

    # -----------------------------------------------------------------
    # Optimization of the L2 Parameter alone
    # -----------------------------------------------------------------

    # -----------------------
    # build the objective function to be minimized
    # -----------------------

    # OBJECTIVE FUNCTION(S) TO MINIMIZE USING DE

    # cache for L2_obj_func
    n_calls = [0,]
    temp_results = []
    SS = np.power(Y_pre_treated - np.mean(Y_pre_treated), 2).sum()
    best_L1_penalty_ct = float(1908.9329)

    def L2_obj_func(x):
        n_calls[0] += 1
        t1 = time.time()

        score = SC.CV_score(X = X_control,
                            Y = Y_pre_control,
                            X_treat = X_treated,
                            Y_treat = Y_pre_treated,
                            # if LAMBDA is a single value, we get a single score; if it's an array of values, we get an array of scores.
                            LAMBDA = best_L1_penalty_ct,
                            L2_PEN_W = L2_pen_start_ct * np.exp(x[0]),
                            # suppress the analysis type message
                            quiet = True)

        t2 = time.time()
        temp_results.append((n_calls[0], x, score))
        print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-Squared: %s" %
              (n_calls[0], t2 - t1, x[0], score, 1 - score / SS))
        return score

    # the actual optimization
    print("Starting L2 Penalty optimization")

    results = differential_evolution(L2_obj_func, bounds = ((-6, 6,),))
    NEW_L2_pen_start_ct = L2_pen_start_ct * np.exp(results.x[0]) # results.x is one-dimensional here; the original indexed x[1]
    print("DE optimized L2 Penalty: %s, using fixed L1 penalty: %s" % (NEW_L2_pen_start_ct, best_L1_penalty_ct,))

    # -----------------------------------------------------------------
    # Optimization of the L2 and L1 Penalties Simultaneously
    # -----------------------------------------------------------------

    # build the objective function to be minimized

    # cache for L2_obj_func
    n_calls = [0,]
    temp_results = []
    SS = np.power(Y_pre_treated - np.mean(Y_pre_treated), 2).sum()
    best_L1_penalty_ct = float(1908.9329)

    def L1_L2_obj_func(x):
        n_calls[0] += 1
        t1 = time.time()
        score = SC.CV_score(X = X_control,
                            Y = Y_pre_control,
                            X_treat = X_treated,
                            Y_treat = Y_pre_treated,
                            # if LAMBDA is a single value, we get a single score; if it's an array of values, we get an array of scores.
                            LAMBDA = best_L1_penalty_ct * np.exp(x[0]),
                            L2_PEN_W = L2_pen_start_ct * np.exp(x[1]),
                            # suppress the analysis type message
                            quiet = True)
        t2 = time.time()
        temp_results.append((n_calls[0], x, score))
        print("calls: %s, time: %0.4f, x0: %0.4f, x1: %0.4f, Cross Validation Error: %s, out-of-sample R-Squared: %s" % (n_calls[0], t2 - t1, x[0], x[1], score, 1 - score / SS))
        return score

    # the actual optimization
    print("Starting joint L1/L2 penalty optimization")

    results = differential_evolution(L1_L2_obj_func, bounds = ((-6, 6,),)*2)

    import pdb; pdb.set_trace()
    NEW_best_L1_penalty_ct = best_L1_penalty_ct * np.exp(results.x[0])
    best_L2_penalty = L2_pen_start_ct * np.exp(results.x[1])

    print("DE optimized L1 Penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,))
RidgeSC/fit_ct.py
@@ -0,0 +1,175 @@
from numpy import dot, ones, diag, matrix, zeros, array, absolute, mean, var, linalg, prod, shape, sqrt
import numpy as np
import pandas as pd
import itertools
import timeit
import warnings
from .optimizers.cd_line_search import cdl_search
warnings.filterwarnings('ignore')

def ct_v_matrix(X,
                Y,
                LAMBDA = 0,
                treated_units = None,
                control_units = None,
                start = None,
                L2_PEN_W = None,
                method = cdl_search,
                intercept = True,
                max_lambda = False, # this is terrible, at least without documentation...
                **kwargs):
    '''
    Computes and sets the optimal v_matrix for the given moments and
    penalty parameter.

    :param X: Matrix of Covariates
    :param Y: Matrix of Outcomes
    :param LAMBDA: penalty parameter used to shrink the L1 norm of v/v.max() toward zero
    :param treated_units: a list containing the position (rows) of the treated units within X and Y
    :param control_units: a list containing the position (rows) of the control units within X and Y
    :param start: initial values for the diagonals of the tensor matrix
    :param L2_PEN_W: L2 penalty on the magnitude of the deviance of the weight vector from null. Optional.
    :param method: The name of a method to be used by scipy.optimize.minimize, or a callable with the same API as scipy.optimize.minimize
    :param intercept: If True, weights are penalized toward 1 / the number of controls, else weights are penalized toward zero
    :param max_lambda: if True, the return value is the maximum L1 penalty for which at least one element of the tensor matrix is non-zero
    :param **kwargs: additional arguments passed to the optimizer

    '''
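    # Editorial sketch of the system solved below (inferred from _weights and
    # _score; the "# 5"-"# 9" tags appear to reference equations in a
    # derivation that is not included in this commit):
    #
    #   A = X_c (2V) X_c' + 2 * L2_PEN_W * I     # 5
    #   B = (X_t (2V) X_c')'                     # 6
    #   solve A w = B (one ridge system covering all treated units at once)
    #
    # and the penalized objective minimized over diag(V) is
    #   || Y_t - w' Y_c ||_F^2 + LAMBDA * || diag(V) ||_1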
    # DEFAULTS
    if treated_units is None:
        if control_units is None:
            raise ValueError("At least one of treated_units or control_units is required")
        # Set the treated units to the not-control units
        treated_units = list(set(range(X.shape[0])) - set(control_units))
    if control_units is None:
        control_units = list(set(range(X.shape[0])) - set(treated_units))

    # Parameter QC
    if len(set(treated_units).intersection(control_units)):
        raise ValueError("Treated and Control units must be exclusive")
    if not isinstance(X, matrix): raise TypeError("X is not a matrix")
    if not isinstance(Y, matrix): raise TypeError("Y is not a matrix")
    if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0: raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]: raise ValueError("X and Y have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))
    if not isinstance(LAMBDA, (float, int)): raise TypeError("LAMBDA is not a number")
    if L2_PEN_W is None: L2_PEN_W = mean(var(X, axis = 0))
    if not isinstance(L2_PEN_W, (float, int)): raise TypeError("L2_PEN_W is not a number")

    # CONSTANTS
    C, N, K = len(control_units), len(treated_units), X.shape[1]
    if start is None: start = zeros(K) # formerly: .1 * ones(K)
    Y_treated = Y[treated_units,:]
    Y_control = Y[control_units,:]
    X_treated = X[treated_units,:]
    X_control = X[control_units,:]

    if intercept:
        Y_treated -= Y_control.mean(axis=0)

    # INITIALIZE PARTIAL DERIVATIVES
    dA_dV_ki = [ X_control[:, k].dot(X_control[:, k].T) + # i,j are on the diagonal (both equal to k)
                 X_control[:, k].dot(X_control[:, k].T) for k in range(K)] # 8
    dB_dV_ki = [ X_control[:, k].dot(X_treated[:, k].T) + # i,j are on the diagonal (both equal to k)
                 X_control[:, k].dot(X_treated[:, k].T) for k in range(K)] # 9

    def _score(V):
        dv = diag(V)
        weights, _, _, _ = _weights(dv)
        Ey = (Y_treated - weights.T.dot(Y_control)).getA()
        return ((Ey ** 2).sum() + LAMBDA * absolute(V).sum()).copy() # (...).copy() assures that x.flags.writeable is True

    def _grad(V):
        """ Calculates just the diagonal of dGamma0_dV

            There is an implementation that allows for all elements of V to be varied...
        """
        dv = diag(V)
        weights, A, B, AinvB = _weights(dv)
        Ey = (Y_treated - weights.T.dot(Y_control)).getA()
        dGamma0_dV_term2 = zeros(K)
        dPI_dV = zeros((C, N))
        #Ai = A.I
        for k in range(K):
            dPI_dV.fill(0) # faster than re-allocating the memory each loop.
            dA = dA_dV_ki[k]
            dB = dB_dV_ki[k]
            dPI_dV = linalg.solve(A, (dB - dA.dot(AinvB)))
            #dPI_dV = Ai.dot(dB - dA.dot(AinvB))
            dGamma0_dV_term2[k] = (Ey * Y_control.T.dot(dPI_dV).T.getA()).sum()
        return LAMBDA - 2 * dGamma0_dV_term2

    L2_PEN_W_mat = 2 * L2_PEN_W * diag(ones(X_control.shape[0]))
    def _weights(V):
        A = X_control.dot(2*V).dot(X_control.T) + L2_PEN_W_mat # 5
        B = X_treated.dot(2*V).dot(X_control.T).T # 6
        b = linalg.solve(A, B)
        # NOTE: the intercept component (1/C) is added once, after the
        # optimization, because Y_treated was already offset by
        # Y_control.mean() above; the original draft also added it here,
        # which would have counted it twice.
        weights = b
        return weights, A, B, b
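    # Editorial note on max_lambda (inferred from _grad): with the default
    # LAMBDA = 0, grad0[k] = -2*g_k, where g_k is the data-driven descent term
    # for the k-th diagonal entry of V. An entry can move away from zero only
    # while LAMBDA < 2*g_k, so -grad0[grad0 < 0].min() = max_k 2*g_k is the
    # smallest penalty at which the fitted V is identically zero.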
    if max_lambda:
        grad0 = _grad(zeros(K))
        return -grad0[grad0 < 0].min()

    # DO THE OPTIMIZATION
    if isinstance(method, str):
        from scipy.optimize import minimize
        opt = minimize(_score, start.copy(), jac = _grad, method = method, **kwargs)
    else:
        assert callable(method), "Method must be a valid method name for scipy.optimize.minimize or a minimizer"
        opt = method(_score, start.copy(), jac = _grad, **kwargs)
    v_mat = diag(opt.x)

    # CALCULATE weights AND ts_score
    weights, _, _, _ = _weights(v_mat)
    errors = Y_treated - weights.T.dot(Y_control)
    ts_loss = opt.fun
    ts_score = linalg.norm(errors) / sqrt(prod(errors.shape))

    #if True:
    #    _do_gradient_check()
    if intercept:
        # not above, b/c Y_treated was already offset at the start
        weights += 1/C
    return weights, v_mat, ts_score, ts_loss, L2_PEN_W, opt

def ct_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True):
    if treated_units is None:
        if control_units is None:
            raise ValueError("At least one of treated_units or control_units is required")
        # Set the treated units to the not-control units
        treated_units = list(set(range(X.shape[0])) - set(control_units))
    if control_units is None:
        control_units = list(set(range(X.shape[0])) - set(treated_units))

    C, N, K = len(control_units), len(treated_units), X.shape[1]
    X_treated = X[treated_units,:]
    X_control = X[control_units,:]

    A = X_control.dot(2*V).dot(X_control.T) + 2 * L2_PEN_W * diag(ones(X_control.shape[0])) # 5
    B = X_treated.dot(2*V).dot(X_control.T).T # 6

    weights = linalg.solve(A, B)
    if intercept:
        weights += 1/C
    return weights.T

def ct_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units = None, **kwargs):
    if treated_units is None:
        if control_units is None:
            raise ValueError("At least one of treated_units or control_units is required")
        # Set the treated units to the not-control units
        treated_units = list(set(range(X.shape[0])) - set(control_units))
    if control_units is None:
        control_units = list(set(range(X.shape[0])) - set(treated_units))
    weights = ct_weights(X = X, V = V, L2_PEN_W = L2_PEN_W, treated_units = treated_units, control_units = control_units, **kwargs)
    Y_tr = Y[treated_units, :]
    Y_c = Y[control_units, :]
    Ey = (Y_tr - weights.dot(Y_c)).getA()
    return (Ey ** 2).sum() + LAMBDA * V.sum()
RidgeSC/fit_loo.py
@@ -0,0 +1,366 @@
from numpy import dot, ones, diag, matrix, zeros, array, absolute, mean, var, linalg, prod, shape, sqrt
import numpy as np
import pandas as pd
import itertools
import timeit
import warnings
from .utils.sub_matrix_inverse import subinv_k, all_subinverses
from .optimizers.cd_line_search import cdl_search
warnings.filterwarnings('ignore')

def loo_v_matrix(X,
                 Y,
                 LAMBDA = 0,
                 treated_units = None,
                 control_units = None,
                 non_neg_weights = False,
                 start = None,
                 L2_PEN_W = None,
                 method = cdl_search,
                 intercept = True,
                 max_lambda = False, # this is terrible, at least without documentation...
                 solve_method = "standard",
                 **kwargs):
    '''
    Computes and sets the optimal v_matrix for the given moments and
    penalty parameter.

    :param X: Matrix of Covariates
    :param Y: Matrix of Outcomes
    :param LAMBDA: penalty parameter used to shrink the L1 norm of v/v.max() toward zero
    :param treated_units: a list containing the position (rows) of the treated units within X and Y
    :param control_units: a list containing the position (rows) of the control units within X and Y
    :param start: initial values for the diagonals of the tensor matrix
    :param L2_PEN_W: L2 penalty on the magnitude of the deviance of the weight vector from null. Optional.
    :param method: The name of a method to be used by scipy.optimize.minimize, or a callable with the same API as scipy.optimize.minimize
    :param intercept: If True, weights are penalized toward 1 / the number of controls, else weights are penalized toward zero
    :param max_lambda: if True, the return value is the maximum L1 penalty for which at least one element of the tensor matrix is non-zero
    :param solve_method: Method for solving A.I.dot(B). Either "standard" or "step-down".
    :param **kwargs: additional arguments passed to the optimizer

    '''
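    # Editorial note on the leave-one-out structure (inferred from the indexing
    # below): each treated unit's weights are solved against every control
    # except itself, so in_controls[i] drops unit i from its own donor pool.
    # "standard" solves each (C-1)x(C-1) system directly with linalg.solve,
    # while "step-down" inverts the full A once and derives each leave-one-out
    # inverse from it via subinv_k (see utils.sub_matrix_inverse).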
# (by default all the units are treated and all are controls)
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
# Neither provided; INCLUDE ALL SAMPLES AS BOTH TREAT AND CONTROL UNIT.
|
||||
# (this is the typical controls-only Loo V-matrix estimation)
|
||||
control_units = list(range(X.shape[0]))
|
||||
treated_units = control_units
|
||||
else:
|
||||
# Set the treated units to the not-control units
|
||||
treated_units = list(set(range(X.shape[0])) - set(control_units))
|
||||
else:
|
||||
if control_units is None:
|
||||
# Set the control units to the not-treated units
|
||||
control_units = list(set(range(X.shape[0])) - set(treated_units))
|
||||
control_units = np.array(control_units)
|
||||
treated_units = np.array(treated_units)
|
||||
|
||||
# parameter QC
|
||||
if not isinstance(X, matrix): raise TypeError("X is not a matrix")
|
||||
if not isinstance(Y, matrix): raise TypeError("Y is not a matrix")
|
||||
if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")
|
||||
if Y.shape[1] == 0: raise ValueError("Y.shape[1] == 0")
|
||||
if X.shape[0] != Y.shape[0]: raise ValueError("X and Y have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))
|
||||
if not isinstance(LAMBDA, (float, int)): raise TypeError( "LAMBDA is not a number")
|
||||
if L2_PEN_W is None: L2_PEN_W = mean(var(X, axis = 0))
|
||||
if not isinstance(L2_PEN_W, (float, int)): raise TypeError( "L2_PEN_W is not a number")
|
||||
assert not non_neg_weights, "Bounds not implemented"
|
||||
|
||||
# CONSTANTS
|
||||
C, N, K = len(control_units), len(treated_units), X.shape[1]
|
||||
if start is None: start = zeros(K) # formerly: .1 * ones(K)
|
||||
assert N > 0; "No control units"
|
||||
assert C > 0; "No treated units"
|
||||
assert K > 0; "variables to fit (X.shape[1] == 0)"
|
||||
|
||||
# CREATE THE INDEX THAT INDICATES THE ELIGIBLE CONTROLS FOR EACH TREATED UNIT
|
||||
in_controls = [list(set(control_units) - set([trt_unit])) for trt_unit in treated_units]
|
||||
in_controls2 = [np.ix_(i,i) for i in in_controls] # this is a much faster alternative to A[:,index][index,:]
|
||||
ctrl_rng = np.arange(len(control_units))
|
||||
out_controls = [ctrl_rng[control_units != trt_unit] for trt_unit in treated_units]
|
||||
out_treated = [ctrl_rng[control_units == trt_unit] for trt_unit in treated_units] # this is non-trivial when there control units are also being predicted.
|
||||
|
||||
if intercept:
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
Y[trt_unit,:] -= Y[in_controls[i],:].mean(axis=0)
|
||||
|
||||
# handy constants (for speed purposes):
|
||||
Y_treated = Y[treated_units,:]
|
||||
Y_control = Y[control_units,:]
|
||||
X_treated = X[treated_units,:]
|
||||
X_control = X[control_units,:]
|
||||
|
||||
# INITIALIZE PARTIAL DERIVATIVES
|
||||
dA_dV_ki = [ [None,] *N for i in range(K)]
|
||||
dB_dV_ki = [ [None,] *N for i in range(K)]
|
||||
b_i = [None,] *N
|
||||
for i, k in itertools.product(range(N), range(K)): # TREATED unit i, moment k
|
||||
Xc = X[in_controls[i], : ]
|
||||
Xt = X[treated_units[i], : ]
|
||||
dA_dV_ki [k][i] = Xc[:, k ].dot(Xc[:, k ].T) + Xc[:, k ].dot(Xc[:, k ].T) # 8
|
||||
dB_dV_ki [k][i] = Xc[:, k ].dot(Xt[:, k ].T) + Xc[:, k ].dot(Xt[:, k ].T) # 9
|
||||
|
||||
del Xc, Xt, i, k
|
||||
|
||||
#assert (dA_dV_ki [k][i] == X[index, k ].dot(X[index, k ].T) + X[index, k ].dot(X[index, k ].T)).all()
|
||||
# https://math.stackexchange.com/a/1471836/252693
|
||||
|
||||
def _score(V):
|
||||
dv = diag(V)
|
||||
weights, _, _ = _weights(dv)
|
||||
Ey = (Y_treated - weights.T.dot(Y_control)).getA()
|
||||
return ((Ey **2).sum() + LAMBDA * absolute(V).sum()).copy() # (...).copy() assures that x.flags.writeable is True
|
||||
|
||||
def _grad(V):
|
||||
""" Calculates just the diagonal of dGamma0_dV
|
||||
|
||||
There is an implementation that allows for all elements of V to be varied...
|
||||
"""
|
||||
dv = diag(V)
|
||||
weights, A, B = _weights(dv)
|
||||
Ey = (Y_treated - weights.T.dot(Y_control)).getA()
|
||||
dGamma0_dV_term2 = zeros(K)
|
||||
dPI_dV = zeros((C, N))
|
||||
if solve_method == "step-down":
|
||||
Ai_cache = all_subinverses(A)
|
||||
for k in range(K):
|
||||
dPI_dV.fill(0) # faster than re-allocating the memory each loop.
|
||||
for i, index in enumerate(in_controls):
|
||||
dA = dA_dV_ki[k][i]
|
||||
dB = dB_dV_ki[k][i]
|
||||
if solve_method == "step-down":
|
||||
b = Ai_cache[i].dot(dB - dA.dot(b_i[i]))
|
||||
else:
|
||||
b = linalg.solve(A[in_controls2[i]],dB - dA.dot(b_i[i]))
|
||||
dPI_dV[index, i] = b.flatten() # TODO: is the Transpose an error???
|
||||
dGamma0_dV_term2[k] = (Ey * Y_control.T.dot(dPI_dV).T.getA()).sum()
|
||||
return LAMBDA - 2 * dGamma0_dV_term2 # (...).copy() assures that x.flags.writeable is True // * (1 - 2*(V < 0)) was droped b/c we're restricting to positive V's
|
||||
|
||||
L2_PEN_W_mat = 2 * L2_PEN_W * diag(ones(X_control.shape[0]))
|
||||
def _weights(V):
|
||||
weights = zeros((C, N))
|
||||
if solve_method == "step-down":
|
||||
A = X_control.dot(V + V.T).dot(X_control.T) + 2 * L2_PEN_W * diag(ones(X_control.shape[0])) # 5
|
||||
B = X_treated.dot(V + V.T).dot(X_control.T) # 6
|
||||
Ai = A.I
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
if trt_unit in control_units:
|
||||
(b) = subinv_k(Ai,_k).dot(B[out_controls[i],i])
|
||||
else:
|
||||
(b) = Ai.dot(B[:, i])
|
||||
b_i[i] = b
|
||||
weights[out_controls[i], i] = b.flatten()
|
||||
elif solve_method == "standard":
|
||||
A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
|
||||
B = X.dot(V + V.T).dot(X.T).T # 6
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
(b) = b_i[i] = linalg.solve(A[in_controls2[i]], B[in_controls[i], trt_unit])
|
||||
weights[out_controls[i], i] = b.flatten()
|
||||
else:
|
||||
raise ValueError("Unknown Solve Method: " + solve_method)
|
||||
return weights, A, B
|
||||
|
||||
if max_lambda:
|
||||
grad0 = _grad(zeros(K))
|
||||
return -grad0[grad0 < 0].min()
|
||||
|
||||
# DO THE OPTIMIZATION
|
||||
if isinstance(method, str):
|
||||
from scipy.optimize import minimize
|
||||
opt = minimize(_score, start.copy(), jac = _grad, method = method, **kwargs)
|
||||
else:
|
||||
assert callable(method), "Method must be a valid method name for scipy.optimize.minimize or a minimizer"
|
||||
opt = method(_score, start.copy(), jac = _grad, **kwargs)
|
||||
v_mat = diag(opt.x)
|
||||
# CALCULATE weights AND ts_score
|
||||
weights, _, _ = _weights(v_mat)
|
||||
errors = Y_treated - weights.T.dot(Y_control)
|
||||
ts_loss = opt.fun
|
||||
ts_score = linalg.norm(errors) / sqrt(prod(errors.shape))
|
||||
|
||||
#if True:
|
||||
# _do_gradient_check()
|
||||
|
||||
if intercept:
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
weights[out_controls[i], i] += 1/len(out_controls[i])
|
||||
return weights, v_mat, ts_score, ts_loss, L2_PEN_W, opt
|
||||
|
||||
def loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True, solve_method = "standard"):
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
# both not provided, include all samples as both treat and control unit.
|
||||
control_units = list(range(X.shape[0]))
|
||||
treated_units = control_units
|
||||
else:
|
||||
# Set the treated units to the not-control units
|
||||
treated_units = list(set(range(X.shape[0])) - set(control_units))
|
||||
else:
|
||||
if control_units is None:
|
||||
# Set the control units to the not-treated units
|
||||
control_units = list(set(range(X.shape[0])) - set(treated_units))
|
||||
control_units = np.array(control_units)
|
||||
treated_units = np.array(treated_units)
|
||||
[C, N, K] = [len(control_units), len(treated_units), X.shape[1]]
|
||||
|
||||
|
||||
# index with positions of the controls relative to the incoming data
|
||||
in_controls = [list(set(control_units) - set([trt_unit])) for trt_unit in treated_units]
|
||||
in_controls2 = [np.ix_(i,i) for i in in_controls] # this is a much faster alternative to A[:,index][index,:]
|
||||
|
||||
# index of the controls relative to the rows of the outgoing C x N matrix of weights
|
||||
ctrl_rng = np.arange(len(control_units))
|
||||
out_controls = [ctrl_rng[control_units != trt_unit] for trt_unit in treated_units]
|
||||
out_treated = [ctrl_rng[control_units == trt_unit] for trt_unit in treated_units] # this is non-trivial when there control units are also being predicted.
|
||||
|
||||
# constants for indexing
|
||||
X_control = X[control_units,:]
|
||||
X_treat = X[treated_units,:]
|
||||
weights = zeros((C, N))
|
||||
|
||||
if solve_method == "step-down":
|
||||
A = X_control.dot(V + V.T).dot(X_control.T) + 2 * L2_PEN_W * diag(ones(X_control.shape[0])) # 5
|
||||
B = X_treat.dot( V + V.T).dot(X_control.T) # 6
|
||||
Ai = A.I
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
if trt_unit in control_units:
|
||||
(b) = subinv_k(Ai,_k).dot(B[out_controls[i],i])
|
||||
else:
|
||||
(b) = Ai.dot(B[:, i])
|
||||
|
||||
weights[out_controls[i], i] = b.flatten()
|
||||
if intercept:
|
||||
weights[out_controls[i], i] += 1/len(out_controls[i])
|
||||
elif solve_method == "standard":
|
||||
A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
|
||||
B = X.dot(V + V.T).dot(X.T).T # 6
|
||||
for i, trt_unit in enumerate(treated_units):
|
||||
(b) = linalg.solve(A[in_controls2[i]], B[in_controls[i], trt_unit])
|
||||
|
||||
weights[out_controls[i], i] = b.flatten()
|
||||
if intercept:
|
||||
weights[out_controls[i], i] += 1/len(out_controls[i])
|
||||
else:
|
||||
raise ValueError("Unknown Solve Method: " + solve_method)
|
||||
return weights.T
|
||||
|
||||
|
||||
|
||||
def __old__loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True):
|
||||
assert False
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
# both not provided, include all samples as both treat and control unit.
|
||||
control_units = list(range(X.shape[0]))
|
||||
treated_units = control_units
|
||||
else:
|
||||
# Set the treated units to the not-control units
|
||||
treated_units = list(set(range(X.shape[0])) - set(control_units))
|
||||
else:
|
||||
if control_units is None:
|
||||
# Set the control units to the not-treated units
|
||||
control_units = list(set(range(X.shape[0])) - set(treated_units))
|
||||
[C, N, K] = [len(control_units), len(treated_units), X.shape[1]]
|
||||
index_i = [list(set(list(range(200))) - set([col])) for col in treated_units]
|
||||
|
||||
index_i2 = [np.ix_(i,i) for i in index_i] # this is a much faster alternative to A[:,index][index,:]
|
||||
index_map = {c_unit : row_at for row_at, c_unit in enumerate(control_units)}
|
||||
|
||||
weights = zeros((C, N))
|
||||
A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
|
||||
B = X.dot(V + V.T).dot(X.T).T # 6
|
||||
for col, index in enumerate(index_i):
|
||||
b = linalg.solve(A[index_i2[col]], B[index, col])
|
||||
weights[np.vectorize(index_map.get)(index), col] = b.T
|
||||
if intercept:
|
||||
weights[index, col] += 1/len(index)
|
||||
return weights
|
||||
|
||||
|
||||
|
||||
|
||||
def loo_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units = None,**kwargs):
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
# both not provided, include all samples as both treat and control unit.
|
||||
control_units = list(range(X.shape[0]))
|
||||
treated_units = control_units
|
||||
else:
|
||||
# Set the treated units to the not-control units
|
||||
treated_units = list(set(range(X.shape[0])) - set(control_units))
|
||||
else:
|
||||
if control_units is None:
|
||||
# Set the control units to the not-treated units
|
||||
control_units = list(set(range(X.shape[0])) - set(treated_units))
|
||||
weights = loo_weights(X = X,
|
||||
V = V,
|
||||
L2_PEN_W = L2_PEN_W,
|
||||
treated_units = treated_units,
|
||||
control_units = control_units,
|
||||
**kwargs)
|
||||
Y_tr = Y[treated_units, :]
|
||||
Y_c = Y[control_units, :]
|
||||
Ey = (Y_tr - weights.dot(Y_c)).getA()
|
||||
return (Ey **2).sum() + LAMBDA * V.sum()
|
||||
|
||||
|
||||
|
||||


def fold_v_matrix(Xt, Xc, Yt, Yc, LAMBDA = .1, L2_PEN_W = .1, guess = None, method = 'Newton-CG', **kwargs):
    """ Calculates the V matrix with explicit treatment and control folds,
        where the match is optimized on the error of the outcomes.

        Don't use this: it hasn't been thoroughly checked for bugs (compared
        to loo_v_matrix and ct_v_matrix) and lacks several of the
        optimizations of loo_v_matrix.
    """
    C, K = Xc.shape; N, T = Yt.shape
    two_L2_PEN_W = 0 # 2*L2_PEN_W*diag(ones(C))
    def _score(diagV):
        """ calculates the penalized squared prediction error for a given diagonal of V
        """
        V = diag(diagV)
        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
        B = Xc.dot(2*V).dot(Xt.T) # 6
        Pi = A.I.dot(B)
        Ey = (Yt - Pi.T.dot(Yc)).getA()
        return (Ey ** 2).sum() + LAMBDA * absolute(Pi.T).sum()
    def _grad(diagV):
        """ calculates just the diagonal of dGamma0_dV
        """
        V = diag(diagV)
        assert Xc.shape[0] == Yc.shape[0]
        # CALCULATE CONSTANTS
        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
        B = Xc.dot(2*V).dot(Xt.T) # 6
        Ai = A.I
        Pi = Ai.dot(B)
        Ey = (Yt - Pi.T.dot(Yc)).getA()
        # INITIALIZE PARTIAL DERIVATIVES
        dGamma0_dV_term2 = zeros((K, K))
        for i in range(K):
            dA = Xc[:, i].dot(Xc[:, i].T) + Xc[:, i].dot(Xc[:, i].T) # 8
            dB = Xc[:, i].dot(Xt[:, i].T) + Xc[:, i].dot(Xt[:, i].T) # 9
            dPI_dV = Ai.dot(dB - dA.dot(Pi)) # 7
            dGamma0_dV_term2[i, i] = (Ey * Yc.T.dot(dPI_dV).T.getA()).sum()
        return diag(LAMBDA * (1 - 2*(V < 0)) - 2 * dGamma0_dV_term2)
    def _weights(V):
        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
        B = Xc.dot(2*V).dot(Xt.T) # 6
        #for col in treated_unit:
        #w_col = A[-col,-col].I.dot(B[:,col])
        return A.I.dot(B).T
    # DO THE OPTIMIZATION
    opt = minimize(_score, zeros(Xc.shape[1]) if guess is None else guess,
                   jac = _grad,
                   method = method, **kwargs)
    v_mat = diag(opt.x)
    ts_loss = opt.fun
    # CALCULATE weights AND ts_score
    weights = _weights(v_mat)
    errors = Yt - weights.dot(Yc)
    ts_score = linalg.norm(errors) / sqrt(prod(errors.shape))
    return weights, v_mat, ts_score, ts_loss, opt
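
# Usage sketch for fold_v_matrix (hypothetical fold; Xt/Yt are the treated
# fold's covariates/outcomes, Xc/Yc the control fold's). Note the docstring's
# caveat that this is less tested than ct_v_matrix and loo_v_matrix. A
# strictly positive `guess` keeps A invertible at the starting point, since
# the L2 term above is disabled (two_L2_PEN_W = 0).
#
#   import numpy as np
#   Xc = np.matrix(np.random.random((4, 6)))
#   Xt = np.matrix(np.random.random((2, 6)))
#   Yc = np.matrix(np.random.random((4, 3)))
#   Yt = np.matrix(np.random.random((2, 3)))
#   weights, v_mat, ts_score, ts_loss, opt = fold_v_matrix(
#       Xt, Xc, Yt, Yc, LAMBDA = .1, guess = np.ones(6))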
@ -0,0 +1,79 @@
from fit_loo import loo_v_matrix
from fit_ct import ct_v_matrix
from optimizers.cd_line_search import cdl_search
import numpy as np


def L2_pen_guestimate(X):
    return np.mean(np.var(X, axis = 0))
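
# Usage sketch: the guesstimate is just the mean per-covariate variance,
# a reasonable scale for the ridge penalty on the weights.
#
#   import numpy as np
#   X = np.matrix(np.random.random((20, 5)))
#   l2_pen = L2_pen_guestimate(X)   # == np.mean(np.var(X, axis=0))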


def get_max_lambda(X, Y, L2_PEN_W = None, X_treat = None, Y_treat = None, **kwargs):
    """ returns the maximum value of the L1 penalty for which the elements of the tensor matrix (V) are not all zero.
    """

    # PARAMETER QC
    if not isinstance(X, np.matrix): raise TypeError("X is not a matrix")
    if not isinstance(Y, np.matrix): raise TypeError("Y is not a matrix")
    if (X_treat is None) != (Y_treat is None):
        raise ValueError("parameters `X_treat` and `Y_treat` must both be Matrices or None")
    if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0: raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]: raise ValueError("X and Y have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))
    if L2_PEN_W is None: L2_PEN_W = np.mean(np.var(X, axis = 0))

    if X_treat is not None:

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix): raise TypeError("X_treat is not a matrix")
        if not isinstance(Y_treat, np.matrix): raise TypeError("Y_treat is not a matrix")
        if X_treat.shape[1] == 0: raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0: raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError("X_treat and Y_treat have different number of rows (%s and %s)" % (X_treat.shape[0], Y_treat.shape[0],))

        control_units = np.arange(X.shape[0])
        treated_units = np.arange(X.shape[0], X.shape[0] + X_treat.shape[0])

        try:
            iter(L2_PEN_W)
        except TypeError:
            # L2_PEN_W is a single value
            return ct_v_matrix(X = np.vstack((X, X_treat)),
                               Y = np.vstack((Y, Y_treat)),
                               L2_PEN_W = L2_PEN_W,
                               control_units = control_units,
                               treated_units = treated_units,
                               max_lambda = True, # this is terrible at least without documentation...
                               **kwargs)
        else:
            # L2_PEN_W is an iterable of values
            return [ ct_v_matrix(X = np.vstack((X, X_treat)),
                                 Y = np.vstack((Y, Y_treat)),
                                 control_units = control_units,
                                 treated_units = treated_units,
                                 max_lambda = True, # this is terrible at least without documentation...
                                 L2_PEN_W = l2_pen,
                                 **kwargs)
                     for l2_pen in L2_PEN_W ]

    else:

        try:
            iter(L2_PEN_W)
        except TypeError:
            # L2_PEN_W is a single value
            return loo_v_matrix(X = X,
                                Y = Y,
                                L2_PEN_W = L2_PEN_W,
                                max_lambda = True, # this is terrible at least without documentation...
                                **kwargs)
        else:
            # L2_PEN_W is an iterable of values
            return [ loo_v_matrix(X = X,
                                  Y = Y,
                                  L2_PEN_W = l2_pen,
                                  max_lambda = True, # this is terrible at least without documentation...
                                  **kwargs)
                     for l2_pen in L2_PEN_W ]
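
# Usage sketch (hypothetical grid): a common pattern is to find the largest
# useful L1 penalty and then cross-validate over a log-spaced grid below it.
#
#   import numpy as np
#   X = np.matrix(np.random.random((20, 5)))
#   Y = np.matrix(np.random.random((20, 8)))
#   max_lam = get_max_lambda(X, Y)
#   grid = max_lam * np.logspace(-3, 0, num = 10)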
@ -0,0 +1,150 @@
import numpy as np
from scipy.optimize import line_search

import locale
locale.setlocale(locale.LC_ALL, '')

class cd_res:
    def __init__(self, x, fun):
        self.x = x
        self.fun = fun

print_stop_iteration = 0

def cdl_search(score,
               guess,
               jac,
               tol = 1e-4,
               aggressiveness = 0.01,
               alpha_mult = .9,
               max_iter = 3000,
               min_iter = 3,
               # TODO: this is a stupid default (I'm using it out of laziness)
               zero_eps = 1e2 * np.finfo(float).eps,
               simplex = False,
               stol = None,
               print_path = False,
               print_path_verbose = False):
    '''
    Implements coordinate descent with a line search satisfying the strong
    Wolfe conditions. Note, this tends to give nearly identical results to
    L-BFGS-B, and is *much* slower than the super-fast 40-year-old Fortran
    code wrapped by scipy.
    '''
    assert 0 < aggressiveness < 1
    assert 0 < alpha_mult < 1
    assert (guess >= 0).all(), "Initial guess (`guess`) should be in the closed positive orthant"
    x_curr = guess
    alpha_t = 1.
    zero_score = val = score(x_curr)

    for _i in range(max_iter):

        grad = jac(x_curr)
        # directions that would step below zero from a coordinate that is already zero
        invalid_directions = np.logical_and(grad > 0, x_curr == 0)

        if (grad[np.logical_not(invalid_directions)] == 0).all():
            # this happens when we're stuck at the origin and the gradient is
            # pointing in the all-negative direction
            if print_stop_iteration: print("[gradient is zero] i: %s" % (_i,))
            return cd_res(x_curr, val)

        # Expected Loss Reduction for a unit step in each of the valid directions
        elr = sum(abs(g) for g in grad[np.logical_not(invalid_directions)])
        naive_step_size_multiplier = aggressiveness * val / elr
        grad *= naive_step_size_multiplier * alpha_t

        # constrain to the positive orthant: zero the gradient on axes where
        # the current guess is already zero and the step would go negative
        grad[invalid_directions] = 0

        if (grad > 0).any():
            constrained = True
            alpha_ratios = grad[grad > 0] / x_curr[grad > 0]
            if (alpha_ratios > 1).any():
                max_alpha = alpha_ratios.max()
            else:
                max_alpha = 1
        else:
            constrained = False
            max_alpha = 1

        res = line_search(f = score, myfprime = jac, xk = x_curr, pk = -grad/max_alpha)
        alpha, _, _, possibly_wrong_new_val, _, _ = res
        if alpha is not None:
            if alpha > 1:
                alpha_t /= alpha_mult
            else:
                alpha_t *= alpha_mult
        elif constrained:
            for j in range(5): # formerly range(17), but that was excessive; in general this succeeds when alpha >= 0.1 (super helpful) or alpha <= 1e-14 (super useless)
                if score(x_curr - (.3**j)*grad/max_alpha) < val:
                    # This can occur when the strong Wolfe condition insists that the
                    # current step size is too small (i.e. the gradient is too
                    # consistent with the function to think that a small step is
                    # optimal for a global (unconstrained) optimization.
                    alpha = (.3**j)

                    # i secretly think this is stupid.
                    if print_stop_iteration: print("[simple line search worked :)] i: %s, alpha: 0.3**%s" % (_i, j))
                    break
            else:
                # moving in the direction of the gradient yielded no improvement: stop
                if print_stop_iteration: print("[simple line search failed] i: %s" % (_i,))
                return cd_res(x_curr, val)
        else:
            # moving in the direction of the gradient yielded no improvement: stop
            if print_stop_iteration: print("[alpha is None] i: %s, grad: %s" % (_i, grad/max_alpha,))
            return cd_res(x_curr, val)

        # iterate
        try:
            if constrained:
                x_old, x_curr, val_old = x_curr, x_curr - min(1, alpha)*grad/max_alpha, val
            else:
                x_old, x_curr, val_old = x_curr, x_curr - alpha*grad/max_alpha, val
        except:
            import pdb; pdb.set_trace()

        val = score(x_curr)
        val_diff = val_old - val

        if print_path:
            try:
                print("i: %s, val: %s, val_diff: %0.2f, alpha: %0.5f, zeros: %s (constrained)" % (_i, locale.format("%d", val, grouping=True), val_diff, alpha, sum(x_curr == 0)))
                if print_path_verbose:
                    print("grad: %s, x_curr %s" % (grad, x_curr,))
            except Exception as err:
                print(err)
                import pdb; pdb.set_trace()

        # rounding error can get us really close to, or even across, the coordinate plane.
        x_curr[abs(x_curr) < zero_eps] = 0
        x_curr[x_curr < zero_eps] = 0

        if (x_curr == 0).all() and (x_old == 0).all():
            # this happens when we were at the origin and the gradient didn't
            # take us out of the range of zero_eps
            if print_stop_iteration: print("stuck at the origin %s" % (_i,))
            return cd_res(x_curr, score(x_curr)) # tricky tricky...

        if (x_curr < 0).any():
            import pdb; pdb.set_trace()
            print("This shouldn't ever happen if max_alpha is specified properly")

        if val_diff/val < tol:
            # this is a heuristic rule, to be sure, but seems to be useful.
            if _i > min_iter:
                if print_stop_iteration: print("[val_diff/val < tol] i: %s, val: %s, val_diff: %s" % (_i, val, val_diff,))
                return cd_res(x_curr, val)
    else:
        raise RuntimeError('Solution did not converge to default tolerance')
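
# Usage sketch (hypothetical objective): minimize a simple quadratic over the
# positive orthant. cdl_search only needs a score function, a starting point
# in the closed positive orthant, and a gradient.
#
#   import numpy as np
#   target = np.array([1.0, 0.0, 2.0])
#   score = lambda x: ((x - target)**2).sum()
#   jac = lambda x: 2*(x - target)
#   res = cdl_search(score, np.zeros(3), jac)
#   # res.x is approximately `target`; res.fun is the attained score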
@ -0,0 +1,48 @@
from fit_loo import loo_v_matrix
from fit_ct import ct_v_matrix
import numpy as np


def tensor(X, Y, X_treat = None, Y_treat = None, **kwargs):
    """ Presents a unified api for ct_v_matrix and loo_v_matrix
    """
    # PARAMETER QC
    if not isinstance(X, np.matrix): raise TypeError("X is not a matrix")
    if not isinstance(Y, np.matrix): raise TypeError("Y is not a matrix")
    if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0: raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]: raise ValueError("X and Y have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))

    if (X_treat is None) != (Y_treat is None):
        raise ValueError("parameters `X_treat` and `Y_treat` must both be Matrices or None")

    if X_treat is not None:
        # Fit the treated units to the control units, assuming that Y contains pre-intervention outcomes

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix): raise TypeError("X_treat is not a matrix")
        if not isinstance(Y_treat, np.matrix): raise TypeError("Y_treat is not a matrix")
        if X_treat.shape[1] == 0: raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0: raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]: raise ValueError("X_treat and Y_treat have different number of rows (%s and %s)" % (X_treat.shape[0], Y_treat.shape[0],))

        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
        # note that the weights, score, and loss function value returned here are for the in-sample predictions
        weights, v_mat, ts_score, loss, l2_pen_w, _ = \
            ct_v_matrix(X = np.vstack((X, X_treat)),
                        Y = np.vstack((Y, Y_treat)),
                        control_units = np.arange(X.shape[0]),
                        treated_units = np.arange(X_treat.shape[0]) + X.shape[0],
                        **kwargs)

    else:
        # Fit the control units to themselves; Y may contain post-intervention outcomes

        weights, v_mat, ts_score, loss, l2_pen_w, _ = \
            loo_v_matrix(X = X,
                         Y = Y,
                         control_units = np.arange(X.shape[0]),
                         treated_units = np.arange(X.shape[0]),
                         # treated_units = [X.shape[0] + i for i in range(len(train))],
                         **kwargs)
    return v_mat
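
# Usage sketch (hypothetical shapes and penalty kwarg): fit V on the control
# units only, then inspect the implied covariate weights on its diagonal.
#
#   import numpy as np
#   X = np.matrix(np.random.random((20, 5)))
#   Y = np.matrix(np.random.random((20, 8)))
#   v_mat = tensor(X, Y, LAMBDA = 0.1)
#   # np.diag(v_mat) gives the fitted importance of each covariate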
@ -0,0 +1,168 @@
""" In the leave-one-out method with larger sample sizes most of the time is spent in calculating
|
||||
A.I.dot(B) using np.linalg.solve (which is much faster than the brute force method i.e. `A.I.dot(B)`).
|
||||
For example, with 200 units, about about 95% of the time is spent in this line.
|
||||
|
||||
However we can take advantage of the fact that we're calculating the
|
||||
inverse of N matrices which are all sub-matrices of a common matrix by inverting the common matrix
|
||||
and calculating each subset inverse from there.
|
||||
|
||||
https://math.stackexchange.com/a/208021/252693
|
||||
|
||||
|
||||
TODO: this could be made a bit faster by passing in the indexes (k_rng,
|
||||
k_rng2) instead of re-building them
|
||||
"""
import numpy as np

#-- n = 5
#-- p = 3
#-- a = np.matrix(np.random.random((n,p,)))
#-- v = np.diag(np.random.random((p,)))
#-- x = a.dot(v).dot(a.T)
#-- x.dot(x.I)
#--
#-- x = np.matrix(np.random.random((n,n,)))
#-- x.dot(x.I)
#--
#-- xi = x.I
#--
#-- B = np.matrix(np.random.random((n,n,)))
#--
#-- k = np.arange(2)
#-- N = xi.shape[0]
#-- rng = np.arange(N)
#-- k_rng = rng[np.logical_not(np.isin(rng,k))]
#--
#-- out = xi[np.ix_(k_rng,k_rng)] - xi[np.ix_(k_rng,k)].dot(xi[np.ix_(k,k_rng)])/np.linalg.det(xi[np.ix_(k,k)])
#--
#-- brute = x[np.ix_(k_rng,k_rng)].I

def all_subinverses(x, eps = None):
    """ Given a matrix (x), calculate the inverses of all of its leave-one-out sub-matrices.
        param: x a square matrix for which to find the inverses of all its leave-one-out sub-matrices.
        param: eps If not None, used to assert that each calculated
            sub-matrix-inverse is within eps of the brute force calculation.
            Testing only; this slows the process way down since the inverse of
            each sub-matrix is calculated by the brute force method. Typically
            set to a multiple of `np.finfo(float).eps`
    """
    xi = x.I
    N = x.shape[0]
    rng = np.arange(N) # handy constant for indexing
    out = [None,] * N
    for k in range(N):
        k_rng = rng[rng != k]
        out[k] = xi[np.ix_(k_rng, k_rng)] - xi[k_rng, k].dot(xi[k, k_rng]) / xi[k, k]
        if eps is not None:
            brute = x[np.ix_(k_rng, k_rng)].I
            if not (abs(out[k] - brute) < eps).all():
                raise RuntimeError("Fast and brute force methods were not within epsilon (%s) for sub-matrix k = %s; max difference = %s" % (eps, k, abs(out[k] - brute).max(),))
    return out


def subinv_k(xi, k, eps = None):
    """ Given the inverse (xi) of a matrix (x), calculate the inverse of the
        leave-one-out sub-matrix of x with row and column k removed.
        param: xi the pre-computed inverse of x
        param: k the column and row to leave out
        param: eps If not None, used to assert that the calculated
            sub-matrix-inverse is within eps of the brute force calculation.
            Testing only; this slows the process way down since the inverse of
            the sub-matrix is calculated by the brute force method. Typically
            set to a multiple of `np.finfo(float).eps`
    """
    N = xi.shape[0]
    rng = np.arange(N) # handy constant for indexing
    k_rng = rng[rng != k]
    out = xi[np.ix_(k_rng, k_rng)] - xi[k_rng, k].dot(xi[k, k_rng]) / xi[k, k]
    if eps is not None:
        brute = xi.I[np.ix_(k_rng, k_rng)].I # recover x = xi.I for the brute force check
        if not (abs(out - brute) < eps).all():
            raise RuntimeError("Fast and brute force methods were not within epsilon (%s) for sub-matrix k = %s; max difference = %s" % (eps, k, abs(out - brute).max(),))
    return out
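
# Quick numerical check (a sketch): the fast leave-one-out inverse should
# match the brute-force inverse of the sub-matrix up to rounding error.
#
#   import numpy as np
#   x = np.matrix(np.random.random((6, 6)))
#   xi = x.I
#   k = 2
#   k_rng = np.arange(6)[np.arange(6) != k]
#   fast = subinv_k(xi, k)
#   brute = x[np.ix_(k_rng, k_rng)].I
#   assert (abs(fast - brute) < 1e-8).all()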


# ---------------------------------------------
# single sub-matrix
# ---------------------------------------------

if __name__ == "__main__":

    import time

    n = 200
    eps = 10e-10
    B = np.matrix(np.random.random((n, n,)))

    for i in range(100):
        # create a matrix that meets the matching criteria
        x = np.matrix(np.random.random((n, n,)))
        try:
            zz = all_subinverses(x, eps)
            break
        except RuntimeError:
            pass
    else:
        print("Failed to generate a %sx%s matrix whose leave-one-out inverses are all within %s of the quick method" % (n, n, eps,))

    k = 5
    n_tests = 1000

    # =======================
    t0 = time.time()
    for i in range(n_tests):
        N = x.shape[0]
        rng = np.arange(N)
        k_rng = rng[rng != k]
        k_rng2 = np.ix_(k_rng, k_rng)
        zz = x[k_rng2].I.dot(B[k_rng2])

    t1 = time.time()
    slow_time = t1 - t0
    print("A.I.dot(B): brute force time (N = %s): %s" % (n, t1 - t0))

    # =======================
    t0 = time.time()
    for i in range(n_tests):
        # make the comparison fair: periodically recompute the full inverse
        if i % n == 0:
            xi = x.I
        zz = subinv_k(xi, k).dot(B[k_rng2])

    t1 = time.time()
    fast_time = t1 - t0
    print("A.I.dot(B): quick time (N = %s): %s" % (n, t1 - t0))

    # =======================
    t0 = time.time()
    for i in range(n_tests):
        N = x.shape[0]
        rng = np.arange(N)
        k_rng = rng[rng != k]
        k_rng2 = np.ix_(k_rng, k_rng)
        zz = np.linalg.solve(x[k_rng2], B[k_rng2])

    t1 = time.time()
    solve_time = t1 - t0
    print("A.I.dot(B): np.linalg.solve time (N = %s): %s" % (n, t1 - t0))

    # ---------------------------------------------
    # ---------------------------------------------

    t0 = time.time()
    for i in range(100): zz = all_subinverses(x, eps)

    t1 = time.time()
    slow_time = t1 - t0
    print("Full set of inverses: brute force time (N = %s): %s" % (n, t1 - t0))

    t0 = time.time()
    for i in range(100): zz = all_subinverses(x)

    t1 = time.time()
    fast_time = t1 - t0
    print("Full set of inverses: quick time (N = %s): %s" % (n, t1 - t0))

    # ---------------------------------------------
    # ---------------------------------------------
@ -0,0 +1,35 @@
from fit_loo import loo_weights
from fit_ct import ct_weights
import numpy as np


def weights(X, X_treat = None, **kwargs):
    """ Calculate synthetic control weights
    """

    # PARAMETER QC
    if not isinstance(X, np.matrix): raise TypeError("X is not a matrix")
    if X.shape[1] == 0: raise ValueError("X.shape[1] == 0")

    if X_treat is not None:
        # weight the treated units against the control units

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix): raise TypeError("X_treat is not a matrix")
        if X_treat.shape[1] == 0: raise ValueError("X_treat.shape[1] == 0")

        return ct_weights(X = np.vstack((X, X_treat)),
                          control_units = np.arange(X.shape[0]),
                          treated_units = np.arange(X_treat.shape[0]) + X.shape[0],
                          **kwargs)

    else:
        # weight each control unit against the remaining controls
        return loo_weights(X = X,
                           control_units = np.arange(X.shape[0]),
                           treated_units = np.arange(X.shape[0]),
                           **kwargs)
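
# Usage sketch (hypothetical inputs): compute synthetic-control weights for
# treated units given a fitted V matrix and ridge penalty, both forwarded via
# **kwargs to ct_weights / loo_weights.
#
#   import numpy as np
#   X = np.matrix(np.random.random((20, 5)))      # controls' covariates
#   X_treat = np.matrix(np.random.random((3, 5))) # treated units' covariates
#   W = weights(X, X_treat, V = np.matrix(np.eye(5)), L2_PEN_W = 0.1)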