Mirror of https://github.com/microsoft/SparseSC.git

fold_v_matrix now works.

Parent: 5fb3503f4e
Commit: 93588155bf

README.md | 16
@@ -13,3 +13,19 @@ Scipy 1.0.1 )
 For now, refer to the example code in `./example-code.py` for usage
 details.
+
+### Performance Notes
+
+For larger sample sizes (number of controls), almost all of the running
+time is spent calculating `np.linalg.solve(A,B)`, where `A` is a `C x C`
+matrix, `B` is a `C x T` matrix, `C` is the number of control
+units, and `T` is the number of treated units. Because the LAPACK library
+is already parallelized, passing `parallel = True` to the `CV_score`
+function (which runs each fold in a separate sub-process) will actually
+tend to increase the running time.
+
+This is especially true for leave-one-out gradient descent using only
+controls, as `np.linalg.solve(A,B)` will be called twice for each control
+unit. With larger sample sizes, it is more efficient to pass a value to
+`grad_splits` in order to use k-fold gradient descent, in which case
+`np.linalg.solve(A,B)` will be called twice for each split.
 
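The notes above correspond to a call pattern like the following sketch. It is adapted from `example-code.py` later in this commit; the `SparseSC` import name, the toy data, and the penalty values are placeholders rather than values from the repository:

```python
import numpy as np
import SparseSC as SC  # assumed import name; the example code aliases the package as SC

# Toy data: C = 200 control units, K = 10 predictors, T = 8 outcome periods.
X = np.matrix(np.random.normal(size=(200, 10)))
Y = np.matrix(np.random.normal(size=(200, 8)))

scores = SC.CV_score(
    X = X,
    Y = Y,
    LAMBDA = 0.1,      # placeholder L1 penalty (a scalar, or an array for a grid of scores)
    L2_PEN_W = 0.01,   # placeholder L2 penalty
    grad_splits = 5,   # k-fold gradient descent: solve() runs twice per split,
                       # instead of twice per control unit as in leave-one-out
    parallel = False,  # np.linalg.solve() is already parallelized by LAPACK
)
```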
@@ -1,8 +1,10 @@
+from fit_fold import fold_v_matrix#, fold_score
 from fit_loo import loo_v_matrix, loo_score
 from fit_ct import ct_v_matrix, ct_score
 from optimizers.cd_line_search import cdl_search
 import numpy as np
+import itertools
 from concurrent import futures

 def score_train_test(X,
@@ -11,7 +13,8 @@ def score_train_test(X,
                      test,
                      X_treat=None,
                      Y_treat=None,
-                     FoldNumber=None, # for consistency with score_train_test_sorted_lambdas()
+                     FoldNumber=None, # For consistency with score_train_test_sorted_lambdas()
+                     grad_splits=None, # If present, use k-fold gradient descent. See fold_v_matrix for details
                      **kwargs):
     """ presents a unified api for ct_v_matrix and loo_v_matrix
     and returns the v_mat, l2_pen_w (possibly calculated, possibly a parameter), and the score
@@ -59,14 +62,27 @@ def score_train_test(X,
     else:
         ">> K-fold validation on only the control units; assuming that Y contains post-intervention outcomes"

-        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
-        # note that the weights, score, and loss function value returned here are for the in-sample predictions
-        weights, v_mat, ts_score, loss, l2_pen_w, _ = \
-            loo_v_matrix(X = X[train, :],
-                         Y = Y[train, :],
-                         # treated_units = [X.shape[0] + i for i in range(len(train))],
-                         method = cdl_search,
-                         **kwargs)
+        if grad_splits is not None:
+
+            # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
+            # note that the weights, score, and loss function value returned here are for the in-sample predictions
+            weights, v_mat, ts_score, loss, l2_pen_w, _ = \
+                fold_v_matrix(X = X[train, :],
+                              Y = Y[train, :],
+                              # treated_units = [X.shape[0] + i for i in range(len(train))],
+                              method = cdl_search,
+                              **kwargs)
+
+        else:
+
+            # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE L2_PEN_W
+            # note that the weights, score, and loss function value returned here are for the in-sample predictions
+            weights, v_mat, ts_score, loss, l2_pen_w, _ = \
+                loo_v_matrix(X = X[train, :],
+                             Y = Y[train, :],
+                             # treated_units = [X.shape[0] + i for i in range(len(train))],
+                             method = cdl_search,
+                             **kwargs)

         # GET THE OUT-OF-SAMPLE PREDICTION ERROR
         s = loo_score(X = X, Y = Y,
@@ -149,17 +165,20 @@ def CV_score(X,Y,LAMBDA,X_treat=None,Y_treat=None,splits=5,sub_splits=None,quiet
         if X_treat.shape[0] != Y_treat.shape[0]:
             raise ValueError("X_treat and Y_treat have different number of rows (%s and %s)" % (X.shape[0], Y.shape[0],))

-        # MESSAGING
-        if not quiet:
-            print("K-fold validation with %s control and %s treated units, %s predictors and %s outcomes, holding out one fold among Treated units; Assumes that `Y` and `Y_treat` are pre-intervention outcomes" %
-                  (X.shape[0], X_treat.shape[0], X.shape[1], Y.shape[1],))
-
-        if isinstance(splits, (int)) or np.issubdtype(splits, np.integer):
+        try:
+            iter(splits)
+        except TypeError:
             from sklearn.model_selection import KFold
             splits = KFold(splits).split(np.arange(X_treat.shape[0]))
         train_test_splits = [x for x in splits]
         n_splits = len(train_test_splits)

+        # MESSAGING
+        if not quiet:
+            print("%s-fold validation with %s control and %s treated units, %s predictors and %s outcomes, holding out one fold among Treated units; Assumes that `Y` and `Y_treat` are pre-intervention outcomes" %
+                  (n_splits, X.shape[0], X_treat.shape[0], X.shape[1], Y.shape[1],))
+
         if parallel:
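The try/iter/except pattern above means `splits` may be either an integer (converted via `KFold`) or any iterable of (train, test) index pairs. For reference, this is standard scikit-learn behavior, not repo-specific:

```python
import numpy as np
from sklearn.model_selection import KFold

# KFold(k).split(...) yields k (train_indices, test_indices) pairs.
for train, test in KFold(3).split(np.arange(6)):
    print(train, test)
# [2 3 4 5] [0 1]
# [0 1 4 5] [2 3]
# [0 1 2 3] [4 5]
```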
@@ -209,17 +228,20 @@ def CV_score(X,Y,LAMBDA,X_treat=None,Y_treat=None,splits=5,sub_splits=None,quiet

     else: # X_treat *is* None
-        # MESSAGING
-        if not quiet:
-            print("Leave-one-out Cross Validation with %s control units, %s predictors and %s outcomes; Y may contain post-intervention outcomes" %
-                  (X.shape[0], X.shape[1], Y.shape[1],))
-
-        if isinstance(splits, (int)) or np.issubdtype(splits, np.integer):
+        try:
+            iter(splits)
+        except TypeError:
             from sklearn.model_selection import KFold
             splits = KFold(splits).split(np.arange(X.shape[0]))
         train_test_splits = [x for x in splits]
         n_splits = len(train_test_splits)

+        # MESSAGING
+        if not quiet:
+            print("%s-fold Cross Validation with %s control units, %s predictors and %s outcomes; Y may contain post-intervention outcomes" %
+                  (n_splits, X.shape[0], X.shape[1], Y.shape[1],))
+
         if parallel:

             if max_workers is None:
@@ -20,8 +20,12 @@ import random
 if __name__ == "__main__":

     # ------------------------------------------------------------
-    # CONTROL PARAMETERS
-    # ------------------------------------------------------------
+    # SECTION 1: GENERATE SOME TOY DATA
+    # ------------------------------------------------------------
+    # ------------------------------------------------------------
+
+    # CONTROL PARAMETERS
+
     random.seed(12345)
     np.random.seed(10101)

@@ -36,9 +40,6 @@ if __name__ == "__main__":
     beta_scale,confounders_scale = 4,1

-    # ------------------------------------------------------------
-    # SIMULATE SOME DATA
-    # ------------------------------------------------------------
-

     # COVARIATE EFFECTS
     X_control = np.matrix(np.random.normal(0,1,((C)*groups, K+S+R)))
@@ -120,9 +121,11 @@ if __name__ == "__main__":
     # control_units = np.arange( C * groups )
     # treated_units = np.arange( N * groups ) + C

     # ------------------------------------------------------------
+    # ------------------------------------------------------------
     # find default penalties
     # ------------------------------------------------------------
+    # ------------------------------------------------------------

     # get starting point for the L2 penalty
     L2_pen_start_ct = SC.L2_pen_guestimate(X_control)
|
@ -130,8 +133,11 @@ if __name__ == "__main__":
|
|||
|
||||
# get the maximum value for the L1 Penalty parameter conditional on the guestimate for the L2 penalty
|
||||
L1_max_ct = SC.get_max_lambda(X_control,Y_pre_control,X_treat=X_treated,Y_treat=Y_pre_treated)
|
||||
L1_max_loo = SC.get_max_lambda(X_and_Y_pre,Y_post)
|
||||
print("Max L1")
|
||||
if False:
|
||||
L1_max_loo = SC.get_max_lambda(X_and_Y_pre[np.arange(100)],Y_post[np.arange(100)])
|
||||
print("Max L1 loo %s " % L1_max_loo)
|
||||
else:
|
||||
L1_max_loo = np.float(147975295.9121998)
|
||||
|
||||
if False:
|
||||
"Demonstrate relations between the L1 and L2 penalties"
|
||||
|
@@ -182,6 +188,7 @@ if __name__ == "__main__":
     # ------------------------------------------------------------

+    if False:

         print("starting grid scoring for treat / control scenario", grid*L1_max_ct)
         grid_scores_ct = SC.CV_score(
             X = X_control,
@@ -208,8 +215,36 @@ if __name__ == "__main__":
     best_L1_penalty_ct = (grid * L1_max_ct)[np.argmin(grid_scores_ct)]

+    if False:
+
+        print("Starting grid scoring for Controls Only scenario with 5-fold gradient descent", grid*L1_max_ct)
+        grid_scores_loo = SC.CV_score(
+            X = X_and_Y_pre, # limit the amount of time...
+            Y = Y_post ,     # limit the amount of time...
+
+            # this is what enables the k-fold gradient descent
+            grad_splits = 5,
+
+            # L1 Penalty. If LAMBDA is a single value (float), we get a single score; if it's an array of values, we get an array of scores.
+            LAMBDA = grid * L1_max_loo,
+
+            # L2 Penalty (float)
+            L2_PEN_W = L2_pen_start_loo,
+
+            # CACHE THE V MATRIX BETWEEN LAMBDA PARAMETERS (generally faster, but path dependent)
+            #cache = True, # False by default
+
+            # Run each of the Cross-validation folds in parallel? Often slower
+            # for large sample sizes because numpy.linalg.solve() already runs
+            # in parallel for large matrices
+            parallel = False,
+
+            # ANNOUNCE COMPLETION OF EACH ITERATION
+            progress = True)
+
     if False:

         # even with smaller data, this takes a while.
-        print("starting grid scoring for Controls Only scenario", grid*L1_max_ct)
+        print("Starting grid scoring for Controls Only scenario with leave-one-out gradient descent", grid*L1_max_ct)
         grid_scores_loo = SC.CV_score(
             X = X_and_Y_pre [np.arange(100)], # limit the amount of time...
             Y = Y_post [np.arange(100)], # limit the amount of time...
@@ -295,7 +330,6 @@ if __name__ == "__main__":
     import pdb; pdb.set_trace()


-
     # ---------------------------------------------------------------------------
     # ---------------------------------------------------------------------------
     # Optimization of the L1 and L2 parameters together (second order)
@@ -306,9 +340,12 @@ if __name__ == "__main__":
     import time

     # -----------------------------------------------------------------
-    # Optimization of the L2 and L1 Penalties Simultaneously, keeping their product constant
+    # Optimization of the L2 and L1 Penalties Simultaneously, keeping their
+    # product constant. Heuristically, this has been the most efficient means of
+    # optimizing the L2 Parameter.
     # -----------------------------------------------------------------


     # build the objective function to be minimized

     # cache for L2_obj_func
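The rewritten comment describes a one-dimensional search: hold the product of the two penalties fixed and vary only their ratio. Below is a minimal sketch of that idea with a placeholder objective; it is not the repository's implementation, and the real `L2_obj_func` would call `SC.CV_score` with the derived `LAMBDA` and `L2_PEN_W`:

```python
import numpy as np
from scipy.optimize import minimize_scalar

def penalties(log_ratio, product):
    # For any log_ratio, LAMBDA * L2_PEN_W == product by construction.
    r = np.exp(log_ratio)
    return np.sqrt(product) * r, np.sqrt(product) / r

def L2_obj_func(log_ratio, product=1.0):
    LAMBDA, L2_PEN_W = penalties(log_ratio, product)
    # Placeholder objective: the real version would return something like
    # SC.CV_score(X=..., Y=..., LAMBDA=LAMBDA, L2_PEN_W=L2_PEN_W, ...)
    return (np.log(LAMBDA) - 1.0) ** 2 + 0.1 * np.log(L2_PEN_W) ** 2

best = minimize_scalar(L2_obj_func, bounds=(-5, 5), method="bounded")
LAMBDA, L2_PEN_W = penalties(best.x, product=1.0)
print(LAMBDA, L2_PEN_W, LAMBDA * L2_PEN_W)  # the product stays fixed at 1.0
```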
fit_fold.py | 206
@@ -20,8 +20,7 @@ def fold_v_matrix(X,
                  method = cdl_search,
                  intercept = True,
                  max_lambda = False, # this is terrible at least without documentation...
-                 solve_method = "standard",
-                 splits = 5,
+                 grad_splits = 5,
                  **kwargs):
     '''
     Computes and sets the optimal v_matrix for the given moments and
@@ -33,11 +32,18 @@ def fold_v_matrix(X,
     :param treated_units: a list containing the position (rows) of the treated units within X and Y
     :param control_units: a list containing the position (rows) of the control units within X and Y
     :param start: initial values for the diagonals of the tensor matrix
-    :param L2_PEN_W: L2 penalty on the magnitude of the deviance of the weight vector from null. Optional.
-    :param method: The name of a method to be used by scipy.optimize.minimize, or a callable with the same API as scipy.optimize.minimize
-    :param intercept: If True, weights are penalized toward 1 / the number of controls, else weights are penalized toward zero
-    :param max_lambda: if True, the return value is the maximum L1 penalty for which at least one element of the tensor matrix is non-zero
-    :param solve_method: Method for solving A.I.dot(B). Either "standard" or "step-down".
+    :param L2_PEN_W: L2 penalty on the magnitude of the deviance of the weight
+                     vector from null. Optional.
+    :param method: The name of a method to be used by scipy.optimize.minimize,
+                   or a callable with the same API as scipy.optimize.minimize
+    :param intercept: If True, weights are penalized toward 1 / the number
+                      of controls, else weights are penalized toward zero
+    :param max_lambda: if True, the return value is the maximum L1 penalty for
+                       which at least one element of the tensor matrix is
+                       non-zero
+    :param grad_splits: Splits for fitted vs. control units in each gradient
+                        descent step. An integer, or a list/generator of train
+                        and test units in each fold of the gradient descent.
     :param **kwargs: additional arguments passed to the optimizer

     '''
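Piecing together the signature and the docstring above, a direct call might look like the sketch below. The toy matrices are placeholders, `LAMBDA` as the L1-penalty argument name is an assumption carried over from the rest of the codebase, and additional required arguments (e.g. `start`) may apply:

```python
import numpy as np
from fit_fold import fold_v_matrix  # imported at the top of this commit

# Toy problem: 40 units (all controls), 5 covariates, 3 outcome periods.
X = np.matrix(np.random.normal(size=(40, 5)))
Y = np.matrix(np.random.normal(size=(40, 3)))

# Returns in-sample weights, the fitted diagonal V matrix, score, loss,
# the (possibly calculated) L2 penalty, and the optimizer result.
weights, v_mat, ts_score, ts_loss, l2_pen_w, opt = fold_v_matrix(
    X, Y,
    LAMBDA = 0.1,     # assumed parameter name (L1 penalty)
    L2_PEN_W = 0.01,
    grad_splits = 5,  # or an explicit list of (train, test) index pairs
)
```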
@@ -45,7 +51,7 @@ def fold_v_matrix(X,
     if treated_units is None:
         if control_units is None:
             # Neither provided; INCLUDE ALL SAMPLES AS BOTH TREAT AND CONTROL UNIT.
-            # (this is the typical controls-only Loo V-matrix estimation)
+            # (this is the typical controls-only fold V-matrix estimation)
             control_units = list(range(X.shape[0]))
             treated_units = control_units
     else:
@@ -69,11 +75,14 @@ def fold_v_matrix(X,
     if not isinstance(L2_PEN_W, (float, int)): raise TypeError( "L2_PEN_W is not a number")
     assert not non_neg_weights, "Bounds not implemented"

+    splits = grad_splits # for readability...
     if isinstance(splits, (int)) or np.issubdtype(splits, np.integer):
         from sklearn.model_selection import KFold
-        splits = KFold(splits).split(np.arange(X_treat.shape[0]))
+        splits = KFold(splits).split(np.arange(len(control_units)))
+    splits = list(splits)
+    n_splits = len(grad_splits)

+    for i in range(len(splits)):
+        assert len(splits[i][0]) + len(splits[i][1]) == len(treated_units), ("Splits for fold %s do not match the number of treated units" % i)

     # CONSTANTS
     C, N, K = len(control_units), len(treated_units), X.shape[1]
@@ -83,79 +92,97 @@ def fold_v_matrix(X,
     assert K > 0; "variables to fit (X.shape[1] == 0)"

     # CREATE THE INDEX THAT INDICATES THE ELIGIBLE CONTROLS FOR EACH TREATED UNIT
-    in_controls = [list(set(control_units) - set([trt_unit])) for trt_unit in treated_units]
+    in_controls = [list(set(control_units) - set(treated_units[test])) for train,test in splits]
     in_controls2 = [np.ix_(i,i) for i in in_controls] # this is a much faster alternative to A[:,index][index,:]
     ctrl_rng = np.arange(len(control_units))
-    out_controls = [ctrl_rng[control_units != trt_unit] for trt_unit in treated_units]
-    out_treated = [ctrl_rng[control_units == trt_unit] for trt_unit in treated_units] # this is non-trivial when the control units are also being predicted.
+    out_controls = [ctrl_rng[np.logical_not(np.isin(control_units, treated_units[test]))] for train,test in splits]
+    out_treated = [ctrl_rng[np.isin(control_units, treated_units[test])] for train,test in splits] # this is non-trivial when the control units are also being predicted.

     if intercept:
-        for i, trt_unit in enumerate(treated_units):
-            Y[trt_unit,:] -= Y[in_controls[i],:].mean(axis=0)
+        for in_ctrl, (train, test) in zip(in_controls,splits):
+            Y[treated_units[test],:] -= Y[in_ctrl,:].mean(axis=0)

     # handy constants (for speed purposes):
     Y_treated = Y[treated_units,:]
     Y_control = Y[control_units,:]

     # INITIALIZE PARTIAL DERIVATIVES
     dA_dV_ki = [ [None,] *N for i in range(K)]
     dB_dV_ki = [ [None,] *N for i in range(K)]
+    b_i = [None,] *N
+    for i, k in itertools.product(range(len(splits)), range(K)): # TREATED unit i, moment k
+        train, test = splits[i]
+        Xc = X[in_controls[i], : ]
+        Xt = X[treated_units[test], : ]
         dA_dV_ki [k][i] = Xc[:, k ].dot(Xc[:, k ].T) + Xc[:, k ].dot(Xc[:, k ].T) # 8
         dB_dV_ki [k][i] = Xc[:, k ].dot(Xt[:, k ].T) + Xc[:, k ].dot(Xt[:, k ].T) # 9

     del Xc, Xt, i, k

-def __old_fold_v_matrix(Xt,Xc,Yt,Yc,LAMBDA = .1,L2_PEN_W = .1,guess = None,method = 'Newton-CG',**kwargs):
-    """ Calculates the V matrix with an explicit Treatment and Control folds,
-        where the match is optimized on the error of the outcomes.
+    def _score(V):
+        dv = diag(V)
+        weights, _, _ = _weights(dv)
+        Ey = (Y_treated - weights.T.dot(Y_control)).getA()
+        return ((Ey **2).sum() + LAMBDA * absolute(V).sum()).copy() # (...).copy() assures that x.flags.writeable is True

-        don't use this. it hasn't been thoroughly checked for bugs (compared
-        to loo_v_matrix and ct_v_matrix) and lacks several of the optimizations of loo_v_matrix
-    """
-    C,K = Xc.shape; N,T = Yt.shape
-    two_L2_PEN_W = 2*L2_PEN_W*diag(ones(C))
-    def _score(diagV):
-        """ calculates just the diagonal of dGamma0_dV
+    def _grad(V):
+        """ Calculates just the diagonal of dGamma0_dV

            There is an implementation that allows for all elements of V to be varied...
        """
-        V = diag(diagV)
-        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
-        B = Xc.dot(2*V).dot(Xt.T) # 6
-        Pi = A.I.dot(B)
-        Ey = (Yt - Pi.T.dot(Yc)).getA()
-        return (Ey **2).sum() + LAMBDA * absolute(Pi.T).sum()
-    def _grad(diagV):
-        """ calculates just the diagonal of dGamma0_dV
-        """
-        V = diag(diagV)
-        assert Xc.shape[0] == Yc.shape[0]
-        # CALCULATE CONSTANTS
-        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
-        B = Xc.dot(2*V).dot(Xt.T) # 6
-        Ai = A.I
-        Pi = Ai.dot(B)
-        Ey = (Yt - Pi.T.dot(Yc)).getA()
-        # INITIALIZE PARTIAL DERIVATIVES
-        dGamma0_dV_term2 = zeros((K,K))
-        for i in range(K):
-            dA = Xc[:, i ].dot(Xc[:, i ].T) + Xc[:, i ].dot(Xc[:, i ].T) # 8
-            dB = Xc[:, i ].dot(Xt [:, i ].T) + Xc[:, i ].dot(Xt [:, i ].T) # 9
-            dPI_dV = Ai.dot(dB - dA.dot(Pi)) # 7
-            dGamma0_dV_term2[i,i] = (Ey * Yc.T.dot(dPI_dV).T.getA()).sum()
-        return diag(LAMBDA * (1 - 2*(V<0)) - 2 * dGamma0_dV_term2)
+        dv = diag(V)
+        weights, A, B = _weights(dv)
+        Ey = (Y_treated - weights.T.dot(Y_control)).getA()
+        dGamma0_dV_term2 = zeros(K)
+        dPI_dV = zeros((C, N))
+        for k in range(K):
+            dPI_dV.fill(0) # faster than re-allocating the memory each loop.
+            for i, (index, (train, test)) in enumerate(zip(in_controls,splits)):
+                dA = dA_dV_ki[k][i]
+                dB = dB_dV_ki[k][i]
+                b = linalg.solve(A[in_controls2[i]],dB - dA.dot(b_i[i]))
+                dPI_dV[np.ix_(in_controls[i], treated_units[test])] = b
+            dGamma0_dV_term2[k] = (Ey * Y_control.T.dot(dPI_dV).T.getA()).sum()
+        return LAMBDA - 2 * dGamma0_dV_term2 # (...).copy() assures that x.flags.writeable is True // * (1 - 2*(V < 0)) was dropped b/c we're restricting to positive V's

+    L2_PEN_W_mat = 2 * L2_PEN_W * diag(ones(C))
     def _weights(V):
-        A = Xc.dot(2*V).dot(Xc.T) + two_L2_PEN_W # 5
-        B = Xc.dot(2*V).dot(Xt.T) # 6
-        #for col in treated_unit:
-        #w_col = A[-col,-col].I.dot(B[:,col])
-        return A.I.dot(B).T
+        weights = zeros((C, N))
+        A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
+        B = X.dot(V + V.T).dot(X.T).T # 6
+        for i, (train,test) in enumerate(splits):
+            (b) = b_i[i] = linalg.solve(A[in_controls2[i]], B[np.ix_(in_controls[i], treated_units[test])])
+            weights[np.ix_(out_controls[i], test)] = b
+        return weights, A, B

     if max_lambda:
         grad0 = _grad(zeros(K))
         return -grad0[grad0 < 0].min()

     # DO THE OPTIMIZATION
-    opt = minimize(_score, zeros(Xc.shape[1]) if guess is None else guess,
-                   jac = _grad,
-                   method = method, **kwargs)
+    if isinstance(method, str):
+        from scipy.optimize import minimize
+        opt = minimize(_score, start.copy(), jac = _grad, method = method, **kwargs)
+    else:
+        assert callable(method), "Method must be a valid method name for scipy.optimize.minimize or a minimizer"
+        opt = method(_score, start.copy(), jac = _grad, **kwargs)
     v_mat = diag(opt.x)
-    ts_loss = opt.fun
     # CALCULATE weights AND ts_score
-    weights = _weights(v_mat)
-    errors = Yt - weights.dot(Yc)
+    weights, _, _ = _weights(v_mat)
+    errors = Y_treated - weights.T.dot(Y_control)
+    ts_loss = opt.fun
     ts_score = linalg.norm(errors) / sqrt(prod(errors.shape))
-    return weights, v_mat, ts_score, ts_loss, opt

+    if intercept:
+        for i in range(len(splits)):
+            weights[out_controls[i], i] += 1/len(out_controls[i])

+    return weights, v_mat, ts_score, ts_loss, L2_PEN_W, opt


-def loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True, solve_method = "standard"):
+# NOTE: this is here for completeness, but you will generally want to use loo_weights instead. a single set of weights is a lot less costly than a full gradient descent...
+def fold_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True, splits = 5):
     if treated_units is None:
         if control_units is None:
             # both not provided, include all samples as both treat and control unit.
@@ -172,48 +199,41 @@ def loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, inte
     treated_units = np.array(treated_units)
     [C, N, K] = [len(control_units), len(treated_units), X.shape[1]]

+    if isinstance(splits, (int)) or np.issubdtype(splits, np.integer):
+        from sklearn.model_selection import KFold
+        splits = KFold(splits).split(np.arange(len(control_units)))
+    splits = list(splits)

     # index with positions of the controls relative to the incoming data
-    in_controls = [list(set(control_units) - set([trt_unit])) for trt_unit in treated_units]
+    in_controls = [list(set(control_units) - set(treated_units[test])) for train,test in splits]
     in_controls2 = [np.ix_(i,i) for i in in_controls] # this is a much faster alternative to A[:,index][index,:]

     # index of the controls relative to the rows of the outgoing C x N matrix of weights
     ctrl_rng = np.arange(len(control_units))
-    out_controls = [ctrl_rng[control_units != trt_unit] for trt_unit in treated_units]
-    out_treated = [ctrl_rng[control_units == trt_unit] for trt_unit in treated_units] # this is non-trivial when the control units are also being predicted.
+    out_controls = [ctrl_rng[np.logical_not(np.isin(control_units, treated_units[test]))] for train,test in splits]
+    out_treated = [ctrl_rng[np.isin(control_units, treated_units[test])] for train,test in splits] # this is non-trivial when the control units are also being predicted.

     # constants for indexing
-    X_control = X[control_units,:]
-    X_treat = X[treated_units,:]
     weights = zeros((C, N))

-    if solve_method == "step-down":
-        A = X_control.dot(V + V.T).dot(X_control.T) + 2 * L2_PEN_W * diag(ones(X_control.shape[0])) # 5
-        B = X_treat.dot( V + V.T).dot(X_control.T) # 6
-        Ai = A.I
-        for i, trt_unit in enumerate(treated_units):
-            if trt_unit in control_units:
-                (b) = subinv_k(Ai,_k).dot(B[out_controls[i],i])
-            else:
-                (b) = Ai.dot(B[:, i])
-            weights[out_controls[i], i] = b.flatten()
-            if intercept:
-                weights[out_controls[i], i] += 1/len(out_controls[i])
-    elif solve_method == "standard":
-        A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
-        B = X.dot(V + V.T).dot(X.T).T # 6
-        for i, trt_unit in enumerate(treated_units):
-            (b) = linalg.solve(A[in_controls2[i]], B[in_controls[i], trt_unit])
-            weights[out_controls[i], i] = b.flatten()
-            if intercept:
-                weights[out_controls[i], i] += 1/len(out_controls[i])
-    else:
-        raise ValueError("Unknown Solve Method: " + solve_method)
+    A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
+    B = X.dot(V + V.T).dot(X.T).T # 6
+
+    for i, (train,test) in enumerate(splits):
+        (b) = linalg.solve(A[in_controls2[i]], B[np.ix_(in_controls[i], treated_units[test])])
+        indx2 = np.ix_(out_controls[i], train)
+        weights[indx2] = b.T
+        if intercept:
+            weights[indx2] += 1/len(out_controls[i])
     return weights.T

-def loo_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units = None,**kwargs):
+# NOTE: this is here for completeness, but you will almost always want to use loo_score instead.
+def fold_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units = None,**kwargs):
     if treated_units is None:
         if control_units is None:
             # both not provided, include all samples as both treat and control unit.
@@ -226,7 +246,7 @@ def loo_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units
         if control_units is None:
             # Set the control units to the not-treated units
             control_units = list(set(range(X.shape[0])) - set(treated_units))
-    weights = loo_weights(X = X,
+    weights = fold_weights(X = X,
                           V = V,
                           L2_PEN_W = L2_PEN_W,
                           treated_units = treated_units,
fit_loo.py | 36
@@ -35,7 +35,7 @@ def loo_v_matrix(X,
     :param method: The name of a method to be used by scipy.optimize.minimize, or a callable with the same API as scipy.optimize.minimize
     :param intercept: If True, weights are penalized toward 1 / the number of controls, else weights are penalized toward zero
     :param max_lambda: if True, the return value is the maximum L1 penalty for which at least one element of the tensor matrix is non-zero
-    :param solve_method: Method for solving A.I.dot(B). Either "standard" or "step-down".
+    :param solve_method: Method for solving A.I.dot(B). Either "standard" or "step-down". https://math.stackexchange.com/a/208021/252693
     :param **kwargs: additional arguments passed to the optimizer

     '''
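The linked answer gives the identity a "step-down" solver can exploit: the inverse of a principal submatrix can be recovered from the full inverse without a second factorization. A self-contained numpy check of that identity (illustration only, not code from this repo):

```python
import numpy as np

rng = np.random.RandomState(0)
A = rng.normal(size=(6, 6))
A = A.dot(A.T) + 6 * np.eye(6)  # symmetric positive definite, safely invertible
Ai = np.linalg.inv(A)

k = 2                  # row/column to delete
m = np.arange(6) != k  # mask for the remaining indices

# Inverse of the submatrix, computed from scratch...
direct = np.linalg.inv(A[np.ix_(m, m)])
# ...and "stepped down" from the full inverse:
stepped = Ai[np.ix_(m, m)] - np.outer(Ai[m, k], Ai[k, m]) / Ai[k, k]

assert np.allclose(direct, stepped)
```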
|
@ -246,40 +246,6 @@ def loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, inte
|
|||
return weights.T
|
||||
|
||||
|
||||
|
||||
def __old__loo_weights(X, V, L2_PEN_W, treated_units = None, control_units = None, intercept = True):
|
||||
assert False
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
# both not provided, include all samples as both treat and control unit.
|
||||
control_units = list(range(X.shape[0]))
|
||||
treated_units = control_units
|
||||
else:
|
||||
# Set the treated units to the not-control units
|
||||
treated_units = list(set(range(X.shape[0])) - set(control_units))
|
||||
else:
|
||||
if control_units is None:
|
||||
# Set the control units to the not-treated units
|
||||
control_units = list(set(range(X.shape[0])) - set(treated_units))
|
||||
[C, N, K] = [len(control_units), len(treated_units), X.shape[1]]
|
||||
index_i = [list(set(list(range(200))) - set([col])) for col in treated_units]
|
||||
|
||||
index_i2 = [np.ix_(i,i) for i in index_i] # this is a much faster alternative to A[:,index][index,:]
|
||||
index_map = {c_unit : row_at for row_at, c_unit in enumerate(control_units)}
|
||||
|
||||
weights = zeros((C, N))
|
||||
A = X.dot(V + V.T).dot(X.T) + 2 * L2_PEN_W * diag(ones(X.shape[0])) # 5
|
||||
B = X.dot(V + V.T).dot(X.T).T # 6
|
||||
for col, index in enumerate(index_i):
|
||||
b = linalg.solve(A[index_i2[col]], B[index, col])
|
||||
weights[np.vectorize(index_map.get)(index), col] = b.T
|
||||
if intercept:
|
||||
weights[index, col] += 1/len(index)
|
||||
return weights
|
||||
|
||||
|
||||
|
||||
|
||||
def loo_score(Y, X, V, L2_PEN_W, LAMBDA = 0, treated_units = None, control_units = None,**kwargs):
|
||||
if treated_units is None:
|
||||
if control_units is None:
|
||||
|
|