"""
USAGE:
2018-10-05 20:22:32 +03:00
cd path/to/SparseSC
2018-05-19 00:08:01 +03:00
python example-code.py
"""
import os
import sys
# sys.path.append(os.path.join(os.getcwd(), "SparseSC"))
import time

import SparseSC as SC
from SparseSC.tests import ge_dgp

import numpy as np
import random

# def setup(N0,N1,T,K,g,gs,bs): # controls (N0), treated units (N1), time periods (T), predictors (K), groups (g), g-scale, b-scale
if __name__ == "__main__":

    # ------------------------------------------------------------
    # ------------------------------------------------------------
    # SECTION 1: GENERATE SOME TOY DATA
    # ------------------------------------------------------------
    # ------------------------------------------------------------

    # CONTROL PARAMETERS
    random.seed(12345)
    np.random.seed(10101)

    # controls (per group), treated units (per group), pre-intervention time
    # points, post-intervention time points
    N0, N1, T0, T1 = 5, 5, 4, 4
    # causal covariates, confounders, random covariates
    K, S, R = 7, 4, 5
    # number of groups and scale of the group effect (groups make the time
    # series correlated for reasons other than the observed covariates)
    groups, group_scale = 30, 2
    # scale of the causal covariates' effects and of the confounders' effects
    beta_scale, confounders_scale = 4, 1

    X_control, X_treated, Y_pre_control, Y_pre_treated, \
        Y_post_control, Y_post_treated = ge_dgp(N0, N1, T0, T1, K, S, R,
                                                groups, group_scale,
                                                beta_scale, confounders_scale,
                                                model="full")

    # JOIN THE TREATED AND CONTROL DATA
    X = np.vstack((X_control, X_treated))
    Y_pre = np.vstack((Y_pre_control, Y_pre_treated))
    Y_post = np.vstack((Y_post_control, Y_post_treated))

    # in the leave-one-out scenario, the pre-treatment outcomes are part of the covariates
    X_and_Y_pre = np.hstack((X, Y_pre))
    X_and_Y_pre_control = np.hstack((X_control, Y_pre_control))

    # IDENTIFIERS FOR TREATED AND CONTROL UNITS
    # control_units = np.arange(N0 * groups)
    # treated_units = np.arange(N1 * groups) + N0
    # ------------------------------------------------------------
    # ------------------------------------------------------------
    # find default penalties
    # ------------------------------------------------------------
    # ------------------------------------------------------------

    # get a starting point for the L2 penalty
    w_pen_start_ct = SC.w_pen_guestimate(X_control)
    w_pen_start_loo = SC.w_pen_guestimate(X_and_Y_pre_control)

    # get the maximum value of the L1 penalty parameter, conditional on the
    # guesstimate for the L2 penalty
    L1_max_ct = SC.get_max_v_pen(X_control, Y_pre_control, X_treat=X_treated, Y_treat=Y_pre_treated)

    if False:
        L1_max_loo = SC.get_max_v_pen(X_and_Y_pre_control[np.arange(100)], Y_post[np.arange(100)])
        print("Max L1 loo %s " % L1_max_loo)
    else:
        # pre-computed value from the branch above, to save time
        L1_max_loo = float(147975295.9121998)
    if False:
        # demonstrate the relationship between the L1 and L2 penalties:
        # get the maximum value of the L1 penalty parameter conditional on
        # several L2 penalty parameter values
        L2_grid = 2. ** np.arange(-1, 2)
        L1_max_loo_grid = SC.get_max_v_pen(X_and_Y_pre,
                                           Y_post,
                                           w_pen=w_pen_start_loo * L2_grid)
        L1_max_ct_grid = SC.get_max_v_pen(X_control,
                                          Y_pre_control,
                                          X_treat=X_treated,
                                          Y_treat=Y_pre_treated,
                                          w_pen=w_pen_start_ct * L2_grid)
        # the maximum L1 penalty scales inversely with the L2 penalty
        assert (L1_max_loo / L1_max_loo_grid == 1 / L2_grid).all()
        assert (L1_max_ct / L1_max_ct_grid == 1 / L2_grid).all()
    # ------------------------------------------------------------
    # create a grid of penalties to try
    # ------------------------------------------------------------
    n_points = 10  # number of points in the grid
    grid_type = "log-linear"  # usually the optimal penalties are quite small
    if grid_type == "simple":
        # an equally spaced linear grid that does not include 0 or 1
        grid = (1 + np.arange(n_points)) / (1 + n_points)
    elif grid_type == "linear":
        # an equally spaced linear grid
        fmin = 1e-5  # lowest point in the grid, relative to the max lambda
        fmax = 1e-3  # highest point in the grid, relative to the max lambda
        grid = np.linspace(fmin, fmax, n_points)
    elif grid_type == "log-linear":
        # a grid equally spaced on the log scale
        fmin = 1e-4  # lowest point in the grid, relative to the max lambda
        fmax = 1e-2  # highest point in the grid, relative to the max lambda
        grid = np.exp(np.linspace(np.log(fmin), np.log(fmax), n_points))
    else:
        raise ValueError("Unknown grid type: %s" % grid_type)
    # ------------------------------------------------------------
    # get the Cross Validation Error over a grid of L1 penalty
    # parameters using both (a) Treat/Control and (b) the
    # leave-one-out controls only methods
    # ------------------------------------------------------------
    if False:
        print("starting grid scoring for treat / control scenario", grid * L1_max_ct)
        grid_scores_ct = SC.CV_score(
            X=X_control,
            Y=Y_pre_control,
            X_treat=X_treated,
            Y_treat=Y_pre_treated,

            # if v_pen is a single value, we get a single score; if it's an
            # array of values, we get an array of scores
            v_pen=grid * L1_max_ct,
            w_pen=w_pen_start_ct,

            # cache the V matrix between v_pen parameters (generally faster,
            # but path dependent)
            cache=False,  # False by default

            # Run each of the cross-validation folds in parallel? Often slower
            # for large sample sizes because numpy.linalg.solve() already runs
            # in parallel for large matrices.
            parallel=False,

            # announce completion of each iteration
            progress=True)

        best_L1_penalty_ct = (grid * L1_max_ct)[np.argmin(grid_scores_ct)]
    if False:
        print("Starting grid scoring for Controls Only scenario with 5-fold gradient descent", grid * L1_max_loo)
        grid_scores_loo = SC.CV_score(
            X=X_and_Y_pre_control,
            Y=Y_post_control,

            # this is what enables the k-fold gradient descent
            grad_splits=5,
            # random state for the splitting during k-fold gradient descent
            random_state=10101,

            # L1 penalty. If v_pen is a single value, we get a single score;
            # if it's an array of values, we get an array of scores.
            v_pen=grid * L1_max_loo,
            # L2 penalty (float)
            w_pen=w_pen_start_loo,

            # cache the V matrix between v_pen parameters (generally faster,
            # but path dependent)
            # cache=True,  # False by default

            # Run each of the cross-validation folds in parallel? Often slower
            # for large sample sizes because numpy.linalg.solve() already runs
            # in parallel for large matrices.
            parallel=False,

            # announce each call to numpy.linalg.solve(A, B) (the major
            # bottleneck); noisy, but gives a sense of the running time per
            # gradient calculation
            verbose=False,

            # announce completion of each iteration
            progress=True)
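        # A minimal sketch: select the best LOO penalty from the grid,
        # mirroring the treat/control selection above; this feeds the
        # leave-one-out fit further down.
        best_L1_penalty_loo = (grid * L1_max_loo)[np.argmin(grid_scores_loo)]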
    if False:
        # even with smaller data, this takes a while
        print("Starting grid scoring for Controls Only scenario with leave-one-out gradient descent", grid * L1_max_loo)
        grid_scores_loo = SC.CV_score(
            X=X_and_Y_pre_control[np.arange(100)],  # limit the amount of time...
            Y=Y_post_control[np.arange(100)],       # limit the amount of time...

            # with grad_splits=None (the default) we get leave-one-out
            # gradient descent
            grad_splits=None,

            # L1 penalty. If v_pen is a single value, we get a single score;
            # if it's an array of values, we get an array of scores.
            v_pen=grid * L1_max_loo,
            # L2 penalty (float)
            w_pen=w_pen_start_loo,

            # cache the V matrix between v_pen parameters (generally faster,
            # but path dependent)
            # cache=True,  # False by default

            # Run each of the cross-validation folds in parallel? Often slower
            # for large sample sizes because numpy.linalg.solve() already runs
            # in parallel for large matrices.
            parallel=False,

            # announce each call to numpy.linalg.solve(A, B) (the major
            # bottleneck); noisy, but gives a sense of the running time per
            # gradient calculation
            verbose=False,

            # announce completion of each iteration
            progress=True)
    # ---------------------------------------------------------------------------
    # Calculate Synthetic Control weights for a fixed pair of penalty parameters
    # ---------------------------------------------------------------------------
    # This is a two-step process because, in principle, we can estimate a
    # tensor matrix (which contains the relative weights of the covariates and
    # possibly the pre-treatment outcomes) in one population and apply it to
    # another population.
    best_L1_penalty_ct = float(1908.9329)

    # -----------------------------------
    # Treat/Control:
    # -----------------------------------
    V_ct = SC.tensor(X=X_control,
                     Y=Y_pre_control,
                     X_treat=X_treated,
                     Y_treat=Y_pre_treated,
                     v_pen=best_L1_penalty_ct,
                     w_pen=w_pen_start_ct)

    SC_weights_ct = SC.weights(X=X_control,
                               X_treat=X_treated,
                               V=V_ct,
                               w_pen=w_pen_start_ct)

    Y_post_treated_synthetic_controls_ct = SC_weights_ct.dot(Y_post_control)
    ct_prediction_error = Y_post_treated_synthetic_controls_ct - Y_post_treated
    null_model_error = Y_post_treated - np.mean(Y_pre_treated)
    R_squared_post_ct = 1 - np.power(ct_prediction_error, 2).sum() / np.power(null_model_error, 2).sum()

    print("C/T: out-of-sample post-intervention R squared: %0.2f%%" % (100 * R_squared_post_ct,))
    # -----------------------------------
    # Leave-One-Out with Controls Only:
    # -----------------------------------
    if False:
        # this takes a while. Note: best_L1_penalty_loo comes from the
        # leave-one-out grid search above.
        V_loo = SC.tensor(
            X=X_and_Y_pre[np.arange(100)],  # limit the amount of time...
            Y=Y_post[np.arange(100)],       # limit the amount of time...
            v_pen=best_L1_penalty_loo,
            w_pen=w_pen_start_loo)

        SC_weights_loo = SC.weights(X=X_control,
                                    V=V_loo,
                                    w_pen=w_pen_start_loo)

        # in progress...
        Y_post_treated_synthetic_controls_loo = SC_weights_loo.dot(Y_post_control)
        loo_prediction_error = Y_post_treated_synthetic_controls_loo - Y_post_treated
        null_model_error = Y_post_treated - np.mean(Y_pre_treated)
        R_squared_post_loo = 1 - np.power(loo_prediction_error, 2).sum() / np.power(null_model_error, 2).sum()

        print("LOO: out-of-sample post-intervention R squared: %0.2f%%" % (100 * R_squared_post_loo,))
    # ---------------------------------------------------------------------------
    # ---------------------------------------------------------------------------
    # Optimization of the L1 and L2 parameters together (second order)
    # ---------------------------------------------------------------------------
    # ---------------------------------------------------------------------------
    from scipy.optimize import differential_evolution

    # -----------------------------------------------------------------
    # Optimization of the L2 and L1 penalties simultaneously, keeping their
    # product constant. Heuristically, this has been the most efficient means
    # of optimizing the L2 parameter.
    # -----------------------------------------------------------------
    if False:
        # build the objective function to be minimized

        # cache for L1_L2_const_obj_func
        n_calls = [0]
        temp_results = []
        SS = np.power(Y_pre_treated - np.mean(Y_pre_treated), 2).sum()
        best_L1_penalty_ct = float(1908.9329)

        def L1_L2_const_obj_func(x):
            n_calls[0] += 1
            t1 = time.time()
            score = SC.CV_score(X=X_control,
                                Y=Y_pre_control,
                                X_treat=X_treated,
                                Y_treat=Y_pre_treated,

                                # if v_pen is a single value, we get a single score;
                                # if it's an array of values, we get an array of scores
                                v_pen=best_L1_penalty_ct * np.exp(x[0]),
                                w_pen=w_pen_start_ct / np.exp(x[0]),

                                # suppress the analysis type message
                                quiet=True)
            t2 = time.time()
            temp_results.append((n_calls[0], x, score))
            print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-squared: %s" %
                  (n_calls[0], t2 - t1, x[0], score, 1 - score / SS))
            return score

        # the actual optimization
        print("Starting L1-L2 Penalty (product constant) optimization")
        results = differential_evolution(L1_L2_const_obj_func, bounds=((-6, 6),))

        # with the product held constant, the L2 penalty moves inversely to the L1 penalty
        NEW_best_L1_penalty_ct = best_L1_penalty_ct * np.exp(results.x[0])
        best_L2_penalty = w_pen_start_ct / np.exp(results.x[0])
        print("DE optimized L1 penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,))
    # -----------------------------------------------------------------
    # Optimization of the L2 parameter alone
    # -----------------------------------------------------------------
    if False:
        # -----------------------
        # build the objective function to be minimized using DE
        # -----------------------

        # cache for L2_obj_func
        n_calls = [0]
        temp_results = []
        SS = np.power(Y_pre_treated - np.mean(Y_pre_treated), 2).sum()
        best_L1_penalty_ct = float(1908.9329)

        def L2_obj_func(x):
            n_calls[0] += 1
            t1 = time.time()
            score = SC.CV_score(X=X_control,
                                Y=Y_pre_control,
                                X_treat=X_treated,
                                Y_treat=Y_pre_treated,

                                # if v_pen is a single value, we get a single score;
                                # if it's an array of values, we get an array of scores
                                v_pen=best_L1_penalty_ct,
                                w_pen=w_pen_start_ct * np.exp(x[0]),

                                # suppress the analysis type message
                                quiet=True)
            t2 = time.time()
            temp_results.append((n_calls[0], x, score))
            print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-squared: %s" %
                  (n_calls[0], t2 - t1, x[0], score, 1 - score / SS))
            return score

        # the actual optimization
        print("Starting L2 Penalty optimization")
        results = differential_evolution(L2_obj_func, bounds=((-6, 6),))
        NEW_L2_pen_start_ct = w_pen_start_ct * np.exp(results.x[0])
        print("DE optimized L2 penalty: %s, using fixed L1 penalty: %s" % (NEW_L2_pen_start_ct, best_L1_penalty_ct,))
    # -----------------------------------------------------------------
    # Optimization of the L2 and L1 penalties simultaneously
    # -----------------------------------------------------------------
    best_L1_penalty_ct = float(1908.9329)

    # the actual optimization
    print("Starting L1-L2 Joint Penalty optimization")
    results = SC.joint_penalty_optimzation(X=X_control,
                                           Y=Y_pre_control,
                                           L1_pen_start=best_L1_penalty_ct,
                                           L2_pen_start=w_pen_start_ct,
                                           bounds=((-6, 6),) * 2,
                                           X_treat=X_treated,
                                           Y_treat=Y_pre_treated)

    NEW_best_L1_penalty_ct = results.x[0]
    best_L2_penalty = results.x[1]
    print("DE optimized L1 penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,))