Added a new simple-iv dataset for testing the regression discontinuity method

This commit is contained in:
Amit Sharma 2020-01-07 14:14:48 +05:30
Родитель 6cab985d4a
Коммит 436273d15d
4 изменённых файлов: 422 добавлений и 53 удалений

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,4 +1,4 @@
"""Module for generating some sample datasets.
"""Module for generating some sample datasets.
"""
@ -37,7 +37,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
c2 = np.random.uniform(0, range_c2, num_common_causes)
if num_instruments > 0:
range_cz = beta*0.5
range_cz = beta
p = np.random.uniform(0, 1, num_instruments)
Z = np.zeros((num_samples, num_instruments))
for i in range(num_instruments):
@ -100,6 +100,88 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
data = data.astype({outcome: 'bool'}, copy=False)
# Now specifying the corresponding graph strings
dot_graph = create_dot_graph(treatments, outcome, common_causes, instruments, effect_modifiers)
# Now writing the gml graph
gml_graph = create_gml_graph(treatments, outcome, common_causes, instruments, effect_modifiers)
ret_dict = {
"df": data,
"treatment_name": treatments,
"outcome_name": outcome,
"common_causes_names": common_causes,
"instrument_names": instruments,
"effect_modifier_names": effect_modifiers,
"dot_graph": dot_graph,
"gml_graph": gml_graph,
"ate": ate
}
return ret_dict
def simple_iv_dataset(beta, num_samples,
                      num_treatments=1,
                      treatment_is_binary=True,
                      outcome_is_binary=False):
    """Generate a simple instrumental-variable dataset.

    The generated data contain one instrument (``Z0``), one common cause
    (``W0``), ``num_treatments`` treatments (``v0``, ``v1``, ...) and a
    single outcome ``y``. The treatment is a linear function of the
    instrument and the common cause plus standard normal noise; the outcome
    is a linear function of the treatment(s) and the common cause.

    :param beta: Causal effect of the treatment(s) on the outcome. Either a
        scalar, which is used for every treatment, or an array-like holding
        one value per treatment.
    :param num_samples: Number of rows to generate.
    :param num_treatments: Number of treatment columns.
    :param treatment_is_binary: If True, the treatment values are passed
        element-wise through ``stochastically_convert_to_binary`` and the
        treatment columns are cast to ``bool``.
    :param outcome_is_binary: If True, the outcome column is cast to ``bool``.
    :returns: A dict with the keys ``df``, ``treatment_name``,
        ``outcome_name``, ``common_causes_names``, ``instrument_names``,
        ``effect_modifier_names`` (always ``None``), ``dot_graph``,
        ``gml_graph`` and ``ate``.
    :raises ValueError: If ``beta`` holds more than one value and its length
        differs from ``num_treatments``.
    """
    num_instruments = 1
    num_common_causes = 1

    # Normalise beta to a 1-D float array with one entry per treatment.
    # A single value (scalar or length-1 sequence) is shared by all treatments.
    beta = np.asarray(beta, dtype=float).ravel()
    if beta.size == 1:
        beta = np.repeat(beta, num_treatments)
    elif beta.size != num_treatments:
        raise ValueError(
            "beta must be a scalar or contain exactly num_treatments values.")

    # Coefficients: common cause -> treatment (c1), common cause -> outcome (c2).
    c1 = np.random.uniform(0, 1, (num_common_causes, num_treatments))
    c2 = np.random.uniform(0, 1, num_common_causes)
    # Instrument -> treatment coefficient is drawn within +/-5% of beta,
    # whereas c1 and c2 are drawn from [0, 1).
    range_cz = beta
    cz = np.random.uniform(range_cz - (range_cz * 0.05),
                           range_cz + (range_cz * 0.05),
                           (num_instruments, num_treatments))

    W = np.random.uniform(0, 1, (num_samples, num_common_causes))
    Z = np.random.normal(0, 1, (num_samples, num_instruments))
    t = np.random.normal(0, 1, (num_samples, num_treatments)) + Z @ cz + W @ c1
    if treatment_is_binary:
        # presumably maps each real value to 0/1 at random; the helper is
        # defined elsewhere in this module -- verify there.
        t = np.vectorize(stochastically_convert_to_binary)(t)

    def _compute_y(treatment_values):
        """Outcome as a linear function of the treatments and common cause."""
        return treatment_values @ beta + W @ c2

    y = _compute_y(t)

    # Column names, in the same order as the stacked data below.
    instruments = ["Z" + str(i) for i in range(num_instruments)]
    common_causes = ["W" + str(i) for i in range(num_common_causes)]
    treatments = ["v" + str(i) for i in range(num_treatments)]
    outcome = "y"

    # True average effect: mean outcome difference between setting every
    # treatment to 1 and setting every treatment to 0.
    ate = np.mean(_compute_y(np.ones((num_samples, num_treatments)))
                  - _compute_y(np.zeros((num_samples, num_treatments))))

    col_names = instruments + common_causes + treatments + [outcome]
    data = pd.DataFrame(np.column_stack((Z, W, t, y)), columns=col_names)

    # Specifying the correct dtypes
    if treatment_is_binary:
        data = data.astype({tname: 'bool' for tname in treatments}, copy=False)
    if outcome_is_binary:
        # NOTE(review): y is not binarised before this cast, so every
        # non-zero outcome becomes True -- confirm whether a stochastic
        # conversion (as applied to the treatment) is intended here.
        data = data.astype({outcome: 'bool'}, copy=False)

    # Graph strings describing the causal structure in DOT and GML form.
    dot_graph = create_dot_graph(treatments, outcome, common_causes, instruments)
    gml_graph = create_gml_graph(treatments, outcome, common_causes, instruments)

    ret_dict = {
        "df": data,
        "treatment_name": treatments,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "instrument_names": instruments,
        "effect_modifier_names": None,
        "dot_graph": dot_graph,
        "gml_graph": gml_graph,
        "ate": ate
    }
    return ret_dict
def create_dot_graph(treatments, outcome, common_causes,
instruments, effect_modifiers=[]):
dot_graph = ('digraph {{'
' U[label="Unobserved Confounders"];'
' U->{0};'
@ -112,7 +194,10 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
dot_graph += " ".join([v + "-> " + outcome + ";" for v in common_causes])
dot_graph += " ".join([v + "-> " + outcome + ";" for v in effect_modifiers])
dot_graph = dot_graph + "}"
# Now writing the gml graph
return dot_graph
def create_gml_graph(treatments, outcome, common_causes,
instruments, effect_modifiers=[]):
gml_graph = ('graph[directed 1'
'node[ id "{0}" label "{0}"]'
'node[ id "{1}" label "{1}"]'
@ -132,18 +217,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
gml_graph = gml_graph + " ".join(['edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in common_causes])
gml_graph = gml_graph + " ".join(['node[ id "{0}" label "{0}"] edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in effect_modifiers])
gml_graph = gml_graph + ']'
ret_dict = {
"df": data,
"treatment_name": treatments,
"outcome_name": outcome,
"common_causes_names": common_causes,
"instrument_names": instruments,
"effect_modifier_names": effect_modifiers,
"dot_graph": dot_graph,
"gml_graph": gml_graph,
"ate": ate
}
return ret_dict
return gml_graph
def xy_dataset(num_samples, effect=True, sd_error=1):
treatment = 'Treatment'

Просмотреть файл

@ -18,7 +18,8 @@ class TestEstimator(object):
treatment_is_binary=True,
outcome_is_binary=False,
method_params=None):
data = dowhy.datasets.linear_dataset(beta=beta,
if dataset == "linear":
data = dowhy.datasets.linear_dataset(beta=beta,
num_common_causes=num_common_causes,
num_instruments=num_instruments,
num_effect_modifiers = num_effect_modifiers,
@ -26,6 +27,14 @@ class TestEstimator(object):
num_samples=num_samples,
treatment_is_binary=treatment_is_binary,
outcome_is_binary = outcome_is_binary)
elif dataset == "simple-iv":
data = dowhy.datasets.simple_iv_dataset(beta=beta,
num_treatments = num_treatments,
num_samples = num_samples,
treatment_is_binary=treatment_is_binary,
outcome_is_binary = outcome_is_binary)
else:
raise ValueError("Dataset type not supported.")
model = CausalModel(
data=data['df'],
@ -64,6 +73,7 @@ class TestEstimator(object):
num_effect_modifiers=[0,], num_treatments=[1,],
treatment_is_binary=[True,],
outcome_is_binary=[False,],
dataset = "linear",
method_params=None):
args_dict = {
'num_common_causes': num_common_causes,
@ -77,6 +87,7 @@ class TestEstimator(object):
configs = [dict(zip(keys, v)) for v in itertools.product(*values)]
for cfg in configs:
print("\nConfig:", cfg)
cfg['dataset'] = dataset
cfg['method_params']= method_params
self.average_treatment_effect_test(**cfg)

Просмотреть файл

@ -9,7 +9,7 @@ class TestRegressionDiscontinuityEstimator(object):
"num_common_causes", "num_instruments",
"num_effect_modifiers", "num_treatments",
"treatment_is_binary", "outcome_is_binary"],
[(0.4, RegressionDiscontinuityEstimator, [0,1], [2,], [0,1], [1,], [True,], [False,]),])
[(0.2, RegressionDiscontinuityEstimator, [1], [1,], [0], [1,], [True,], [False,]),])
def test_average_treatment_effect(self, error_tolerance, Estimator,
num_common_causes, num_instruments, num_effect_modifiers,
num_treatments, treatment_is_binary, outcome_is_binary
@ -22,6 +22,7 @@ class TestRegressionDiscontinuityEstimator(object):
num_treatments=num_treatments,
treatment_is_binary=treatment_is_binary,
outcome_is_binary=outcome_is_binary,
method_params ={'rd_variable_name':'Z1',
dataset="simple-iv",
method_params ={'rd_variable_name':'Z0',
'rd_threshold_value':0.5,
'rd_bandwidth': 0.2})