Fix pandas FutureWarnings in calls to .groupby() (#1164)
* added observed=False argument to all calls to .groupby() to retain current behavior and silence the pandas FutureWarning Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> * reverted notebook file to remote version to remove changes to the 'data' field Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> * updated only the line required in the notebook to silence the FutureWarning Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> --------- Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com>
This commit is contained in:
Родитель
e0838cc291
Коммит
37dd304fcf
|
@ -251,7 +251,7 @@ you can use the namespace as follows.
|
||||||
data['df'].causal.do(x='v0', # name of treatment variable
|
data['df'].causal.do(x='v0', # name of treatment variable
|
||||||
variable_types={'v0': 'b', 'y': 'c', 'W0': 'c'},
|
variable_types={'v0': 'b', 'y': 'c', 'W0': 'c'},
|
||||||
outcome='y',
|
outcome='y',
|
||||||
common_causes=['W0']).groupby('v0').mean().plot(y='y', kind='bar')
|
common_causes=['W0']).groupby('v0', observed=False).mean().plot(y='y', kind='bar')
|
||||||
|
|
||||||
.. image:: https://raw.githubusercontent.com/microsoft/dowhy/main/docs/images/do_barplot.png
|
.. image:: https://raw.githubusercontent.com/microsoft/dowhy/main/docs/images/do_barplot.png
|
||||||
|
|
||||||
|
|
|
@ -154,7 +154,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"dataset = dataset[dataset.deposit_type==\"No Deposit\"]\n",
|
"dataset = dataset[dataset.deposit_type==\"No Deposit\"]\n",
|
||||||
"dataset.groupby(['deposit_type','is_canceled']).count()"
|
"dataset.groupby(['deposit_type','is_canceled'], observed=False).count()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -55,7 +55,7 @@
|
||||||
" variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},\n",
|
" variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},\n",
|
||||||
" outcome=outcome,\n",
|
" outcome=outcome,\n",
|
||||||
" common_causes=[common_cause],\n",
|
" common_causes=[common_cause],\n",
|
||||||
" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
|
" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -69,7 +69,7 @@
|
||||||
" outcome=outcome,\n",
|
" outcome=outcome,\n",
|
||||||
" method='weighting', \n",
|
" method='weighting', \n",
|
||||||
" common_causes=[common_cause],\n",
|
" common_causes=[common_cause],\n",
|
||||||
" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
|
" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -281,7 +281,7 @@
|
||||||
"# For each customer, determine their average monthly spend before and after month i\n",
|
"# For each customer, determine their average monthly spend before and after month i\n",
|
||||||
"df_i_signupmonth = (\n",
|
"df_i_signupmonth = (\n",
|
||||||
" df[df.signup_month.isin([0, i])]\n",
|
" df[df.signup_month.isin([0, i])]\n",
|
||||||
" .groupby([\"user_id\", \"signup_month\", \"treatment\"])\n",
|
" .groupby([\"user_id\", \"signup_month\", \"treatment\"], observed=False)\n",
|
||||||
" .apply(\n",
|
" .apply(\n",
|
||||||
" lambda x: pd.Series(\n",
|
" lambda x: pd.Series(\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
|
|
|
@ -101,7 +101,7 @@
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"data.groupby(['week']).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
|
"data.groupby(['week'], observed=False).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -142,7 +142,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"data.groupby(['week']).mean().plot(kind='bar', title='average', legend=True);"
|
"data.groupby(['week'], observed=False).mean().plot(kind='bar', title='average', legend=True);"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -234,7 +234,7 @@ class CausalEstimator:
|
||||||
data[prefix + str(em)] = pd.qcut(data[em], num_quantiles, duplicates="drop")
|
data[prefix + str(em)] = pd.qcut(data[em], num_quantiles, duplicates="drop")
|
||||||
effect_modifier_names[i] = prefix + str(em)
|
effect_modifier_names[i] = prefix + str(em)
|
||||||
# Grouping by effect modifiers and computing effect separately
|
# Grouping by effect modifiers and computing effect separately
|
||||||
by_effect_mods = data.groupby(effect_modifier_names)
|
by_effect_mods = data.groupby(effect_modifier_names, observed=False)
|
||||||
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)
|
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)
|
||||||
conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
|
conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
|
||||||
# Deleting the temporary categorical columns
|
# Deleting the temporary categorical columns
|
||||||
|
|
|
@ -219,7 +219,7 @@ class DistanceMatchingEstimator(CausalEstimator):
|
||||||
for i in range(numtreatedunits):
|
for i in range(numtreatedunits):
|
||||||
self.matched_indices_att[treated_df_index[i]] = control.iloc[indices[i]].index.tolist()
|
self.matched_indices_att[treated_df_index[i]] = control.iloc[indices[i]].index.tolist()
|
||||||
else:
|
else:
|
||||||
grouped = updated_df.groupby(self.exact_match_cols)
|
grouped = updated_df.groupby(self.exact_match_cols, observed=False)
|
||||||
att = 0
|
att = 0
|
||||||
for name, group in grouped:
|
for name, group in grouped:
|
||||||
treated = group.loc[group[self._target_estimand.treatment_variable[0]] == 1]
|
treated = group.loc[group[self._target_estimand.treatment_variable[0]] == 1]
|
||||||
|
|
|
@ -140,7 +140,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator):
|
||||||
num_strata,
|
num_strata,
|
||||||
self.clipping_threshold,
|
self.clipping_threshold,
|
||||||
)
|
)
|
||||||
num_ret_strata = clipped.groupby(["strata"]).count().reset_index()
|
num_ret_strata = clipped.groupby(["strata"], observed=False).count().reset_index()
|
||||||
# At least 50% of the strata should be included in analysis (matches the 0.5 * num_strata check; comment previously said 90% - confirm intended threshold)
|
# At least 50% of the strata should be included in analysis (matches the 0.5 * num_strata check; comment previously said 90% - confirm intended threshold)
|
||||||
if num_ret_strata.shape[0] >= 0.5 * num_strata:
|
if num_ret_strata.shape[0] >= 0.5 * num_strata:
|
||||||
strata_found = True
|
strata_found = True
|
||||||
|
@ -172,7 +172,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator):
|
||||||
)
|
)
|
||||||
|
|
||||||
# sum weighted outcomes over all strata (weight by treated population)
|
# sum weighted outcomes over all strata (weight by treated population)
|
||||||
weighted_outcomes = clipped.groupby("strata").agg(
|
weighted_outcomes = clipped.groupby("strata", observed=False).agg(
|
||||||
{self._target_estimand.treatment_variable[0]: ["sum"], "dbar": ["sum"], "d_y": ["sum"], "dbar_y": ["sum"]}
|
{self._target_estimand.treatment_variable[0]: ["sum"], "dbar": ["sum"], "d_y": ["sum"], "dbar_y": ["sum"]}
|
||||||
)
|
)
|
||||||
weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()]
|
weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()]
|
||||||
|
@ -233,7 +233,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator):
|
||||||
data[self._target_estimand.treatment_variable[0]] * data[self._target_estimand.outcome_variable[0]]
|
data[self._target_estimand.treatment_variable[0]] * data[self._target_estimand.outcome_variable[0]]
|
||||||
)
|
)
|
||||||
data["dbar_y"] = data["dbar"] * data[self._target_estimand.outcome_variable[0]]
|
data["dbar_y"] = data["dbar"] * data[self._target_estimand.outcome_variable[0]]
|
||||||
stratified = data.groupby("strata")
|
stratified = data.groupby("strata", observed=False)
|
||||||
clipped = stratified.filter(
|
clipped = stratified.filter(
|
||||||
lambda strata: min(
|
lambda strata: min(
|
||||||
strata.loc[strata[self._target_estimand.treatment_variable[0]] == 1].shape[0],
|
strata.loc[strata[self._target_estimand.treatment_variable[0]] == 1].shape[0],
|
||||||
|
@ -244,7 +244,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator):
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format(
|
"After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format(
|
||||||
clipping_threshold,
|
clipping_threshold,
|
||||||
clipped.groupby(["strata", self._target_estimand.treatment_variable[0]])[
|
clipped.groupby(["strata", self._target_estimand.treatment_variable[0]], observed=False)[
|
||||||
self._target_estimand.outcome_variable
|
self._target_estimand.outcome_variable
|
||||||
].count(),
|
].count(),
|
||||||
)
|
)
|
||||||
|
|
|
@ -748,7 +748,7 @@ def preprocess_data_by_treatment(
|
||||||
variable_type = data[treatment_variable_name].dtypes
|
variable_type = data[treatment_variable_name].dtypes
|
||||||
|
|
||||||
if bool == variable_type:
|
if bool == variable_type:
|
||||||
groups = data.groupby(treatment_variable_name)
|
groups = data.groupby(treatment_variable_name, observed=False)
|
||||||
return groups
|
return groups
|
||||||
# We use string arguments to account for both 32 and 64 bit variables
|
# We use string arguments to account for both 32 and 64 bit variables
|
||||||
elif "float" in variable_type.name or "int" in variable_type.name:
|
elif "float" in variable_type.name or "int" in variable_type.name:
|
||||||
|
@ -757,14 +757,14 @@ def preprocess_data_by_treatment(
|
||||||
std_dev = data[treatment_variable_name].std()
|
std_dev = data[treatment_variable_name].std()
|
||||||
num_bins = (data.max() - data.min()) / (bucket_size_scale_factor * std_dev)
|
num_bins = (data.max() - data.min()) / (bucket_size_scale_factor * std_dev)
|
||||||
data["bins"] = pd.cut(data[treatment_variable_name], num_bins)
|
data["bins"] = pd.cut(data[treatment_variable_name], num_bins)
|
||||||
groups = data.groupby("bins")
|
groups = data.groupby("bins", observed=False)
|
||||||
data.drop("bins", axis=1, inplace=True)
|
data.drop("bins", axis=1, inplace=True)
|
||||||
return groups
|
return groups
|
||||||
|
|
||||||
elif "categorical" in variable_type.name:
|
elif "categorical" in variable_type.name:
|
||||||
# Action for categorical variables
|
# Action for categorical variables
|
||||||
groups = data.groupby(treatment_variable_name)
|
groups = data.groupby(treatment_variable_name, observed=False)
|
||||||
groups = data.groupby("bins")
|
groups = data.groupby("bins", observed=False)
|
||||||
return groups
|
return groups
|
||||||
else:
|
else:
|
||||||
raise ValueError("Passed {}. Expected bool, float, int or categorical.".format(variable_type.name))
|
raise ValueError("Passed {}. Expected bool, float, int or categorical.".format(variable_type.name))
|
||||||
|
|
|
@ -81,10 +81,10 @@ class ConfounderDistributionInterpreter(VisualInterpreter):
|
||||||
|
|
||||||
# before weights are applied we count number rows in each category
|
# before weights are applied we count number rows in each category
|
||||||
# which is equivalent to summing over weight=1
|
# which is equivalent to summing over weight=1
|
||||||
barplot_df_before = df.groupby([self.var_name, treated]).size().reset_index(name="count")
|
barplot_df_before = df.groupby([self.var_name, treated], observed=False).size().reset_index(name="count")
|
||||||
|
|
||||||
# after weights are applied we need to sum over the given weights
|
# after weights are applied we need to sum over the given weights
|
||||||
barplot_df_after = df.groupby([self.var_name, treated]).agg({"weight": np.sum}).reset_index()
|
barplot_df_after = df.groupby([self.var_name, treated], observed=False).agg({"weight": np.sum}).reset_index()
|
||||||
barplot_df_after.rename(columns={"weight": "count"}, inplace=True)
|
barplot_df_after.rename(columns={"weight": "count"}, inplace=True)
|
||||||
|
|
||||||
title1 = "Distribution of " + self.var_name + " before applying the weights"
|
title1 = "Distribution of " + self.var_name + " before applying the weights"
|
||||||
|
|
|
@ -41,38 +41,38 @@ class PropensityBalanceInterpreter(VisualInterpreter):
|
||||||
)
|
)
|
||||||
|
|
||||||
# First, calculating mean differences by strata
|
# First, calculating mean differences by strata
|
||||||
mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"]).agg(
|
mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"], observed=False).agg(
|
||||||
mean_w=("W", np.mean)
|
mean_w=("W", np.mean)
|
||||||
)
|
)
|
||||||
mean_diff = (
|
mean_diff = (
|
||||||
mean_diff.groupby(["common_cause_id", "strata"]).transform(lambda x: x.max() - x.min()).reset_index()
|
mean_diff.groupby(["common_cause_id", "strata"], observed=False).transform(lambda x: x.max() - x.min()).reset_index()
|
||||||
)
|
)
|
||||||
mean_diff = mean_diff.query("v0==True")
|
mean_diff = mean_diff.query("v0==True")
|
||||||
size_by_w_strata = (
|
size_by_w_strata = (
|
||||||
df_long.groupby(["common_cause_id", "strata"]).agg(size=("propensity_score", np.size)).reset_index()
|
df_long.groupby(["common_cause_id", "strata"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
|
||||||
)
|
)
|
||||||
size_by_strata = df_long.groupby(["common_cause_id"]).agg(size=("propensity_score", np.size)).reset_index()
|
size_by_strata = df_long.groupby(["common_cause_id"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
|
||||||
size_by_strata = pd.merge(size_by_w_strata, size_by_strata, on="common_cause_id")
|
size_by_strata = pd.merge(size_by_w_strata, size_by_strata, on="common_cause_id")
|
||||||
mean_diff_strata = pd.merge(mean_diff, size_by_strata, on=("common_cause_id", "strata"))
|
mean_diff_strata = pd.merge(mean_diff, size_by_strata, on=("common_cause_id", "strata"))
|
||||||
|
|
||||||
stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"]).agg(stddev=("W", np.std)).reset_index()
|
stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"], observed=False).agg(stddev=("W", np.std)).reset_index()
|
||||||
mean_diff_strata = pd.merge(mean_diff_strata, stddev_by_w_strata, on=["common_cause_id", "strata"])
|
mean_diff_strata = pd.merge(mean_diff_strata, stddev_by_w_strata, on=["common_cause_id", "strata"])
|
||||||
mean_diff_strata["scaled_mean"] = (mean_diff_strata["mean_w"] / mean_diff_strata["stddev"]) * (
|
mean_diff_strata["scaled_mean"] = (mean_diff_strata["mean_w"] / mean_diff_strata["stddev"]) * (
|
||||||
mean_diff_strata["size_x"] / mean_diff_strata["size_y"]
|
mean_diff_strata["size_x"] / mean_diff_strata["size_y"]
|
||||||
)
|
)
|
||||||
mean_diff_strata = (
|
mean_diff_strata = (
|
||||||
mean_diff_strata.groupby("common_cause_id").agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
|
mean_diff_strata.groupby("common_cause_id", observed=False).agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Second, without strata
|
# Second, without strata
|
||||||
mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"]).agg(
|
mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"], observed=False).agg(
|
||||||
mean_w=("W", np.mean)
|
mean_w=("W", np.mean)
|
||||||
)
|
)
|
||||||
mean_diff_overall = (
|
mean_diff_overall = (
|
||||||
mean_diff_overall.groupby("common_cause_id").transform(lambda x: x.max() - x.min()).reset_index()
|
mean_diff_overall.groupby("common_cause_id", observed=False).transform(lambda x: x.max() - x.min()).reset_index()
|
||||||
)
|
)
|
||||||
mean_diff_overall = mean_diff_overall[mean_diff_overall[self.estimate._treatment_name[0]] == True] # TODO
|
mean_diff_overall = mean_diff_overall[mean_diff_overall[self.estimate._treatment_name[0]] == True] # TODO
|
||||||
stddev_overall = df_long.groupby(["common_cause_id"]).agg(stddev=("W", np.std)).reset_index()
|
stddev_overall = df_long.groupby(["common_cause_id"], observed=False).agg(stddev=("W", np.std)).reset_index()
|
||||||
mean_diff_overall = pd.merge(mean_diff_overall, stddev_overall, on=["common_cause_id"])
|
mean_diff_overall = pd.merge(mean_diff_overall, stddev_overall, on=["common_cause_id"])
|
||||||
mean_diff_overall["std_mean_diff"] = mean_diff_overall["mean_w"] / mean_diff_overall["stddev"]
|
mean_diff_overall["std_mean_diff"] = mean_diff_overall["mean_w"] / mean_diff_overall["stddev"]
|
||||||
|
|
||||||
|
@ -86,7 +86,7 @@ class PropensityBalanceInterpreter(VisualInterpreter):
|
||||||
|
|
||||||
plt.style.use("seaborn-white")
|
plt.style.use("seaborn-white")
|
||||||
fig, ax = plt.subplots(1, 1)
|
fig, ax = plt.subplots(1, 1)
|
||||||
for label, subdf in plot_df.groupby("common_cause_id"):
|
for label, subdf in plot_df.groupby("common_cause_id", observed=False):
|
||||||
subdf.plot(kind="line", x="sample", y="std_mean_diff", ax=ax, label=label)
|
subdf.plot(kind="line", x="sample", y="std_mean_diff", ax=ax, label=label)
|
||||||
plt.legend(title="Common causes")
|
plt.legend(title="Common causes")
|
||||||
plt.ylabel("Standardized mean difference between treatment and control")
|
plt.ylabel("Standardized mean difference between treatment and control")
|
||||||
|
|
|
@ -200,7 +200,7 @@ class TestPandasDoAPI(object):
|
||||||
|
|
||||||
data["df"].causal.do(
|
data["df"].causal.do(
|
||||||
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
||||||
).groupby("v0").mean()
|
).groupby("v0", observed=False).mean()
|
||||||
assert True
|
assert True
|
||||||
|
|
||||||
@mark.parametrize(
|
@mark.parametrize(
|
||||||
|
@ -216,7 +216,7 @@ class TestPandasDoAPI(object):
|
||||||
|
|
||||||
data["df"].causal.do(
|
data["df"].causal.do(
|
||||||
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
||||||
).groupby("v0").mean()
|
).groupby("v0", observed=False).mean()
|
||||||
assert True
|
assert True
|
||||||
|
|
||||||
@mark.parametrize(
|
@mark.parametrize(
|
||||||
|
@ -232,7 +232,7 @@ class TestPandasDoAPI(object):
|
||||||
|
|
||||||
data["df"].causal.do(
|
data["df"].causal.do(
|
||||||
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
|
||||||
).groupby("v0").mean()
|
).groupby("v0", observed=False).mean()
|
||||||
assert True
|
assert True
|
||||||
|
|
||||||
@mark.parametrize(
|
@mark.parametrize(
|
||||||
|
|
Загрузка…
Ссылка в новой задаче