From 37dd304fcf3554df7d961864785b8f493db5a968 Mon Sep 17 00:00:00 2001 From: Dylan W <139509928+dw-610@users.noreply.github.com> Date: Mon, 22 Apr 2024 11:06:21 -0500 Subject: [PATCH] Fix pandas FutureWarnings in calls to .groupby() (#1164) * added observed=False argument to all calls to .groupby() to retain current behavior and silence the pandas FutureWarning Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> * reverted notebook file to remote version to remove changes to the 'data' field Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> * updated only the line required in the notebook to silence the FutureWarning Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> --------- Signed-off-by: Dylan W <139509928+dw-610@users.noreply.github.com> --- README.rst | 2 +- ...y Behind Hotel Booking Cancellations.ipynb | 2 +- .../example_notebooks/dowhy_causal_api.ipynb | 4 ++-- ...mple_effect_of_memberrewards_program.ipynb | 2 +- .../gcm_supply_chain_dist_change.ipynb | 4 ++-- dowhy/causal_estimator.py | 2 +- .../distance_matching_estimator.py | 2 +- ...opensity_score_stratification_estimator.py | 8 ++++---- .../causal_refuters/dummy_outcome_refuter.py | 8 ++++---- .../confounder_distribution_interpreter.py | 4 ++-- .../propensity_balance_interpreter.py | 20 +++++++++---------- tests/do_sampler/test_pandas_do_api.py | 6 +++--- 12 files changed, 32 insertions(+), 32 deletions(-) diff --git a/README.rst b/README.rst index c87a2f153..ab6b0066b 100755 --- a/README.rst +++ b/README.rst @@ -251,7 +251,7 @@ you can use the namespace as follows. data['df'].causal.do(x='v0', # name of treatment variable variable_types={'v0': 'b', 'y': 'c', 'W0': 'c'}, outcome='y', - common_causes=['W0']).groupby('v0').mean().plot(y='y', kind='bar') + common_causes=['W0']).groupby('v0', observed=False).mean().plot(y='y', kind='bar') .. 
image:: https://raw.githubusercontent.com/microsoft/dowhy/main/docs/images/do_barplot.png diff --git a/docs/source/example_notebooks/DoWhy-The Causal Story Behind Hotel Booking Cancellations.ipynb b/docs/source/example_notebooks/DoWhy-The Causal Story Behind Hotel Booking Cancellations.ipynb index 40af4d487..f7472bcfe 100644 --- a/docs/source/example_notebooks/DoWhy-The Causal Story Behind Hotel Booking Cancellations.ipynb +++ b/docs/source/example_notebooks/DoWhy-The Causal Story Behind Hotel Booking Cancellations.ipynb @@ -154,7 +154,7 @@ "outputs": [], "source": [ "dataset = dataset[dataset.deposit_type==\"No Deposit\"]\n", - "dataset.groupby(['deposit_type','is_canceled']).count()" + "dataset.groupby(['deposit_type','is_canceled'], observed=False).count()" ] }, { diff --git a/docs/source/example_notebooks/dowhy_causal_api.ipynb b/docs/source/example_notebooks/dowhy_causal_api.ipynb index decae0f41..960dec44c 100644 --- a/docs/source/example_notebooks/dowhy_causal_api.ipynb +++ b/docs/source/example_notebooks/dowhy_causal_api.ipynb @@ -55,7 +55,7 @@ " variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},\n", " outcome=outcome,\n", " common_causes=[common_cause],\n", - " proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')" + " proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')" ] }, { @@ -69,7 +69,7 @@ " outcome=outcome,\n", " method='weighting', \n", " common_causes=[common_cause],\n", - " proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')" + " proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')" ] }, { diff --git a/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb b/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb index 258b006c1..9dd99a67c 100644 --- 
a/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb +++ b/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb @@ -281,7 +281,7 @@ "# For each customer, determine their average monthly spend before and after month i\n", "df_i_signupmonth = (\n", " df[df.signup_month.isin([0, i])]\n", - " .groupby([\"user_id\", \"signup_month\", \"treatment\"])\n", + " .groupby([\"user_id\", \"signup_month\", \"treatment\"], observed=False)\n", " .apply(\n", " lambda x: pd.Series(\n", " {\n", diff --git a/docs/source/example_notebooks/gcm_supply_chain_dist_change.ipynb b/docs/source/example_notebooks/gcm_supply_chain_dist_change.ipynb index 7abc43741..7565e0261 100644 --- a/docs/source/example_notebooks/gcm_supply_chain_dist_change.ipynb +++ b/docs/source/example_notebooks/gcm_supply_chain_dist_change.ipynb @@ -101,7 +101,7 @@ }, "outputs": [], "source": [ - "data.groupby(['week']).mean()[['received']].plot(kind='bar', title='average received', legend=False); " + "data.groupby(['week'], observed=False).mean()[['received']].plot(kind='bar', title='average received', legend=False); " ] }, { @@ -142,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.groupby(['week']).mean().plot(kind='bar', title='average', legend=True);" + "data.groupby(['week'], observed=False).mean().plot(kind='bar', title='average', legend=True);" ] }, { diff --git a/dowhy/causal_estimator.py b/dowhy/causal_estimator.py index cc10ce8e0..53a5efc35 100755 --- a/dowhy/causal_estimator.py +++ b/dowhy/causal_estimator.py @@ -234,7 +234,7 @@ class CausalEstimator: data[prefix + str(em)] = pd.qcut(data[em], num_quantiles, duplicates="drop") effect_modifier_names[i] = prefix + str(em) # Grouping by effect modifiers and computing effect separately - by_effect_mods = data.groupby(effect_modifier_names) + by_effect_mods = data.groupby(effect_modifier_names, observed=False) cond_est_fn = lambda x: self._do(self._treatment_value, x) - 
self._do(self._control_value, x) conditional_estimates = by_effect_mods.apply(estimate_effect_fn) # Deleting the temporary categorical columns diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py index 881041a5a..1cd99d8a9 100644 --- a/dowhy/causal_estimators/distance_matching_estimator.py +++ b/dowhy/causal_estimators/distance_matching_estimator.py @@ -219,7 +219,7 @@ class DistanceMatchingEstimator(CausalEstimator): for i in range(numtreatedunits): self.matched_indices_att[treated_df_index[i]] = control.iloc[indices[i]].index.tolist() else: - grouped = updated_df.groupby(self.exact_match_cols) + grouped = updated_df.groupby(self.exact_match_cols, observed=False) att = 0 for name, group in grouped: treated = group.loc[group[self._target_estimand.treatment_variable[0]] == 1] diff --git a/dowhy/causal_estimators/propensity_score_stratification_estimator.py b/dowhy/causal_estimators/propensity_score_stratification_estimator.py index 42c1c5a70..abc4f836a 100755 --- a/dowhy/causal_estimators/propensity_score_stratification_estimator.py +++ b/dowhy/causal_estimators/propensity_score_stratification_estimator.py @@ -140,7 +140,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator): num_strata, self.clipping_threshold, ) - num_ret_strata = clipped.groupby(["strata"]).count().reset_index() + num_ret_strata = clipped.groupby(["strata"], observed=False).count().reset_index() # At least 90% of the strata should be included in analysis if num_ret_strata.shape[0] >= 0.5 * num_strata: strata_found = True @@ -172,7 +172,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator): ) # sum weighted outcomes over all strata (weight by treated population) - weighted_outcomes = clipped.groupby("strata").agg( + weighted_outcomes = clipped.groupby("strata", observed=False).agg( {self._target_estimand.treatment_variable[0]: ["sum"], "dbar": ["sum"], "d_y": ["sum"], "dbar_y": 
["sum"]} ) weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()] @@ -233,7 +233,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator): data[self._target_estimand.treatment_variable[0]] * data[self._target_estimand.outcome_variable[0]] ) data["dbar_y"] = data["dbar"] * data[self._target_estimand.outcome_variable[0]] - stratified = data.groupby("strata") + stratified = data.groupby("strata", observed=False) clipped = stratified.filter( lambda strata: min( strata.loc[strata[self._target_estimand.treatment_variable[0]] == 1].shape[0], @@ -244,7 +244,7 @@ class PropensityScoreStratificationEstimator(PropensityScoreEstimator): self.logger.debug( "After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format( clipping_threshold, - clipped.groupby(["strata", self._target_estimand.treatment_variable[0]])[ + clipped.groupby(["strata", self._target_estimand.treatment_variable[0]], observed=False)[ self._target_estimand.outcome_variable ].count(), ) diff --git a/dowhy/causal_refuters/dummy_outcome_refuter.py b/dowhy/causal_refuters/dummy_outcome_refuter.py index a4a529b2e..c97ceb2df 100644 --- a/dowhy/causal_refuters/dummy_outcome_refuter.py +++ b/dowhy/causal_refuters/dummy_outcome_refuter.py @@ -748,7 +748,7 @@ def preprocess_data_by_treatment( variable_type = data[treatment_variable_name].dtypes if bool == variable_type: - groups = data.groupby(treatment_variable_name) + groups = data.groupby(treatment_variable_name, observed=False) return groups # We use string arguments to account for both 32 and 64 bit varaibles elif "float" in variable_type.name or "int" in variable_type.name: @@ -757,14 +757,14 @@ def preprocess_data_by_treatment( std_dev = data[treatment_variable_name].std() num_bins = (data.max() - data.min()) / (bucket_size_scale_factor * std_dev) data["bins"] = pd.cut(data[treatment_variable_name], num_bins) - groups = data.groupby("bins") + groups = 
data.groupby("bins", observed=False) data.drop("bins", axis=1, inplace=True) return groups elif "categorical" in variable_type.name: # Action for categorical variables - groups = data.groupby(treatment_variable_name) - groups = data.groupby("bins") + groups = data.groupby(treatment_variable_name, observed=False) + groups = data.groupby("bins", observed=False) return groups else: raise ValueError("Passed {}. Expected bool, float, int or categorical.".format(variable_type.name)) diff --git a/dowhy/interpreters/confounder_distribution_interpreter.py b/dowhy/interpreters/confounder_distribution_interpreter.py index eac67166b..4a15b119f 100644 --- a/dowhy/interpreters/confounder_distribution_interpreter.py +++ b/dowhy/interpreters/confounder_distribution_interpreter.py @@ -81,10 +81,10 @@ class ConfounderDistributionInterpreter(VisualInterpreter): # before weights are applied we count number rows in each category # which is equivalent to summing over weight=1 - barplot_df_before = df.groupby([self.var_name, treated]).size().reset_index(name="count") + barplot_df_before = df.groupby([self.var_name, treated], observed=False).size().reset_index(name="count") # after weights are applied we need to sum over the given weights - barplot_df_after = df.groupby([self.var_name, treated]).agg({"weight": np.sum}).reset_index() + barplot_df_after = df.groupby([self.var_name, treated], observed=False).agg({"weight": np.sum}).reset_index() barplot_df_after.rename(columns={"weight": "count"}, inplace=True) title1 = "Distribution of " + self.var_name + " before applying the weights" diff --git a/dowhy/interpreters/propensity_balance_interpreter.py b/dowhy/interpreters/propensity_balance_interpreter.py index 6b21150e8..b6510e7bc 100644 --- a/dowhy/interpreters/propensity_balance_interpreter.py +++ b/dowhy/interpreters/propensity_balance_interpreter.py @@ -41,38 +41,38 @@ class PropensityBalanceInterpreter(VisualInterpreter): ) # First, calculating mean differences by strata - mean_diff = 
df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"]).agg( + mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"], observed=False).agg( mean_w=("W", np.mean) ) mean_diff = ( - mean_diff.groupby(["common_cause_id", "strata"]).transform(lambda x: x.max() - x.min()).reset_index() + mean_diff.groupby(["common_cause_id", "strata"], observed=False).transform(lambda x: x.max() - x.min()).reset_index() ) mean_diff = mean_diff.query("v0==True") size_by_w_strata = ( - df_long.groupby(["common_cause_id", "strata"]).agg(size=("propensity_score", np.size)).reset_index() + df_long.groupby(["common_cause_id", "strata"], observed=False).agg(size=("propensity_score", np.size)).reset_index() ) - size_by_strata = df_long.groupby(["common_cause_id"]).agg(size=("propensity_score", np.size)).reset_index() + size_by_strata = df_long.groupby(["common_cause_id"], observed=False).agg(size=("propensity_score", np.size)).reset_index() size_by_strata = pd.merge(size_by_w_strata, size_by_strata, on="common_cause_id") mean_diff_strata = pd.merge(mean_diff, size_by_strata, on=("common_cause_id", "strata")) - stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"]).agg(stddev=("W", np.std)).reset_index() + stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"], observed=False).agg(stddev=("W", np.std)).reset_index() mean_diff_strata = pd.merge(mean_diff_strata, stddev_by_w_strata, on=["common_cause_id", "strata"]) mean_diff_strata["scaled_mean"] = (mean_diff_strata["mean_w"] / mean_diff_strata["stddev"]) * ( mean_diff_strata["size_x"] / mean_diff_strata["size_y"] ) mean_diff_strata = ( - mean_diff_strata.groupby("common_cause_id").agg(std_mean_diff=("scaled_mean", np.sum)).reset_index() + mean_diff_strata.groupby("common_cause_id", observed=False).agg(std_mean_diff=("scaled_mean", np.sum)).reset_index() ) # Second, without strata - mean_diff_overall = df_long.groupby(self.estimate._treatment_name + 
["common_cause_id"]).agg( + mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"], observed=False).agg( mean_w=("W", np.mean) ) mean_diff_overall = ( - mean_diff_overall.groupby("common_cause_id").transform(lambda x: x.max() - x.min()).reset_index() + mean_diff_overall.groupby("common_cause_id", observed=False).transform(lambda x: x.max() - x.min()).reset_index() ) mean_diff_overall = mean_diff_overall[mean_diff_overall[self.estimate._treatment_name[0]] == True] # TODO - stddev_overall = df_long.groupby(["common_cause_id"]).agg(stddev=("W", np.std)).reset_index() + stddev_overall = df_long.groupby(["common_cause_id"], observed=False).agg(stddev=("W", np.std)).reset_index() mean_diff_overall = pd.merge(mean_diff_overall, stddev_overall, on=["common_cause_id"]) mean_diff_overall["std_mean_diff"] = mean_diff_overall["mean_w"] / mean_diff_overall["stddev"] @@ -86,7 +86,7 @@ class PropensityBalanceInterpreter(VisualInterpreter): plt.style.use("seaborn-white") fig, ax = plt.subplots(1, 1) - for label, subdf in plot_df.groupby("common_cause_id"): + for label, subdf in plot_df.groupby("common_cause_id", observed=False): subdf.plot(kind="line", x="sample", y="std_mean_diff", ax=ax, label=label) plt.legend(title="Common causes") plt.ylabel("Standardized mean difference between treatment and control") diff --git a/tests/do_sampler/test_pandas_do_api.py b/tests/do_sampler/test_pandas_do_api.py index 00fa101e7..0c8570a5c 100755 --- a/tests/do_sampler/test_pandas_do_api.py +++ b/tests/do_sampler/test_pandas_do_api.py @@ -200,7 +200,7 @@ class TestPandasDoAPI(object): data["df"].causal.do( x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"] - ).groupby("v0").mean() + ).groupby("v0", observed=False).mean() assert True @mark.parametrize( @@ -216,7 +216,7 @@ class TestPandasDoAPI(object): data["df"].causal.do( x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, 
common_causes=["W0"] - ).groupby("v0").mean() + ).groupby("v0", observed=False).mean() assert True @mark.parametrize( @@ -232,7 +232,7 @@ class TestPandasDoAPI(object): data["df"].causal.do( x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"] - ).groupby("v0").mean() + ).groupby("v0", observed=False).mean() assert True @mark.parametrize(