Vectorize operations for propensity score matching (#1179)

* Add vector operations Added todo comment Signed-off-by: Rahul Shrestha <rahulshrestha0101@gmail.com> formatting fix Signed-off-by: Rahul Shrestha <rahulshrestha0101@gmail.com> bug fix with string name Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com> * Vectorize remaining list Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com> --------- Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com>
2024-06-04 12:20:40 +02:00 · 2024-06-04 12:20:40 +02:00 · 72e3ba055e
--- a/dowhy/causal_estimators/propensity_score_matching_estimator.py
+++ b/dowhy/causal_estimators/propensity_score_matching_estimator.py
@ -1,5 +1,6 @@
 from typing import Any, List, Optional, Union

+import numpy as np
 import pandas as pd
 from sklearn.neighbors import NearestNeighbors

@ -120,7 +121,7 @@ class PropensityScoreMatchingEstimator(PropensityScoreEstimator):

        # TODO remove neighbors that are more than a given radius apart

-        # estimate ATT on treated by summing over difference between matched neighbors
+        # Estimating ATT on treated by summing over difference between matched neighbors
        control_neighbors = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(
            control[self.propensity_score_column].values.reshape(-1, 1)
        )
@ -129,27 +130,28 @@ class PropensityScoreMatchingEstimator(PropensityScoreEstimator):
        self.logger.debug(distances)

        att = 0
-        numtreatedunits = treated.shape[0]
-        for i in range(numtreatedunits):
-            treated_outcome = treated.iloc[i][self._target_estimand.outcome_variable[0]].item()
-            control_outcome = control.iloc[indices[i]][self._target_estimand.outcome_variable[0]].item()
-            att += treated_outcome - control_outcome
+        outcome_variable = self._target_estimand.outcome_variable[0]
+        treated_outcomes = treated[outcome_variable]
+        control_outcomes = list(control.iloc[indices.flatten()][outcome_variable])

-        att /= numtreatedunits
+        att = (treated_outcomes - control_outcomes).mean()
+
+        # Estimating ATC

-        # Now computing ATC
        treated_neighbors = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(
            treated[self.propensity_score_column].values.reshape(-1, 1)
        )
        distances, indices = treated_neighbors.kneighbors(control[self.propensity_score_column].values.reshape(-1, 1))
-        atc = 0
-        numcontrolunits = control.shape[0]
-        for i in range(numcontrolunits):
-            control_outcome = control.iloc[i][self._target_estimand.outcome_variable[0]].item()
-            treated_outcome = treated.iloc[indices[i]][self._target_estimand.outcome_variable[0]].item()
-            atc += treated_outcome - control_outcome

-        atc /= numcontrolunits
+        atc = 0
+        outcome_variable = self._target_estimand.outcome_variable[0]
+        control_outcomes = control[outcome_variable]
+        treated_outcomes = list(treated.iloc[indices.flatten()][outcome_variable])
+
+        atc = (treated_outcomes - control_outcomes).mean()
+
+        numtreatedunits = treated.shape[0]
+        numcontrolunits = control.shape[0]

        if target_units == "att":
            est = att