From a1dcccbc805cb28aa4840fe9bce8338278632a50 Mon Sep 17 00:00:00 2001
From: Patrick Bloebaum <bloebp@amazon.com>
Date: Fri, 3 Mar 2023 09:07:44 -0800
Subject: [PATCH] Change one-hot-encoding behaviour in gcm module

Previously, one dimension was dropped when encoding binary categorical features (drop="if_binary"). However, this requires assuming that there are no unknown categories, since an unknown category would be mapped to a zero vector as well, which then coincides with the encoding of the dropped category. Now, there are as many dimensions as categories, which allows unknown categories to be mapped to a zero vector without ambiguity.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
---
 dowhy/gcm/util/general.py      |  2 +-
 tests/gcm/util/test_general.py | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/dowhy/gcm/util/general.py b/dowhy/gcm/util/general.py
index c0a7a2018..920e41f87 100644
--- a/dowhy/gcm/util/general.py
+++ b/dowhy/gcm/util/general.py
@@ -61,7 +61,7 @@ def fit_one_hot_encoders(X: np.ndarray) -> Dict[int, OneHotEncoder]:
     one_hot_encoders = {}
     for column in range(X.shape[1]):
         if isinstance(X[0, column], str):
-            one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
+            one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore")
             one_hot_encoders[column].fit(X[:, column].reshape(-1, 1))
 
     return one_hot_encoders
diff --git a/tests/gcm/util/test_general.py b/tests/gcm/util/test_general.py
index 27584603f..d3c5e5b89 100644
--- a/tests/gcm/util/test_general.py
+++ b/tests/gcm/util/test_general.py
@@ -1,7 +1,8 @@
 import numpy as np
 import pandas as pd
+from _pytest.python_api import approx
 
-from dowhy.gcm.util.general import has_categorical, is_categorical
+from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders, has_categorical, is_categorical
 
 
 def test_given_categorical_data_when_evaluating_is_categorical_then_returns_expected_result():
@@ -18,3 +19,19 @@ def test_given_categorical_data_when_evaluating_has_categorical_then_returns_exp
     )
     assert has_categorical(pd.DataFrame({"X": [True, False, False], "Y": [1, 2, 3]}).to_numpy())
     assert not has_categorical(np.array([[1, 2, 3], [12.2, 2.3, 3.231]]))
+
+
+def test_given_categorical_data_when_fit_one_hot_encoders_and_apply_one_hot_encoding_then_returns_expected_feature_vector():
+    data = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)  # columns 0 and 2 hold strings
+    encoders = fit_one_hot_encoders(data)  # one encoder per string column, keyed by column index
+
+    assert apply_one_hot_encoding(data, encoders) == approx(  # expected: 3 one-hot dims, numeric column, 2 one-hot dims
+        np.array([[0, 0, 1, 1, 1, 0], [0, 1, 0, 2, 0, 1], [1, 0, 0, 3, 1, 0]])
+    )
+
+
+def test_given_unknown_categorical_input_when_apply_one_hot_encoders_then_does_not_raise_error():
+    assert apply_one_hot_encoding(
+        np.array([["a", 4, "f"]]),  # "f" is absent from the fit data; NOTE(review): no dtype=object here - confirm
+        fit_one_hot_encoders(np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)),
+    ) == approx(np.array([[1, 0, 0, 4, 0, 0]]))  # the unknown "f" is expected to encode as all zeros