Change one-hot-encoding behaviour in gcm module

Previously, one dimension was dropped in the encoding. However, this required assuming that there are no unknown categories, since these would be mapped to a zero vector as well, which then coincides with one of the categories. Now, there are as many dimensions as categories, which allows unknown categories to be mapped to a zero vector. Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
2023-03-03 09:07:44 -08:00 · 2023-03-03 09:07:44 -08:00 · a1dcccbc80
--- a/dowhy/gcm/util/general.py
+++ b/dowhy/gcm/util/general.py
@ -61,7 +61,7 @@ def fit_one_hot_encoders(X: np.ndarray) -> Dict[int, OneHotEncoder]:
    one_hot_encoders = {}
    for column in range(X.shape[1]):
        if isinstance(X[0, column], str):
-            one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
+            one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore")
            one_hot_encoders[column].fit(X[:, column].reshape(-1, 1))

    return one_hot_encoders
--- a/tests/gcm/util/test_general.py
+++ b/tests/gcm/util/test_general.py
@ -1,7 +1,8 @@
 import numpy as np
 import pandas as pd
+from _pytest.python_api import approx

-from dowhy.gcm.util.general import has_categorical, is_categorical
+from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders, has_categorical, is_categorical


 def test_given_categorical_data_when_evaluating_is_categorical_then_returns_expected_result():
@ -18,3 +19,19 @@ def test_given_categorical_data_when_evaluating_has_categorical_then_returns_exp
    )
    assert has_categorical(pd.DataFrame({"X": [True, False, False], "Y": [1, 2, 3]}).to_numpy())
    assert not has_categorical(np.array([[1, 2, 3], [12.2, 2.3, 3.231]]))
+
+
+def test_given_categorical_data_when_fit_one_hot_encoders_and_apply_one_hot_encoding_then_returns_expected_feature_vector():
+    data = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)
+    encoders = fit_one_hot_encoders(data)
+
+    assert apply_one_hot_encoding(data, encoders) == approx(
+        np.array([[0, 0, 1, 1, 1, 0], [0, 1, 0, 2, 0, 1], [1, 0, 0, 3, 1, 0]])
+    )
+
+
+def test_given_unknown_categorical_input_when_apply_one_hot_encoders_then_does_not_raise_error():
+    assert apply_one_hot_encoding(
+        np.array([["a", 4, "f"]]),
+        fit_one_hot_encoders(np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)),
+    ) == approx(np.array([[1, 0, 0, 4, 0, 0]]))