From a1dcccbc805cb28aa4840fe9bce8338278632a50 Mon Sep 17 00:00:00 2001 From: Patrick Bloebaum Date: Fri, 3 Mar 2023 09:07:44 -0800 Subject: [PATCH] Change one-hot-encoding behaviour in gcm module Before, one dimension was dropped in the encoding. However, this required assuming that there are no unknown categories, since these would also be mapped to a zero vector, which then coincides with the encoding of one of the categories. Now, there are as many dimensions as categories, which allows unknown categories to be mapped to a zero vector. Signed-off-by: Patrick Bloebaum --- dowhy/gcm/util/general.py | 2 +- tests/gcm/util/test_general.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dowhy/gcm/util/general.py b/dowhy/gcm/util/general.py index c0a7a2018..920e41f87 100644 --- a/dowhy/gcm/util/general.py +++ b/dowhy/gcm/util/general.py @@ -61,7 +61,7 @@ def fit_one_hot_encoders(X: np.ndarray) -> Dict[int, OneHotEncoder]: one_hot_encoders = {} for column in range(X.shape[1]): if isinstance(X[0, column], str): - one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore", drop="if_binary") + one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore") one_hot_encoders[column].fit(X[:, column].reshape(-1, 1)) return one_hot_encoders diff --git a/tests/gcm/util/test_general.py b/tests/gcm/util/test_general.py index 27584603f..d3c5e5b89 100644 --- a/tests/gcm/util/test_general.py +++ b/tests/gcm/util/test_general.py @@ -1,7 +1,8 @@ import numpy as np import pandas as pd +from _pytest.python_api import approx -from dowhy.gcm.util.general import has_categorical, is_categorical +from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders, has_categorical, is_categorical def test_given_categorical_data_when_evaluating_is_categorical_then_returns_expected_result(): @@ -18,3 +19,19 @@ def test_given_categorical_data_when_evaluating_has_categorical_then_returns_exp ) assert has_categorical(pd.DataFrame({"X": [True, False, False], 
"Y": [1, 2, 3]}).to_numpy()) assert not has_categorical(np.array([[1, 2, 3], [12.2, 2.3, 3.231]])) + + +def test_given_categorical_data_when_fit_one_hot_encoders_and_apply_one_hot_encoding_then_returns_expected_feature_vector(): + data = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object) + encoders = fit_one_hot_encoders(data) + + assert apply_one_hot_encoding(data, encoders) == approx( + np.array([[0, 0, 1, 1, 1, 0], [0, 1, 0, 2, 0, 1], [1, 0, 0, 3, 1, 0]]) + ) + + +def test_given_unknown_categorical_input_when_apply_one_hot_encoders_then_does_not_raise_error(): + assert apply_one_hot_encoding( + np.array([["a", 4, "f"]]), + fit_one_hot_encoders(np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)), + ) == approx(np.array([[1, 0, 0, 4, 0, 0]]))