Change one-hot-encoding behaviour in gcm module

Previously, one dimension was dropped in the encoding. However, this requires assuming that there are no unknown categories, since these would also be mapped to a zero vector, which then coincides with the encoding of one of the known categories. Now, there are as many dimensions as categories, which allows unknown categories to be mapped to a zero vector.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
This commit is contained in:
Patrick Bloebaum 2023-03-03 09:07:44 -08:00 коммит произвёл Patrick Blöbaum
Родитель b1e5804904
Коммит a1dcccbc80
2 изменённых файлов: 19 добавлений и 2 удалений

Просмотреть файл

@ -61,7 +61,7 @@ def fit_one_hot_encoders(X: np.ndarray) -> Dict[int, OneHotEncoder]:
one_hot_encoders = {}
for column in range(X.shape[1]):
if isinstance(X[0, column], str):
one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore")
one_hot_encoders[column].fit(X[:, column].reshape(-1, 1))
return one_hot_encoders

Просмотреть файл

@ -1,7 +1,8 @@
import numpy as np
import pandas as pd
from _pytest.python_api import approx
from dowhy.gcm.util.general import has_categorical, is_categorical
from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders, has_categorical, is_categorical
def test_given_categorical_data_when_evaluating_is_categorical_then_returns_expected_result():
@ -18,3 +19,19 @@ def test_given_categorical_data_when_evaluating_has_categorical_then_returns_exp
)
assert has_categorical(pd.DataFrame({"X": [True, False, False], "Y": [1, 2, 3]}).to_numpy())
assert not has_categorical(np.array([[1, 2, 3], [12.2, 2.3, 3.231]]))
def test_given_categorical_data_when_fit_one_hot_encoders_and_apply_one_hot_encoding_then_returns_expected_feature_vector():
    """Fit encoders on mixed string/numeric data and check the resulting feature matrix.

    Columns 0 and 2 hold strings and are expected to be expanded into indicator columns,
    while the numeric column 1 is expected to appear unchanged in between.
    """
    observations = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)
    fitted_encoders = fit_one_hot_encoders(observations)

    actual_features = apply_one_hot_encoding(observations, fitted_encoders)

    # Layout per row: three indicators for column 0, the raw value of column 1,
    # then two indicators for column 2 -- one indicator per observed category.
    expected_features = np.array([[0, 0, 1, 1, 1, 0], [0, 1, 0, 2, 0, 1], [1, 0, 0, 3, 1, 0]])

    assert actual_features == approx(expected_features)
def test_given_unknown_categorical_input_when_apply_one_hot_encoders_then_does_not_raise_error():
    """Encode a sample containing a category that was not present when the encoders were fitted.

    The value "f" in the last column never occurs in the training data; the expected
    result encodes it as an all-zero indicator vector instead of raising an error.
    """
    unseen_sample = np.array([["a", 4, "f"]])
    training_data = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)
    fitted_encoders = fit_one_hot_encoders(training_data)

    encoded_sample = apply_one_hot_encoding(unseen_sample, fitted_encoders)

    # "a" is a known category -> [1, 0, 0]; 4 is the numeric column; unknown "f" -> [0, 0].
    expected_features = np.array([[1, 0, 0, 4, 0, 0]])

    assert encoded_sample == approx(expected_features)