Change one-hot-encoding behaviour in gcm module
Previously, one dimension was dropped in the encoding of binary categories (`drop="if_binary"`). However, this required assuming that there are no unknown categories, since these would be mapped to a zero vector as well, which would then coincide with the encoding of one of the known categories. Now there are as many dimensions as categories, which allows unknown categories to be mapped to a zero vector that does not collide with any known category. Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
This commit is contained in:
Родитель
b1e5804904
Коммит
a1dcccbc80
|
@ -61,7 +61,7 @@ def fit_one_hot_encoders(X: np.ndarray) -> Dict[int, OneHotEncoder]:
|
|||
one_hot_encoders = {}
|
||||
for column in range(X.shape[1]):
|
||||
if isinstance(X[0, column], str):
|
||||
one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
|
||||
one_hot_encoders[column] = OneHotEncoder(handle_unknown="ignore")
|
||||
one_hot_encoders[column].fit(X[:, column].reshape(-1, 1))
|
||||
|
||||
return one_hot_encoders
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from _pytest.python_api import approx
|
||||
|
||||
from dowhy.gcm.util.general import has_categorical, is_categorical
|
||||
from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders, has_categorical, is_categorical
|
||||
|
||||
|
||||
def test_given_categorical_data_when_evaluating_is_categorical_then_returns_expected_result():
|
||||
|
@ -18,3 +19,19 @@ def test_given_categorical_data_when_evaluating_has_categorical_then_returns_exp
|
|||
)
|
||||
assert has_categorical(pd.DataFrame({"X": [True, False, False], "Y": [1, 2, 3]}).to_numpy())
|
||||
assert not has_categorical(np.array([[1, 2, 3], [12.2, 2.3, 3.231]]))
|
||||
|
||||
|
||||
def test_given_categorical_data_when_fit_one_hot_encoders_and_apply_one_hot_encoding_then_returns_expected_feature_vector():
    """Fit encoders on mixed-type data and compare the encoded result with a fixed matrix.

    The input has two string columns (0 and 2) and one numeric column (1). The
    expected matrix uses three indicator columns for column 0, keeps the numeric
    value as-is, and uses two indicator columns for column 2.
    """
    samples = np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object)
    fitted_encoders = fit_one_hot_encoders(samples)

    encoded_features = apply_one_hot_encoding(samples, fitted_encoders)
    expected_features = np.array([[0, 0, 1, 1, 1, 0], [0, 1, 0, 2, 0, 1], [1, 0, 0, 3, 1, 0]])

    assert encoded_features == approx(expected_features)
|
||||
|
||||
|
||||
def test_given_unknown_categorical_input_when_apply_one_hot_encoders_then_does_not_raise_error():
    """Encode a sample containing a category that was absent when the encoders were fitted.

    The last entry "f" does not occur in the fitting data; the expected row has
    only zeros in the positions belonging to that column.
    """
    # NOTE(review): built without dtype=object, unlike the fitting data below -- confirm this is intended.
    unseen_sample = np.array([["a", 4, "f"]])
    fitted_encoders = fit_one_hot_encoders(np.array([["d", 1, "a"], ["b", 2, "d"], ["a", 3, "a"]], dtype=object))

    encoded_sample = apply_one_hot_encoding(unseen_sample, fitted_encoders)

    assert encoded_sample == approx(np.array([[1, 0, 0, 4, 0, 0]]))
|
||||
|
|
Загрузка…
Ссылка в новой задаче