Merge branch 'main' of https://github.com/microsoft/hummingbird into main

2021-03-24 10:53:28 +01:00 · 2021-03-24 10:53:28 +01:00 · bdddf5c2cb
--- a/README.md
+++ b/README.md
@ -91,7 +91,7 @@ In general, Hummingbird syntax is very intuitive and minimal. To run your tradit
 ```python
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
-from hummingbird.ml import convert
+from hummingbird.ml import convert, load

 # Create some random data for binary classification
 num_classes = 2
@ -116,7 +116,7 @@ model.predict(X)
 model.save('hb_model')

 # Load the model back
-model = hummingbird.ml.load('hb_model')
+model = load('hb_model')
 ```

 # Documentation
--- a/hummingbird/ml/operator_converters/init.py
+++ b/hummingbird/ml/operator_converters/init.py
@ -21,6 +21,7 @@ from .onnx import onnx_operator  # noqa: E402
 from .onnx import array_feature_extractor as onnx_afe  # noqa: E402, F811
 from .onnx import binarizer as onnx_binarizer  # noqa: E402, F811
 from .onnx import feature_vectorizer  # noqa: E402
+from .onnx import imputer as onnx_imputer  # noqa: E402
 from .onnx import label_encoder as onnx_label_encoder  # noqa: E402, F811
 from .onnx import linear as onnx_linear  # noqa: E402, F811
 from .onnx import normalizer as onnx_normalizer  # noqa: E402, F811
--- a/hummingbird/ml/operator_converters/_imputer_implementations.py
+++ b/hummingbird/ml/operator_converters/_imputer_implementations.py
@ -0,0 +1,81 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+"""
+Base classes for Imputers
+"""
+
+import torch
+import numpy as np
+
+from ._physical_operator import PhysicalOperator
+from . import constants
+
+
+class SimpleImputer(PhysicalOperator, torch.nn.Module):
+    """
+    Class implementing SimpleImputer operators in PyTorch.
+    """
+
+    def __init__(self, logical_operator, device, statistics=None, missing=None, strategy=None):
+        super(SimpleImputer, self).__init__(logical_operator)
+        sklearn_imputer = logical_operator.raw_operator
+        # Pull out the stats field from either the SKL imputer or args
+        stats_ = statistics if statistics is not None else sklearn_imputer.statistics_
+        # Process the stats into an array
+        stats = [float(stat) for stat in stats_]
+
+        missing_values = missing if missing is not None else sklearn_imputer.missing_values
+        strategy = strategy if strategy is not None else sklearn_imputer.strategy
+
+        b_mask = np.logical_not(np.isnan(stats))
+        i_mask = [i for i in range(len(b_mask)) if b_mask[i]]
+        self.transformer = True
+        self.do_mask = strategy == "constant" or all(b_mask)
+        self.mask = torch.nn.Parameter(torch.LongTensor([] if self.do_mask else i_mask), requires_grad=False)
+        self.replace_values = torch.nn.Parameter(torch.tensor([stats_], dtype=torch.float32), requires_grad=False)
+
+        self.is_nan = True if (missing_values == "NaN" or np.isnan(missing_values)) else False
+        if not self.is_nan:
+            self.missing_values = torch.nn.Parameter(torch.tensor([missing_values], dtype=torch.float32), requires_grad=False)
+
+    def forward(self, x):
+        if self.is_nan:
+            result = torch.where(torch.isnan(x), self.replace_values.expand(x.shape), x)
+            if self.do_mask:
+                return result
+            return torch.index_select(result, 1, self.mask)
+        else:
+            return torch.where(torch.eq(x, self.missing_values), self.replace_values.expand(x.shape), x)
+
+
+class MissingIndicator(PhysicalOperator, torch.nn.Module):
+    """
+    Class implementing Imputer operators in MissingIndicator.
+    """
+
+    def __init__(self, logical_operator, device):
+        super(MissingIndicator, self).__init__(logical_operator)
+        sklearn_missing_indicator = logical_operator.raw_operator
+        self.transformer = True
+        self.missing_values = torch.nn.Parameter(
+            torch.tensor([sklearn_missing_indicator.missing_values], dtype=torch.float32), requires_grad=False
+        )
+        self.features = sklearn_missing_indicator.features
+        self.is_nan = True if (sklearn_missing_indicator.missing_values in ["NaN", None, np.nan]) else False
+        self.column_indices = torch.nn.Parameter(torch.LongTensor(sklearn_missing_indicator.features_), requires_grad=False)
+
+    def forward(self, x):
+        if self.is_nan:
+            if self.features == "all":
+                return torch.isnan(x).float()
+            else:
+                return torch.isnan(torch.index_select(x, 1, self.column_indices)).float()
+        else:
+            if self.features == "all":
+                return torch.eq(x, self.missing_values).float()
+            else:
+                return torch.eq(torch.index_select(x, 1, self.column_indices), self.missing_values).float()
--- a/hummingbird/ml/operator_converters/onnx/imputer.py
+++ b/hummingbird/ml/operator_converters/onnx/imputer.py
@ -0,0 +1,44 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+"""
+Converter for ONNX-ML Imputer.
+"""
+
+import numpy as np
+from onnxconverter_common.registration import register_converter
+
+from .._imputer_implementations import SimpleImputer
+
+
+def convert_onnx_imputer(operator, device=None, extra_config={}):
+    """
+    Converter for `ai.onnx.ml.Imputer`
+
+    Args:
+        operator: An operator wrapping a `ai.onnx.ml.Imputer` model
+        device: String defining the type of device the converted operator should be run on
+        extra_config: Extra configuration used to select the best conversion strategy
+
+    Returns:
+        A PyTorch model
+    """
+
+    stats = missing = None
+    for attr in operator.raw_operator.origin.attribute:
+        if attr.name == "imputed_value_floats":
+            stats = np.array(attr.floats).astype("float64")
+        elif attr.name == "replaced_value_float":
+            missing = attr.f
+
+    if any(v is None for v in [stats, missing]):
+        raise RuntimeError("Error parsing Imputer, found unexpected None. stats: {}, missing: {}", stats, missing)
+
+    # ONNXML has no "strategy" field, but always behaves similar to SKL's constant: "replace missing values with fill_value"
+    return SimpleImputer(operator, device, statistics=stats, missing=missing, strategy="constant")
+
+
+register_converter("ONNXMLImputer", convert_onnx_imputer)
--- a/hummingbird/ml/operator_converters/sklearn/imputer.py
+++ b/hummingbird/ml/operator_converters/sklearn/imputer.py
@ -12,39 +12,7 @@ import numpy as np
 from onnxconverter_common.registration import register_converter
 import torch

-
-class SimpleImputer(PhysicalOperator, torch.nn.Module):
-    """
-    Class implementing SimpleImputer operators in PyTorch.
-    """
-
-    def __init__(self, logical_operator, device):
-        super(SimpleImputer, self).__init__(logical_operator)
-        sklearn_imputer = logical_operator.raw_operator
-        stats = [float(stat) for stat in sklearn_imputer.statistics_ if isinstance(stat, float)]
-        b_mask = np.logical_not(np.isnan(stats))
-        i_mask = [i for i in range(len(b_mask)) if b_mask[i]]
-        self.transformer = True
-        self.do_mask = sklearn_imputer.strategy == "constant" or all(b_mask)
-        self.mask = torch.nn.Parameter(torch.LongTensor([] if self.do_mask else i_mask), requires_grad=False)
-        self.replace_values = torch.nn.Parameter(
-            torch.tensor([sklearn_imputer.statistics_], dtype=torch.float32), requires_grad=False
-        )
-
-        self.is_nan = True if (sklearn_imputer.missing_values == "NaN" or np.isnan(sklearn_imputer.missing_values)) else False
-        if not self.is_nan:
-            self.missing_values = torch.nn.Parameter(
-                torch.tensor([sklearn_imputer.missing_values], dtype=torch.float32), requires_grad=False
-            )
-
-    def forward(self, x):
-        if self.is_nan:
-            result = torch.where(torch.isnan(x), self.replace_values.expand(x.shape), x)
-            if self.do_mask:
-                return result
-            return torch.index_select(result, 1, self.mask)
-        else:
-            return torch.where(torch.eq(x, self.missing_values), self.replace_values.expand(x.shape), x)
+from .._imputer_implementations import SimpleImputer, MissingIndicator


 def convert_sklearn_simple_imputer(operator, device, extra_config):
@ -64,35 +32,6 @@ def convert_sklearn_simple_imputer(operator, device, extra_config):
    return SimpleImputer(operator, device)


-class MissingIndicator(PhysicalOperator, torch.nn.Module):
-    """
-    Class implementing Imputer operators in MissingIndicator.
-    """
-
-    def __init__(self, logical_operator, device):
-        super(MissingIndicator, self).__init__(logical_operator)
-        sklearn_missing_indicator = logical_operator.raw_operator
-        self.transformer = True
-        self.missing_values = torch.nn.Parameter(
-            torch.tensor([sklearn_missing_indicator.missing_values], dtype=torch.float32), requires_grad=False
-        )
-        self.features = sklearn_missing_indicator.features
-        self.is_nan = True if (sklearn_missing_indicator.missing_values in ["NaN", None, np.nan]) else False
-        self.column_indices = torch.nn.Parameter(torch.LongTensor(sklearn_missing_indicator.features_), requires_grad=False)
-
-    def forward(self, x):
-        if self.is_nan:
-            if self.features == "all":
-                return torch.isnan(x).float()
-            else:
-                return torch.isnan(torch.index_select(x, 1, self.column_indices)).float()
-        else:
-            if self.features == "all":
-                return torch.eq(x, self.missing_values).float()
-            else:
-                return torch.eq(torch.index_select(x, 1, self.column_indices), self.missing_values).float()
-
-
 def convert_sklearn_missing_indicator(operator, device, extra_config):
    """
    Converter for `sklearn.impute.MissingIndicator`
--- a/hummingbird/ml/supported.py
+++ b/hummingbird/ml/supported.py
@ -77,6 +77,7 @@ Binarizer,
 Cast,
 Concat,
 Div,
+Imputer,
 LabelEncoder,
 Less,
 LinearClassifier,
@ -313,6 +314,7 @@ def _build_onnxml_operator_list():
            "ArrayFeatureExtractor",
            "Binarizer",
            "FeatureVectorizer",
+            "Imputer",
            "LabelEncoder",
            "OneHotEncoder",
            "Normalizer",
--- a/notebooks/LGBM_year_with_train.ipynb
+++ b/notebooks/LGBM_year_with_train.ipynb
--- a/notebooks/sklearn_year_with_train.ipynb
+++ b/notebooks/sklearn_year_with_train.ipynb
--- a/setup.py
+++ b/setup.py
@ -22,7 +22,7 @@ with open(README) as f:
        long_description = long_description[start_pos:]

 install_requires = [
-    "numpy>=1.15,<=1.19.4",
+    "numpy>=1.15,<=1.20.*",
    "onnxconverter-common>=1.6.0,<=1.7.0",
    "scipy<=1.5.4",
    "scikit-learn>=0.21.3,<=0.23.2",
--- a/tests/test_onnxml_imputer_converter.py
+++ b/tests/test_onnxml_imputer_converter.py
@ -0,0 +1,101 @@
+"""
+Tests onnxml Imputer converter
+"""
+import unittest
+import warnings
+
+import numpy as np
+import torch
+from sklearn.impute import SimpleImputer
+
+from hummingbird.ml._utils import onnx_ml_tools_installed, onnx_runtime_installed, lightgbm_installed
+from hummingbird.ml import convert
+
+if onnx_runtime_installed():
+    import onnxruntime as ort
+if onnx_ml_tools_installed():
+    from onnxmltools import convert_sklearn
+    from onnxmltools.convert.common.data_types import FloatTensorType as FloatTensorType_onnx
+
+
+class TestONNXImputer(unittest.TestCase):
+    def _test_imputer_converter(self, model, mode="onnx"):
+        warnings.filterwarnings("ignore")
+        X = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
+        model.fit(X)
+
+        # Create ONNX-ML model
+        onnx_ml_model = convert_sklearn(model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])
+
+        # Get the predictions for the ONNX-ML model
+        session = ort.InferenceSession(onnx_ml_model.SerializeToString())
+        output_names = [session.get_outputs()[i].name for i in range(len(session.get_outputs()))]
+        inputs = {session.get_inputs()[0].name: X}
+        onnx_ml_pred = session.run(output_names, inputs)[0]
+
+        # Create test model by calling converter
+        model = convert(onnx_ml_model, mode, X)
+
+        # Get the predictions for the test model
+        pred = model.transform(X)
+
+        return onnx_ml_pred, pred
+
+    @unittest.skipIf(
+        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test requires ONNX, ORT and ONNXMLTOOLS"
+    )
+    def test_onnx_imputer_const(self, rtol=1e-06, atol=1e-06):
+        model = SimpleImputer(strategy="constant")
+        onnx_ml_pred, onnx_pred = self._test_imputer_converter(model)
+
+        # Check that predicted values match
+        np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
+
+    @unittest.skipIf(
+        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test requires ONNX, ORT and ONNXMLTOOLS"
+    )
+    def test_onnx_imputer_const_nan0(self, rtol=1e-06, atol=1e-06):
+        model = SimpleImputer(strategy="constant", fill_value=0)
+        onnx_ml_pred, onnx_pred = self._test_imputer_converter(model)
+
+        # Check that predicted values match
+        np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
+
+    @unittest.skipIf(
+        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test requires ONNX, ORT and ONNXMLTOOLS"
+    )
+    def test_onnx_imputer_mean(self, rtol=1e-06, atol=1e-06):
+        model = SimpleImputer(strategy="mean", fill_value="nan")
+        onnx_ml_pred, onnx_pred = self._test_imputer_converter(model)
+
+        # Check that predicted values match
+        np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
+
+    @unittest.skipIf(
+        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test requires ONNX, ORT and ONNXMLTOOLS"
+    )
+    def test_onnx_imputer_converter_raises_rt(self):
+        warnings.filterwarnings("ignore")
+        model = SimpleImputer(strategy="mean", fill_value="nan")
+        X = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
+        model.fit(X)
+
+        # Create ONNX-ML model
+        onnx_ml_model = convert_sklearn(model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])
+        onnx_ml_model.graph.node[0].attribute[0].name = "".encode()
+
+        self.assertRaises(RuntimeError, convert, onnx_ml_model, "onnx", X)
+
+    @unittest.skipIf(
+        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test requires ONNX, ORT and ONNXMLTOOLS"
+    )
+    def test_onnx_imputer_torch(self, rtol=1e-06, atol=1e-06):
+        model = SimpleImputer(strategy="constant")
+        onnx_ml_pred, onnx_pred = self._test_imputer_converter(model, mode="torch")
+
+        # Check that predicted values match
+        np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_sklearn_decision_tree_converter.py
+++ b/tests/test_sklearn_decision_tree_converter.py
@ -52,6 +52,12 @@ class TestSklearnTreeConverter(unittest.TestCase):
            self.assertIsNotNone(torch_model)
            np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-06, atol=1e-06)

+            from distutils.version import LooseVersion
+            import torch
+
+            if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
+                np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-06, atol=1e-06)
+
    # Random forest binary classifier
    def test_random_forest_classifier_binary_converter(self):
        self._run_tree_classification_converter(RandomForestClassifier, 2, n_estimators=10)