Add basic support for scikit-learn pipeline (#251)

* Add support for SKL pipelines
Matteo Interlandi 2020-08-28 13:20:45 -07:00 committed by GitHub
Parent 905f8f36b0
Commit 39a79be15c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
22 changed files: 1352 additions and 109 deletions
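For context, a minimal usage sketch of the capability this commit adds. It is illustrative only (the data and pipeline below are not taken from the PR's tests) and assumes the public hummingbird.ml.convert API shown in the convert.py diff further down.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from hummingbird.ml import convert

# Illustrative data and pipeline.
X = np.random.rand(100, 10).astype(np.float32)
y = np.random.randint(2, size=100)
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())]).fit(X, y)

# Convert the fitted pipeline as a whole into a PyTorch model.
hb_model = convert(pipe, "torch", test_input=X)
print(hb_model.predict(X[:5]))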

.github/workflows/pythonapp.yml vendored
View file

@@ -62,6 +62,7 @@ jobs:
- name: Install extra dependencies
run: |
pip install .[extra,onnx]
pip install pandas
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names

View file

@@ -26,7 +26,7 @@
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/convert.py#L0-L238" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/convert.py#L0-L246" class="git-link">Browse git</a>
</summary>
<pre><code class="python"># -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
@@ -193,9 +193,9 @@ def _convert_onnxml(model, backend, test_input, device, extra_config={}):
type(initial_types[0][1])
)
)
else:
extra_config[constants.N_FEATURES] = np.array(test_input).shape[1]
extra_config[constants.TEST_INPUT] = test_input
extra_config[constants.TEST_INPUT] = test_input
elif constants.N_FEATURES not in extra_config:
extra_config[constants.N_FEATURES] = test_input.shape[1]
# Set the initializers. Some converter requires the access to initializers.
initializers = {} if model.graph.initializer is None else {in_.name: in_ for in_ in model.graph.initializer}
@@ -249,6 +249,14 @@ def convert(model, backend, test_input=None, device=&#34;cpu&#34;, extra_config=
if test_input is not None and constants.TEST_INPUT not in extra_config:
extra_config[constants.TEST_INPUT] = test_input
# Fix the test_input type
if constants.TEST_INPUT in extra_config:
if type(extra_config[constants.TEST_INPUT]) == list:
extra_config[constants.TEST_INPUT] = np.array(extra_config[constants.TEST_INPUT])
elif type(extra_config[constants.TEST_INPUT]) == tuple:
extra_config[constants.N_FEATURES] = len(extra_config[constants.TEST_INPUT])
test_input = extra_config[constants.TEST_INPUT]
# We do some normalization on backends.
backend = backend.lower()
backend = backends[backend]
@@ -310,7 +318,7 @@ The set of supported extra configurations can be found at <code><a title="hummin
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/convert.py#L182-L239" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/convert.py#L182-L247" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert(model, backend, test_input=None, device=&#34;cpu&#34;, extra_config={}):
&#34;&#34;&#34;
@@ -352,6 +360,14 @@ The set of supported extra configurations can be found at <code><a title="hummin
if test_input is not None and constants.TEST_INPUT not in extra_config:
extra_config[constants.TEST_INPUT] = test_input
# Fix the test_input type
if constants.TEST_INPUT in extra_config:
if type(extra_config[constants.TEST_INPUT]) == list:
extra_config[constants.TEST_INPUT] = np.array(extra_config[constants.TEST_INPUT])
elif type(extra_config[constants.TEST_INPUT]) == tuple:
extra_config[constants.N_FEATURES] = len(extra_config[constants.TEST_INPUT])
test_input = extra_config[constants.TEST_INPUT]
# We do some normalization on backends.
backend = backend.lower()
backend = backends[backend]

View file

@@ -26,7 +26,7 @@
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/__init__.py#L0-L48" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/__init__.py#L0-L49" class="git-link">Browse git</a>
</summary>
<pre><code class="python"># -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
@@ -62,6 +62,7 @@ from .sklearn import iforest # noqa: E402
from .sklearn import linear as sklearn_linear # noqa: E402
from .sklearn import normalizer as sklearn_normalizer # noqa: E402
from .sklearn import one_hot_encoder as sklearn_ohe # noqa: E402
from .sklearn import pipeline # noqa: E402
from .sklearn import scaler as sklearn_scaler # noqa: E402
from .sklearn import sv # noqa: E402
from . import lightgbm # noqa: E402

View file

@@ -26,7 +26,7 @@
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L0-L115" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L0-L120" class="git-link">Browse git</a>
</summary>
<pre><code class="python"># -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
@@ -55,6 +55,8 @@ class Cast(BaseOperator, torch.nn.Module):
self.to_type = to_type
def forward(self, x):
if self.to_type == 1: # Cast to float
return x.float()
if self.to_type == 7: # Cast to long
return x.long()
@@ -64,7 +66,10 @@ class Concat(BaseOperator, torch.nn.Module):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)
if len(x[0].shape) &gt; 1:
return torch.cat(x, dim=1)
else:
return torch.stack(x, dim=1)
class Reshape(BaseOperator, torch.nn.Module):
@@ -175,7 +180,7 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L50-L71" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L55-L76" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_onnx_cast(operator, device=None, extra_config={}):
&#34;&#34;&#34;
@@ -223,7 +228,7 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L74-L89" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L79-L94" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_onnx_concat(operator, device=None, extra_config={}):
&#34;&#34;&#34;
@@ -265,7 +270,7 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L92-L111" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L97-L116" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_onnx_reshape(operator, device=None, extra_config={}):
&#34;&#34;&#34;
@@ -303,7 +308,7 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L19-L29" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L19-L31" class="git-link">Browse git</a>
</summary>
<pre><code class="python">class Cast(BaseOperator, torch.nn.Module):
def __init__(self, to_type):
@@ -314,6 +319,8 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
self.to_type = to_type
def forward(self, x):
if self.to_type == 1: # Cast to float
return x.float()
if self.to_type == 7: # Cast to long
return x.long()</code></pre>
</details>
@@ -326,24 +333,18 @@ register_converter(&#34;ONNXMLReshape&#34;, convert_onnx_reshape)</code></pre>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.onnx.onnx_operator.Cast.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, x)</span>
<span>def <span class="ident">forward</span></span>(<span>self, x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"><p>Defines the computation performed at every call.</p>
<p>Should be overridden by all subclasses.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Although the recipe for forward pass needs to be defined within
this function, one should call the :class:<code>Module</code> instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.</p>
</div></div>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L27-L29" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L27-L31" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def forward(self, x):
if self.to_type == 1: # Cast to float
return x.float()
if self.to_type == 7: # Cast to long
return x.long()</code></pre>
</details>
@@ -358,14 +359,17 @@ registered hooks while the latter silently ignores them.</p>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L32-L37" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L34-L42" class="git-link">Browse git</a>
</summary>
<pre><code class="python">class Concat(BaseOperator, torch.nn.Module):
def __init__(self):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)</code></pre>
if len(x[0].shape) &gt; 1:
return torch.cat(x, dim=1)
else:
return torch.stack(x, dim=1)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
@@ -376,25 +380,20 @@ registered hooks while the latter silently ignores them.</p>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.onnx.onnx_operator.Concat.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, *x)</span>
<span>def <span class="ident">forward</span></span>(<span>self, *x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"><p>Defines the computation performed at every call.</p>
<p>Should be overridden by all subclasses.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Although the recipe for forward pass needs to be defined within
this function, one should call the :class:<code>Module</code> instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.</p>
</div></div>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L36-L37" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L38-L42" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def forward(self, *x):
return torch.cat(x, dim=1)</code></pre>
if len(x[0].shape) &gt; 1:
return torch.cat(x, dim=1)
else:
return torch.stack(x, dim=1)</code></pre>
</details>
</dd>
</dl>
@@ -408,7 +407,7 @@ registered hooks while the latter silently ignores them.</p>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L40-L47" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L45-L52" class="git-link">Browse git</a>
</summary>
<pre><code class="python">class Reshape(BaseOperator, torch.nn.Module):
def __init__(self, shape):
@@ -428,22 +427,14 @@ registered hooks while the latter silently ignores them.</p>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.onnx.onnx_operator.Reshape.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, x)</span>
<span>def <span class="ident">forward</span></span>(<span>self, x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"><p>Defines the computation performed at every call.</p>
<p>Should be overridden by all subclasses.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Although the recipe for forward pass needs to be defined within
this function, one should call the :class:<code>Module</code> instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.</p>
</div></div>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L46-L47" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/onnx/onnx_operator.py#L51-L52" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def forward(self, x):
return torch.reshape(x, self.shape)</code></pre>

View file

@@ -158,18 +158,10 @@ register_converter(&#34;SklearnBinarizer&#34;, convert_sklearn_binarizer)</code>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.binarizer.Binarizer.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, x)</span>
<span>def <span class="ident">forward</span></span>(<span>self, x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"><p>Defines the computation performed at every call.</p>
<p>Should be overridden by all subclasses.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Although the recipe for forward pass needs to be defined within
this function, one should call the :class:<code>Module</code> instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.</p>
</div></div>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>

View file

@@ -74,6 +74,10 @@ All scikit-learn operators converters are stored under this package.
<dd>
<div class="desc"><p>Converter for scikit-learn one hot encoder.</p></div>
</dd>
<dt><code class="name"><a title="hummingbird.ml.operator_converters.sklearn.pipeline" href="pipeline.html">hummingbird.ml.operator_converters.sklearn.pipeline</a></code></dt>
<dd>
<div class="desc"><p>Converters for operators necessary for supporting scikit-learn Pipelines.</p></div>
</dd>
<dt><code class="name"><a title="hummingbird.ml.operator_converters.sklearn.scaler" href="scaler.html">hummingbird.ml.operator_converters.sklearn.scaler</a></code></dt>
<dd>
<div class="desc"><p>Converters for scikit-learn scalers: RobustScaler, MaxAbsScaler, MinMaxScaler, StandardScaler.</p></div>
@@ -117,6 +121,7 @@ All scikit-learn operators converters are stored under this package.
<li><code><a title="hummingbird.ml.operator_converters.sklearn.linear" href="linear.html">hummingbird.ml.operator_converters.sklearn.linear</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.normalizer" href="normalizer.html">hummingbird.ml.operator_converters.sklearn.normalizer</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.one_hot_encoder" href="one_hot_encoder.html">hummingbird.ml.operator_converters.sklearn.one_hot_encoder</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline" href="pipeline.html">hummingbird.ml.operator_converters.sklearn.pipeline</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.scaler" href="scaler.html">hummingbird.ml.operator_converters.sklearn.scaler</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.sv" href="sv.html">hummingbird.ml.operator_converters.sklearn.sv</a></code></li>
</ul>

View file

@@ -60,7 +60,7 @@ def convert_sklearn_linear_model(operator, device, extra_config):
&#34;&#34;&#34;
classes = [0] if not hasattr(operator.raw_operator, &#34;classes_&#34;) else operator.raw_operator.classes_
if not all([type(x) in [int, np.int32, np.int64] for x in classes]):
if not all([&#34;int&#34; in str(type(x)) for x in classes]):
raise RuntimeError(
&#34;Hummingbird currently supports only integer labels for class labels. Please file an issue at https://github.com/microsoft/hummingbird.&#34;
)
@@ -153,7 +153,7 @@ register_converter(&#34;SklearnLogisticRegressionCV&#34;, convert_sklearn_linear
&#34;&#34;&#34;
classes = [0] if not hasattr(operator.raw_operator, &#34;classes_&#34;) else operator.raw_operator.classes_
if not all([type(x) in [int, np.int32, np.int64] for x in classes]):
if not all([&#34;int&#34; in str(type(x)) for x in classes]):
raise RuntimeError(
&#34;Hummingbird currently supports only integer labels for class labels. Please file an issue at https://github.com/microsoft/hummingbird.&#34;
)

View file

@@ -0,0 +1,416 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.8.1" />
<title>hummingbird.ml.operator_converters.sklearn.pipeline API documentation</title>
<meta name="description" content="Converters for operators necessary for supporting scikit-learn Pipelines." />
<link href='https://cdnjs.cloudflare.com/ajax/libs/normalize/8.0.0/normalize.min.css' rel='stylesheet'>
<link href='https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/8.0.0/sanitize.min.css' rel='stylesheet'>
<link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" rel="stylesheet">
<style>.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<style>.homelink{display:block;font-size:2em;font-weight:bold;color:#555;padding-bottom:.5em;border-bottom:1px solid silver}.homelink:hover{color:inherit}.homelink img{max-width:20%;max-height:5em;margin:auto;margin-bottom:.3em}</style>
<link rel="canonical" href="https://microsoft.github.io/hummingbird/ml/operator_converters/sklearn/pipeline.html">
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>hummingbird.ml.operator_converters.sklearn.pipeline</code></h1>
</header>
<section id="section-intro">
<p>Converters for operators necessary for supporting scikit-learn Pipelines.</p>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L0-L101" class="git-link">Browse git</a>
</summary>
<pre><code class="python"># -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
&#34;&#34;&#34;
Converters for operators necessary for supporting scikit-learn Pipelines.
&#34;&#34;&#34;
import numpy as np
from onnxconverter_common.registration import register_converter
import torch
from .. import constants
from .._array_feature_extractor_implementations import ArrayFeatureExtractor
from .._base_operator import BaseOperator
class Concat(BaseOperator, torch.nn.Module):
&#34;&#34;&#34;
Module used to concatenate tensors into a single tensor.
&#34;&#34;&#34;
def __init__(self):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)
class Multiply(BaseOperator, torch.nn.Module):
&#34;&#34;&#34;
Module used to multiply features in a pipeline by a score.
&#34;&#34;&#34;
def __init__(self, score):
super(Multiply, self).__init__()
self.score = score
def forward(self, x):
return x * self.score
def convert_sklearn_array_feature_extractor(operator, device, extra_config):
&#34;&#34;&#34;
Converter for ArrayFeatureExtractor.
Args:
operator: An operator wrapping an ArrayFeatureExtractor operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
assert operator is not None
indices = operator.column_indices
return ArrayFeatureExtractor(np.ascontiguousarray(indices), device)
def convert_sklearn_concat(operator, device=None, extra_config={}):
&#34;&#34;&#34;
Converter for concat operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
return Concat()
def convert_sklearn_multiply(operator, device=None, extra_config={}):
&#34;&#34;&#34;
Converter for multiply operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
assert operator is not None
assert hasattr(operator, &#34;operand&#34;)
score = operator.operand
# Generate the model.
return Multiply(score)
register_converter(&#34;SklearnArrayFeatureExtractor&#34;, convert_sklearn_array_feature_extractor)
register_converter(&#34;SklearnConcat&#34;, convert_sklearn_concat)
register_converter(&#34;SklearnMultiply&#34;, convert_sklearn_multiply)</code></pre>
</details>
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_array_feature_extractor"><code class="name flex">
<span>def <span class="ident">convert_sklearn_array_feature_extractor</span></span>(<span>operator, device, extra_config)</span>
</code></dt>
<dd>
<div class="desc"><p>Converter for ArrayFeatureExtractor.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>operator</code></strong></dt>
<dd>An operator wrapping an ArrayFeatureExtractor operator</dd>
<dt><strong><code>device</code></strong></dt>
<dd>String defining the type of device the converted operator should be run on</dd>
<dt><strong><code>extra_config</code></strong></dt>
<dd>Extra configuration used to select the best conversion strategy</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>A PyTorch model</code></dt>
<dd>&nbsp;</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L46-L61" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_sklearn_array_feature_extractor(operator, device, extra_config):
&#34;&#34;&#34;
Converter for ArrayFeatureExtractor.
Args:
operator: An operator wrapping an ArrayFeatureExtractor operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
assert operator is not None
indices = operator.column_indices
return ArrayFeatureExtractor(np.ascontiguousarray(indices), device)</code></pre>
</details>
</dd>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_concat"><code class="name flex">
<span>def <span class="ident">convert_sklearn_concat</span></span>(<span>operator, device=None, extra_config={})</span>
</code></dt>
<dd>
<div class="desc"><p>Converter for concat operators injected when parsing Sklearn pipelines.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>operator</code></strong></dt>
<dd>An empty operator</dd>
<dt><strong><code>device</code></strong></dt>
<dd>String defining the type of device the converted operator should be run on</dd>
<dt><strong><code>extra_config</code></strong></dt>
<dd>Extra configuration used to select the best conversion strategy</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>A PyTorch model</code></dt>
<dd>&nbsp;</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L64-L76" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_sklearn_concat(operator, device=None, extra_config={}):
&#34;&#34;&#34;
Converter for concat operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
return Concat()</code></pre>
</details>
</dd>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_multiply"><code class="name flex">
<span>def <span class="ident">convert_sklearn_multiply</span></span>(<span>operator, device=None, extra_config={})</span>
</code></dt>
<dd>
<div class="desc"><p>Converter for multiply operators injected when parsing Sklearn pipelines.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>operator</code></strong></dt>
<dd>An empty operator</dd>
<dt><strong><code>device</code></strong></dt>
<dd>String defining the type of device the converted operator should be run on</dd>
<dt><strong><code>extra_config</code></strong></dt>
<dd>Extra configuration used to select the best conversion strategy</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>A PyTorch model</code></dt>
<dd>&nbsp;</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L79-L97" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def convert_sklearn_multiply(operator, device=None, extra_config={}):
&#34;&#34;&#34;
Converter for multiply operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
&#34;&#34;&#34;
assert operator is not None
assert hasattr(operator, &#34;operand&#34;)
score = operator.operand
# Generate the model.
return Multiply(score)</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.Concat"><code class="flex name class">
<span>class <span class="ident">Concat</span></span>
</code></dt>
<dd>
<div class="desc"><p>Module used to concatenate tensors into a single tensor.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L20-L29" class="git-link">Browse git</a>
</summary>
<pre><code class="python">class Concat(BaseOperator, torch.nn.Module):
&#34;&#34;&#34;
Module used to concatenate tensors into a single tensor.
&#34;&#34;&#34;
def __init__(self):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li>hummingbird.ml.operator_converters._base_operator.BaseOperator</li>
<li>abc.ABC</li>
<li>torch.nn.modules.module.Module</li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.Concat.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, *x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L28-L29" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def forward(self, *x):
return torch.cat(x, dim=1)</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.Multiply"><code class="flex name class">
<span>class <span class="ident">Multiply</span></span>
<span>(</span><span>score)</span>
</code></dt>
<dd>
<div class="desc"><p>Module used to multiply features in a pipeline by a score.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L32-L43" class="git-link">Browse git</a>
</summary>
<pre><code class="python">class Multiply(BaseOperator, torch.nn.Module):
&#34;&#34;&#34;
Module used to multiply features in a pipeline by a score.
&#34;&#34;&#34;
def __init__(self, score):
super(Multiply, self).__init__()
self.score = score
def forward(self, x):
return x * self.score</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li>hummingbird.ml.operator_converters._base_operator.BaseOperator</li>
<li>abc.ABC</li>
<li>torch.nn.modules.module.Module</li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.pipeline.Multiply.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/operator_converters/sklearn/pipeline.py#L42-L43" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def forward(self, x):
return x * self.score</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<nav id="sidebar">
<header>
<a class="homelink" rel="home" title="Hummingbird Home" href="https://github.com/microsoft/hummingbird"> Hummingbird
</a>
</header>
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="hummingbird.ml.operator_converters.sklearn" href="index.html">hummingbird.ml.operator_converters.sklearn</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_array_feature_extractor" href="#hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_array_feature_extractor">convert_sklearn_array_feature_extractor</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_concat" href="#hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_concat">convert_sklearn_concat</a></code></li>
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_multiply" href="#hummingbird.ml.operator_converters.sklearn.pipeline.convert_sklearn_multiply">convert_sklearn_multiply</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.Concat" href="#hummingbird.ml.operator_converters.sklearn.pipeline.Concat">Concat</a></code></h4>
<ul class="">
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.Concat.forward" href="#hummingbird.ml.operator_converters.sklearn.pipeline.Concat.forward">forward</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.Multiply" href="#hummingbird.ml.operator_converters.sklearn.pipeline.Multiply">Multiply</a></code></h4>
<ul class="">
<li><code><a title="hummingbird.ml.operator_converters.sklearn.pipeline.Multiply.forward" href="#hummingbird.ml.operator_converters.sklearn.pipeline.Multiply.forward">forward</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc"><cite>pdoc</cite> 0.8.1</a>.</p>
</footer>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad()</script>
</body>
</html>

View file

@@ -303,18 +303,10 @@ register_converter(&#34;SklearnNuSVC&#34;, convert_sklearn_svc_model)</code></pr
<h3>Methods</h3>
<dl>
<dt id="hummingbird.ml.operator_converters.sklearn.sv.SVC.forward"><code class="name flex">
<span>def <span class="ident">forward</span></span>(<span>self, x)</span>
<span>def <span class="ident">forward</span></span>(<span>self, x) -> Callable[..., Any]</span>
</code></dt>
<dd>
<div class="desc"><p>Defines the computation performed at every call.</p>
<p>Should be overridden by all subclasses.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Although the recipe for forward pass needs to be defined within
this function, one should call the :class:<code>Module</code> instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.</p>
</div></div>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>

View file

@@ -64,7 +64,7 @@ XGBRegressor</p>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L0-L295" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L0-L305" class="git-link">Browse git</a>
</summary>
<pre><code class="python"># -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
@@ -288,7 +288,17 @@ def _build_sklearn_api_operator_name_map():
Associate Sklearn with the operator class names.
If two scikit-learn (API) models share a single name, it means they are equivalent in terms of conversion.
&#34;&#34;&#34;
return {k: &#34;Sklearn&#34; + k.__name__ for k in sklearn_operator_list + xgb_operator_list + lgbm_operator_list}
# Pipeline ops. These are ops injected by the parser not &#34;real&#34; sklearn operators.
pipeline_operator_list = [
&#34;ArrayFeatureExtractor&#34;,
&#34;Concat&#34;,
&#34;Multiply&#34;,
]
return {
k: &#34;Sklearn&#34; + k.__name__ if hasattr(k, &#34;__name__&#34;) else k
for k in sklearn_operator_list + pipeline_operator_list + xgb_operator_list + lgbm_operator_list
}
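As a standalone sketch of the mapping rule above (the class below is a hypothetical stand-in): class entries get a "Sklearn" prefix on their class name, while the injected pipeline operators are plain strings and map to themselves.

class FakeScaler:
    # Hypothetical stand-in for a real scikit-learn operator class.
    pass

ops = [FakeScaler, "ArrayFeatureExtractor", "Concat", "Multiply"]
name_map = {k: "Sklearn" + k.__name__ if hasattr(k, "__name__") else k for k in ops}
# FakeScaler -> 'SklearnFakeScaler'; the string entries map to themselves.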
def _build_onnxml_api_operator_name_map():
@@ -417,7 +427,7 @@ CONTAINER = &#34;container&#34;
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L250-L263" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L260-L273" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def get_onnxml_api_operator_name(model_type):
&#34;&#34;&#34;
@@ -454,7 +464,7 @@ or an object with scikit-learn API (e.g., LightGBM)</dd>
<details class="source">
<summary>
<span>Expand source code</span>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L234-L247" class="git-link">Browse git</a>
<a href="https://github.com/microsoft/hummingbird/blob/master/hummingbird/ml/supported.py#L244-L257" class="git-link">Browse git</a>
</summary>
<pre><code class="python">def get_sklearn_api_operator_name(model_type):
&#34;&#34;&#34;

View file

@@ -52,24 +52,54 @@ class PyTorchBackendModel(torch.nn.Module):
extra_config: Some additional custom configuration parameter
"""
super(PyTorchBackendModel, self).__init__()
self._input_names = input_names
self._output_names = output_names
# Define input / output names.
# This is required because the internal variable names may differ from the original (raw) one.
# This may happen, for instance, because we force our internal naming to be unique.
def _fix_var_naming(operators, names, mod="input"):
new_names = []
map = {}
for op in operators:
if mod == "input":
iter = op.inputs
else:
iter = op.outputs
for i in iter:
for name in names:
if i.raw_name == name and name not in map:
map[i.raw_name] = i.full_name
if len(map) == len(names):
break
for name in names:
new_names.append(map[name])
return new_names
self._input_names = _fix_var_naming(operators, input_names)
self._output_names = _fix_var_naming(reversed(operators), output_names, "output")
self._operator_map = torch.nn.ModuleDict(operator_map)
self._operators = operators
def forward(self, *inputs):
with torch.no_grad():
assert len(self._input_names) == len(inputs)
inputs = [*inputs]
variable_map = {}
device = _get_device(self)
# Maps data inputs to the expected variables.
for i, input_name in enumerate(self._input_names):
if type(inputs[i]) is list:
inputs[i] = np.array(inputs[i])
if type(inputs[i]) is np.ndarray:
inputs[i] = torch.from_numpy(inputs[i]).float()
inputs[i] = torch.from_numpy(inputs[i])
if inputs[i].dtype == torch.float64:
# We convert double precision arrays into single precision. Sklearn does the same.
inputs[i] = inputs[i].float()
elif type(inputs[i]) is not torch.Tensor:
raise RuntimeError("Inputer tensor {} of not supported type {}".format(input_name, type(inputs[i])))
if device != "cpu":
if device is not None and device.type != "cpu":
inputs[i] = inputs[i].to(device)
variable_map[input_name] = inputs[i]
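The coercion logic above, extracted into a standalone sketch (the helper name is illustrative): lists become numpy arrays, numpy arrays become tensors, and float64 is downcast to float32 to mirror scikit-learn's single-precision behavior.

import numpy as np
import torch

def _coerce_input(x):
    if type(x) is list:
        x = np.array(x)
    if type(x) is np.ndarray:
        x = torch.from_numpy(x)
        if x.dtype == torch.float64:
            # Match sklearn: double-precision inputs are downcast to single.
            x = x.float()
    elif type(x) is not torch.Tensor:
        raise RuntimeError("Input of unsupported type {}".format(type(x)))
    return x

print(_coerce_input([[1.0, 2.0]]).dtype)  # torch.float32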
@@ -312,16 +342,19 @@ class ONNXSklearnContainer(ABC):
def model(self):
return self._model
def _get_named_inputs(self, *inputs):
def _get_named_inputs(self, inputs):
"""
Retrieve the inputs names from the session object.
"""
if len(inputs) < len(self.input_names):
inputs = inputs[0]
assert len(inputs) == len(self.input_names)
named_inputs = {}
for i in range(len(inputs)):
named_inputs[self.input_names[i]] = inputs[i]
named_inputs[self.input_names[i]] = np.array(inputs[i])
return named_inputs
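A behavioral sketch of the revised _get_named_inputs (function name reused for illustration): a single tuple or list carrying all inputs is first unpacked, then each input is paired with the session's input names as a numpy array.

import numpy as np

def get_named_inputs(input_names, inputs):
    if len(inputs) < len(input_names):
        inputs = inputs[0]  # one tuple/list carrying all the inputs
    assert len(inputs) == len(input_names)
    return {name: np.array(x) for name, x in zip(input_names, inputs)}

print(get_named_inputs(["a", "b"], [[[1.0]], [[2.0]]]))  # {'a': array([[1.]]), 'b': array([[2.]])}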
@@ -341,7 +374,7 @@ class ONNXSklearnContainerTransformer(ONNXSklearnContainer):
Utility functions used to emulate the behavior of the Sklearn API.
On data transformers it returns transformed output data
"""
named_inputs = self._get_named_inputs(*inputs)
named_inputs = self._get_named_inputs(inputs)
return self._session.run(self._output_names, named_inputs)
@@ -368,7 +401,7 @@ class ONNXSklearnContainerRegression(ONNXSklearnContainer):
On classification tasks returns the predicted class labels for the input data.
On anomaly detection (e.g. isolation forest) returns the predicted classes (-1 or 1).
"""
named_inputs = self._get_named_inputs(*inputs)
named_inputs = self._get_named_inputs(inputs)
if self._is_regression:
return self._session.run(self._output_names, named_inputs)
@@ -393,7 +426,7 @@ class ONNXSklearnContainerClassification(ONNXSklearnContainerRegression):
Utility functions used to emulate the behavior of the Sklearn API.
On classification tasks returns the probability estimates.
"""
named_inputs = self._get_named_inputs(*inputs)
named_inputs = self._get_named_inputs(inputs)
return self._session.run([self._output_names[1]], named_inputs)[0]
@@ -415,7 +448,7 @@ class ONNXSklearnContainerAnomalyDetection(ONNXSklearnContainerRegression):
Utility functions used to emulate the behavior of the Sklearn API.
On anomaly detection (e.g. isolation forest) returns the decision function scores.
"""
named_inputs = self._get_named_inputs(*inputs)
named_inputs = self._get_named_inputs(inputs)
return np.array(self._session.run([self._output_names[1]], named_inputs)[0]).flatten()

View file

@@ -6,13 +6,18 @@
"""
All functions used for parsing input models are listed here.
Some code here has been copied from https://github.com/onnx/sklearn-onnx/.
"""
from collections import OrderedDict
from copy import deepcopy
from uuid import uuid4
from onnxconverter_common.container import CommonSklearnModelContainer
from onnxconverter_common.optimizer import LinkedNode, _topological_sort
from onnxconverter_common.topology import Topology
from sklearn import pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from . import constants
from ._container import CommonONNXModelContainer
@@ -162,33 +167,130 @@ def _parse_sklearn_single_model(scope, model, inputs):
def _parse_sklearn_pipeline(scope, model, inputs):
"""
The basic ideas of scikit-learn pipeline parsing:
The basic ideas of scikit-learn parsing:
1. Sequentially go through all stages defined in the considered
scikit-learn pipeline
2. The output `onnxconverter_common.topology.Variable`s of one stage will be fed into its next
2. The output variables of one stage will be fed into its next
stage as the inputs.
Args:
scope: The ``onnxconverter_common.topology.Scope`` for the model
model: A `sklearn.pipeline.Pipeline` object
inputs: A list of `onnxconverter_common.topology.Variable` objects
Returns:
A list of output `onnxconverter_common.topology.Variable`s produced by the input pipeline
:param scope: Scope object defined in _topology.py
:param model: scikit-learn pipeline object
:param inputs: A list of Variable objects
:return: A list of output variables produced by the input pipeline
"""
for step in model.steps:
inputs = _parse_sklearn_api(scope, step[1], inputs)
return inputs
def _build_sklearn_api_parsers_map():
from sklearn import pipeline
def _parse_sklearn_feature_union(scope, model, inputs):
"""
Taken from https://github.com/onnx/sklearn-onnx/blob/9939c089a467676f4ffe9f3cb91098c4841f89d8/skl2onnx/_parse.py#L199.
:param scope: Scope object
:param model: A scikit-learn FeatureUnion object
:param inputs: A list of Variable objects
:return: A list of output variables produced by feature union
"""
# Output variable name of each transform. It's a list of strings.
transformed_result_names = []
# Encode each transform as our IR object
for name, transform in model.transformer_list:
transformed_result_names.append(_parse_sklearn_single_model(scope, transform, inputs)[0])
if model.transformer_weights is not None and name in model.transformer_weights:
transform_result = [transformed_result_names.pop()]
# Create a Multiply node
multiply_operator = scope.declare_local_operator("SklearnMultiply")
multiply_operator.inputs = transform_result
multiply_operator.operand = model.transformer_weights[name]
multiply_output = scope.declare_local_variable("multiply_output")
multiply_operator.outputs.append(multiply_output)
transformed_result_names.append(multiply_operator.outputs[0])
# Create a Concat operator
concat_operator = scope.declare_local_operator("SklearnConcat")
concat_operator.inputs = transformed_result_names
# Declare output name of scikit-learn FeatureUnion
union_name = scope.declare_local_variable("union")
concat_operator.outputs.append(union_name)
return concat_operator.outputs
def _parse_sklearn_column_transformer(scope, model, inputs):
"""
Taken from https://github.com/onnx/sklearn-onnx/blob/9939c089a467676f4ffe9f3cb91098c4841f89d8/skl2onnx/_parse.py#L238.
:param scope: Scope object
:param model: A *scikit-learn* *ColumnTransformer* object
:param inputs: A list of Variable objects
:return: A list of output variables produced by column transformer
"""
assert (
len(inputs) < 2
), "Hummingbird currently supports ColumnTransformer over single inputs. Please fill an issue at https://github.com/microsoft/hummingbird."
# Output variable name of each transform. It's a list of strings.
transformed_result_names = []
# Encode each transform as our IR object
for name, op, column_indices in model.transformers_:
if op == "drop":
continue
if isinstance(column_indices, slice):
column_indices = list(
range(
column_indices.start if column_indices.start is not None else 0,
column_indices.stop,
column_indices.step if column_indices.step is not None else 1,
)
)
elif isinstance(column_indices, (int, str)):
column_indices = [column_indices]
pt_var, pt_is = _get_column_indices(column_indices, inputs)
transform_inputs = []
tr_inputs = _fetch_input_slice(scope, [inputs[pt_var]], pt_is)
transform_inputs.extend(tr_inputs)
model_obj = model.named_transformers_[name]
if isinstance(model_obj, str):
if model_obj == "passthrough":
var_out = transform_inputs[0]
elif model_obj == "drop":
var_out = None
else:
raise RuntimeError(
"Unknown operator alias " "'{0}'. These are specified in " "supported.py." "".format(model_obj)
)
else:
var_out = _parse_sklearn_api(scope, model_obj, transform_inputs)[0]
if model.transformer_weights is not None and name in model.transformer_weights:
# Create a Multiply node
multiply_operator = scope.declare_local_operator("SklearnMultiply")
multiply_operator.inputs.append(var_out)
multiply_operator.operand = model.transformer_weights[name]
var_out = scope.declare_local_variable("multiply_output")
multiply_operator.outputs.append(var_out)
if var_out:
transformed_result_names.append(var_out)
# Create a Concat node
if len(transformed_result_names) > 1:
concat_operator = scope.declare_local_operator("SklearnConcat")
concat_operator.inputs = transformed_result_names
# Declare output name of scikit-learn ColumnTransformer
transformed_column_name = scope.declare_local_variable("transformed_column")
concat_operator.outputs.append(transformed_column_name)
return concat_operator.outputs
return transformed_result_names
def _build_sklearn_api_parsers_map():
# Parsers for edge cases are going here.
map_parser = {
pipeline.Pipeline: _parse_sklearn_pipeline
# More will go here as added.
ColumnTransformer: _parse_sklearn_column_transformer,
pipeline.Pipeline: _parse_sklearn_pipeline,
pipeline.FeatureUnion: _parse_sklearn_feature_union,
# More parsers will go here
}
return map_parser
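A hedged end-to-end sketch of what these parsers enable (the pipeline below is illustrative; only integer column indices are supported, per _get_column_index further down):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

from hummingbird.ml import convert

X = np.random.rand(200, 4).astype(np.float32)
y = np.random.randint(2, size=200)

ct = ColumnTransformer([("std", StandardScaler(), [0, 1]), ("abs", MaxAbsScaler(), [2, 3])])
pipe = Pipeline([("features", ct), ("clf", LogisticRegression())]).fit(X, y)

hb_model = convert(pipe, "torch", test_input=X)
print(hb_model.predict(X[:5]))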
@@ -236,9 +338,7 @@ def _parse_onnx_single_operator(scope, operator):
this_operator = scope.declare_local_operator(alias, operator)
# Register the operator's inputs.
# LinkedNode uses dictionaries and with Python 3.5 the order is not deterministic.
input_names = list(operator.input.keys())
input_names.sort()
input_names = list(operator.origin.input)
this_operator.inputs = [scope.variables[in_] for in_ in input_names if in_ in scope.variables]
# Register the operator's outputs.
@@ -283,6 +383,81 @@ def _remove_zipmap(node_list):
return output_node_list
def _fetch_input_slice(scope, inputs, column_indices):
"""
Taken from https://github.com/onnx/sklearn-onnx/blob/9939c089a467676f4ffe9f3cb91098c4841f89d8/skl2onnx/_parse.py#L53.
"""
if not isinstance(inputs, list):
raise TypeError("Parameter inputs must be a list.")
if len(inputs) == 0:
raise RuntimeError("Operator ArrayFeatureExtractor requires at least one inputs.")
if len(inputs) != 1:
raise RuntimeError("Operator ArrayFeatureExtractor does not support multiple input tensors.")
array_feature_extractor_operator = scope.declare_local_operator("SklearnArrayFeatureExtractor")
array_feature_extractor_operator.inputs = inputs
array_feature_extractor_operator.column_indices = column_indices
output_variable_name = scope.declare_local_variable("extracted_feature_columns", inputs[0].type)
array_feature_extractor_operator.outputs.append(output_variable_name)
return array_feature_extractor_operator.outputs
def _get_column_index(i, inputs):
"""
Taken from https://github.com/onnx/sklearn-onnx/blob/9939c089a467676f4ffe9f3cb91098c4841f89d8/skl2onnx/common/utils.py#L50.
Returns a tuple (variable index, column index in that variable).
The function has two different behaviours, one when *i* (column index)
is an integer, another one when *i* is a string (column name).
If *i* is a string, the function looks for the input with that name and returns (index, 0).
If *i* is an integer, let's assume first we have two inputs
*I0 = FloatTensorType([None, 2])* and *I1 = FloatTensorType([None, 3])*,
in this case, here are the results:
::
get_column_index(0, inputs) -> (0, 0)
get_column_index(1, inputs) -> (0, 1)
get_column_index(2, inputs) -> (1, 0)
get_column_index(3, inputs) -> (1, 1)
get_column_index(4, inputs) -> (1, 2)
"""
if isinstance(i, int):
if i == 0:
# Useful shortcut, skips the case when end is None
# (unknown dimension)
return 0, 0
vi = 0
return (vi, i)
else:
raise RuntimeError("Hummingbird currently support only int columns, {} is not supported.".format(i))
def _get_column_indices(indices, inputs):
"""
Taken from https://github.com/onnx/sklearn-onnx/blob/9939c089a467676f4ffe9f3cb91098c4841f89d8/skl2onnx/common/utils.py#L105.
Returns the requested graph inputs based on their indices or names. See `_parse._get_column_index`.
Args:
indices: variables indices or names
inputs: model inputs
Returns:
a tuple *(variable index, list of requested column indices)*
"""
pt_var = None
pt_is = []
for p in indices:
pt_v, pt_i = _get_column_index(p, inputs)
pt_is.append(pt_i)
if pt_var is None:
pt_var = pt_v
elif pt_var != pt_v:
raise NotImplementedError(
"Hummingbird is not able to merge multiple columns from "
"multiple variables ({0}). You should think about merging "
"initial types.".format([pt_var, pt_v])
)
return pt_var, pt_is
# Registered API parsers.
if sklearn_installed():
sklearn_api_parsers_map = _build_sklearn_api_parsers_map()
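A quick behavioral sketch of the restricted column-index helpers above (simplified copies, not the library functions): with a single input variable, every integer column maps to variable 0.

def get_column_index(i, inputs):
    if isinstance(i, int):
        return (0, i)
    raise RuntimeError("only int columns are supported")

def get_column_indices(indices, inputs):
    return 0, [get_column_index(p, inputs)[1] for p in indices]

print(get_column_indices([0, 2, 3], inputs=None))  # (0, [0, 2, 3])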

View file

@@ -93,10 +93,17 @@ def convert(topology, backend, device, extra_config={}):
if output_model_name is None:
output_model_name = str(uuid4().hex) + ".onnx"
# Put the tracing test input into the right format.
trace_input = extra_config[constants.TEST_INPUT]
if type(trace_input) is tuple:
trace_input = tuple([torch.from_numpy(i) for i in trace_input])
else:
trace_input = torch.from_numpy(trace_input)
# Generate the ONNX models
torch.onnx.export(
torch_model,
torch.from_numpy(extra_config[constants.TEST_INPUT]),
trace_input,
output_model_name,
input_names=topology.raw_model.input_names,
output_names=topology.raw_model.output_names,
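The tuple handling above can be exercised in isolation (shapes are illustrative): multi-input models are traced with a tuple of tensors, single-input models with one tensor.

import numpy as np
import torch

test_input = (np.zeros((2, 3), dtype=np.float32), np.ones((2, 1), dtype=np.float32))
if type(test_input) is tuple:
    trace_input = tuple([torch.from_numpy(i) for i in test_input])
else:
    trace_input = torch.from_numpy(test_input)
print([t.shape for t in trace_input])  # [torch.Size([2, 3]), torch.Size([2, 1])]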

View file

@@ -100,6 +100,17 @@ def xgboost_installed():
return True
def pandas_installed():
"""
Checks that *Pandas* is available.
"""
try:
import pandas
except ImportError:
return False
return True
class _Constants(object):
"""
Class enabling the proper definition of constants.

View file

@@ -163,9 +163,9 @@ def _convert_onnxml(model, backend, test_input, device, extra_config={}):
type(initial_types[0][1])
)
)
else:
extra_config[constants.N_FEATURES] = np.array(test_input).shape[1]
extra_config[constants.TEST_INPUT] = test_input
extra_config[constants.TEST_INPUT] = test_input
elif constants.N_FEATURES not in extra_config:
extra_config[constants.N_FEATURES] = test_input.shape[1]
# Set the initializers. Some converter requires the access to initializers.
initializers = {} if model.graph.initializer is None else {in_.name: in_ for in_ in model.graph.initializer}
@@ -219,6 +219,14 @@ def convert(model, backend, test_input=None, device="cpu", extra_config={}):
if test_input is not None and constants.TEST_INPUT not in extra_config:
extra_config[constants.TEST_INPUT] = test_input
# Fix the test_input type
if constants.TEST_INPUT in extra_config:
if type(extra_config[constants.TEST_INPUT]) == list:
extra_config[constants.TEST_INPUT] = np.array(extra_config[constants.TEST_INPUT])
elif type(extra_config[constants.TEST_INPUT]) == tuple:
extra_config[constants.N_FEATURES] = len(extra_config[constants.TEST_INPUT])
test_input = extra_config[constants.TEST_INPUT]
# We do some normalization on backends.
backend = backend.lower()
backend = backends[backend]
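The test_input normalization above, as a standalone sketch (it assumes constants.TEST_INPUT and constants.N_FEATURES are plain string keys):

import numpy as np

TEST_INPUT, N_FEATURES = "test_input", "n_features"

def fix_test_input(extra_config):
    if TEST_INPUT in extra_config:
        if type(extra_config[TEST_INPUT]) == list:
            extra_config[TEST_INPUT] = np.array(extra_config[TEST_INPUT])
        elif type(extra_config[TEST_INPUT]) == tuple:
            # A tuple signals a multi-input model: one entry per input.
            extra_config[N_FEATURES] = len(extra_config[TEST_INPUT])
    return extra_config

print(fix_test_input({TEST_INPUT: [[1.0, 2.0]]})[TEST_INPUT].shape)  # (1, 2)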

View file

@@ -32,6 +32,7 @@ from .sklearn import iforest # noqa: E402
from .sklearn import linear as sklearn_linear # noqa: E402
from .sklearn import normalizer as sklearn_normalizer # noqa: E402
from .sklearn import one_hot_encoder as sklearn_ohe # noqa: E402
from .sklearn import pipeline # noqa: E402
from .sklearn import scaler as sklearn_scaler # noqa: E402
from .sklearn import sv # noqa: E402
from . import lightgbm # noqa: E402

View file

@@ -25,6 +25,8 @@ class Cast(BaseOperator, torch.nn.Module):
self.to_type = to_type
def forward(self, x):
if self.to_type == 1: # Cast to float
return x.float()
if self.to_type == 7: # Cast to long
return x.long()
@@ -34,7 +36,10 @@ class Concat(BaseOperator, torch.nn.Module):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)
if len(x[0].shape) > 1:
return torch.cat(x, dim=1)
else:
return torch.stack(x, dim=1)
class Reshape(BaseOperator, torch.nn.Module):
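The 1-D fallback matters because `torch.cat(..., dim=1)` needs a second dimension to exist; a short sketch of the two branches:

import torch

# 2-D inputs: concatenate along the feature axis.
a2d, b2d = torch.ones(4, 2), torch.zeros(4, 3)
print(torch.cat((a2d, b2d), dim=1).shape)    # torch.Size([4, 5])

# 1-D inputs (e.g. single-column transformer outputs): stacking
# creates the missing feature axis.
a1d, b1d = torch.ones(4), torch.zeros(4)
print(torch.stack((a1d, b1d), dim=1).shape)  # torch.Size([4, 2])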

View file

@@ -30,7 +30,7 @@ def convert_sklearn_linear_model(operator, device, extra_config):
"""
classes = [0] if not hasattr(operator.raw_operator, "classes_") else operator.raw_operator.classes_
if not all([type(x) in [int, np.int32, np.int64] for x in classes]):
if not all(["int" in str(type(x)) for x in classes]):
raise RuntimeError(
"Hummingbird currently supports only integer labels for class labels. Please file an issue at https://github.com/microsoft/hummingbird."
)
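The string-based check is deliberately broader than the old membership test: it accepts any NumPy integer scalar type, not only `np.int32`/`np.int64`. A small example with an unsigned dtype (chosen only for illustration):

import numpy as np

classes = np.array([0, 1], dtype=np.uint32)
# The old membership test misses unsigned/platform-specific integer types...
print(all(type(x) in [int, np.int32, np.int64] for x in classes))  # False
# ...while the string check accepts any numpy integer scalar.
print(all("int" in str(type(x)) for x in classes))                 # True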

View file

@@ -0,0 +1,102 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""
Converters for operators necessary for supporting scikit-learn Pipelines.
"""
import numpy as np
from onnxconverter_common.registration import register_converter
import torch
from .. import constants
from .._array_feature_extractor_implementations import ArrayFeatureExtractor
from .._base_operator import BaseOperator
class Concat(BaseOperator, torch.nn.Module):
"""
Module used to concatenate tensors into a single tensor.
"""
def __init__(self):
super(Concat, self).__init__()
def forward(self, *x):
return torch.cat(x, dim=1)
class Multiply(BaseOperator, torch.nn.Module):
"""
Module used to multiply features in a pipeline by a score.
"""
def __init__(self, score):
super(Multiply, self).__init__()
self.score = score
def forward(self, x):
return x * self.score
def convert_sklearn_array_feature_extractor(operator, device, extra_config):
"""
Converter for ArrayFeatureExtractor.
Args:
operator: An operator wrapping an ArrayFeatureExtractor operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
"""
assert operator is not None
indices = operator.column_indices
return ArrayFeatureExtractor(np.ascontiguousarray(indices), device)
def convert_sklearn_concat(operator, device=None, extra_config={}):
"""
Converter for concat operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
"""
return Concat()
def convert_sklearn_multiply(operator, device=None, extra_config={}):
"""
Converter for multiply operators injected when parsing Sklearn pipelines.
Args:
operator: An empty operator
device: String defining the type of device the converted operator should be run on
extra_config: Extra configuration used to select the best conversion strategy
Returns:
A PyTorch model
"""
assert operator is not None
assert hasattr(operator, "operand")
score = operator.operand
# Generate the model.
return Multiply(score)
register_converter("SklearnArrayFeatureExtractor", convert_sklearn_array_feature_extractor)
register_converter("SklearnConcat", convert_sklearn_concat)
register_converter("SklearnMultiply", convert_sklearn_multiply)

View file

@@ -220,7 +220,17 @@ def _build_sklearn_api_operator_name_map():
Associate Sklearn with the operator class names.
If two scikit-learn (API) models share a single name, it means they are equivalent in terms of conversion.
"""
return {k: "Sklearn" + k.__name__ for k in sklearn_operator_list + xgb_operator_list + lgbm_operator_list}
# Pipeline ops. These are ops injected by the parser, not "real" sklearn operators.
pipeline_operator_list = [
"ArrayFeatureExtractor",
"Concat",
"Multiply",
]
return {
k: "Sklearn" + k.__name__ if hasattr(k, "__name__") else k
for k in sklearn_operator_list + pipeline_operator_list + xgb_operator_list + lgbm_operator_list
}
def _build_onnxml_api_operator_name_map():
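Note that the conditional expression in the comprehension parses as `("Sklearn" + k.__name__) if hasattr(k, "__name__") else k`: estimator classes get the `Sklearn` prefix, while the string names injected for pipeline ops pass through unchanged. A hypothetical class makes the two cases concrete:

class FakeScaler:  # hypothetical stand-in for a scikit-learn estimator class
    pass

entries = {
    k: "Sklearn" + k.__name__ if hasattr(k, "__name__") else k
    for k in [FakeScaler, "Concat"]
}
print(entries)  # {<class 'FakeScaler'>: 'SklearnFakeScaler', 'Concat': 'Concat'}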

View file

@@ -0,0 +1,50 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import unittest
import numpy as np
from sklearn.datasets import load_digits, load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import hummingbird.ml
class TestSklearnFeatureUnion(unittest.TestCase):
def test_feature_union_default(self):
data = load_iris()
X, y = data.data, data.target
X = X.astype(np.float32)
X_train, X_test, *_ = train_test_split(X, y, test_size=0.5, random_state=42)
model = FeatureUnion([("standard", StandardScaler()), ("minmax", MinMaxScaler())]).fit(X_train)
torch_model = hummingbird.ml.convert(model, "torch")
np.testing.assert_allclose(
model.transform(X_test), torch_model.transform(X_test), rtol=1e-06, atol=1e-06,
)
def test_feature_union_transformer_weights(self):
data = load_iris()
X, y = data.data, data.target
X = X.astype(np.float32)
X_train, X_test, *_ = train_test_split(X, y, test_size=0.5, random_state=42)
model = FeatureUnion(
[("standard", StandardScaler()), ("minmax", MinMaxScaler())], transformer_weights={"standard": 2, "minmax": 4}
).fit(X_train)
torch_model = hummingbird.ml.convert(model, "torch")
np.testing.assert_allclose(
model.transform(X_test), torch_model.transform(X_test), rtol=1e-06, atol=1e-06,
)
if __name__ == "__main__":
unittest.main()

View file

@@ -0,0 +1,417 @@
import unittest
import numpy as np
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import hummingbird.ml
from hummingbird.ml._utils import pandas_installed
if pandas_installed():
import pandas
class TestSklearnPipeline(unittest.TestCase):
def test_pipeline(self):
data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
scaler = StandardScaler()
scaler.fit(data)
model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06,
)
def test_pipeline2(self):
data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
scaler = StandardScaler()
scaler.fit(data)
model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06,
)
def test_combine_inputs_union_in_pipeline(self):
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
model = Pipeline(
[
("scaler1", StandardScaler()),
("union", FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())])),
]
)
model.fit(data)
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06,
)
def test_combine_inputs_floats_ints(self):
data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
scaler = StandardScaler()
scaler.fit(data)
model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_1(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1, 2] # ["vA", "vB", "vC"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features)])
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1, 2] # ["vA", "vB", "vC"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_weights(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1, 2] # ["vA", "vB", "vC"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
transformer_weights={"num": 2, "cat": 3},
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_drop(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1] # ["vA", "vB"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
transformer_weights={"num": 2, "cat": 3},
remainder="drop",
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_drop_noweights(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1] # ["vA", "vB"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
remainder="drop",
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_passthrough(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1] # ["vA", "vB"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
transformer_weights={"num": 2, "cat": 3},
remainder="passthrough",
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_passthrough_noweights(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = [0, 1] # ["vA", "vB"]
categorical_features = [3, 4] # ["vcat", "vcat2"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
remainder="passthrough",
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
@unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_pipeline_column_transformer_passthrough_slice(self):
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
y_train = y % 2
numeric_features = slice(0, 1) # ["vA"]; a slice excludes its stop index
categorical_features = slice(3, 4) # ["vcat"]
classifier = LogisticRegression(
C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3,
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
transformer_weights={"num": 2, "cat": 3},
remainder="passthrough",
)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
X_test = X_train[:11]
torch_model = hummingbird.ml.convert(model, "torch")
self.assertTrue(torch_model is not None)
np.testing.assert_allclose(
model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06,
)
if __name__ == "__main__":
unittest.main()