Integrating LotusIR and a few CNTK-ONNX fixes

liqfu 2017-11-15 21:44:46 -08:00
Parent 52d902f58d
Commit 578c1898a0
36 changed files: 3271 additions and 2369 deletions

View file

@ -486,7 +486,7 @@ CNTKLIBRARY_COMMON_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/tensorboard/TensorBoardFileWriter.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/tensorboard/TensorBoardUtils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/protobuf/graph.pb.cc \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/experiments/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/activation/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/generator/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/logical/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/math/defs.cpp \
@ -494,6 +494,7 @@ CNTKLIBRARY_COMMON_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/reduction/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/rnn/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/tensor/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/defs/traditionalml/defs.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/core/constants.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/core/status.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/onnx/core/utils.cpp \

View file

@ -179,8 +179,10 @@
<ClInclude Include="proto\onnx\core\model.h" />
<ClInclude Include="proto\onnx\core\op.h" />
<ClInclude Include="proto\onnx\core\opsignature.h" />
<ClInclude Include="proto\onnx\core\record.h" />
<ClInclude Include="proto\onnx\core\shape_inference.h" />
<ClInclude Include="proto\onnx\core\status.h" />
<ClInclude Include="proto\onnx\core\tensorutils.h" />
<ClInclude Include="proto\onnx\core\utils.h" />
<ClInclude Include="proto\onnx\ONNX.h" />
<ClInclude Include="proto\onnx\ONNXToCNTK.h" />
@ -224,8 +226,9 @@
<ClCompile Include="proto\onnx\core\opsignature.cpp" />
<ClCompile Include="proto\onnx\core\shape_inference.cpp" />
<ClCompile Include="proto\onnx\core\status.cpp" />
<ClCompile Include="proto\onnx\core\tensorutils.cpp" />
<ClCompile Include="proto\onnx\core\utils.cpp" />
<ClCompile Include="proto\onnx\defs\experiments\defs.cpp" />
<ClCompile Include="proto\onnx\defs\activation\defs.cpp" />
<ClCompile Include="proto\onnx\defs\generator\defs.cpp" />
<ClCompile Include="proto\onnx\defs\logical\defs.cpp" />
<ClCompile Include="proto\onnx\defs\math\defs.cpp" />
@ -233,6 +236,7 @@
<ClCompile Include="proto\onnx\defs\reduction\defs.cpp" />
<ClCompile Include="proto\onnx\defs\rnn\defs.cpp" />
<ClCompile Include="proto\onnx\defs\tensor\defs.cpp" />
<ClCompile Include="proto\onnx\defs\traditionalml\defs.cpp" />
<ClCompile Include="proto\onnx\ONNX.cpp" />
<ClCompile Include="proto\onnx\ONNXToCNTK.cpp" />
<ClCompile Include="proto\onnx\Operators.cpp" />

View file

@ -76,14 +76,20 @@
<ClCompile Include="proto\onnx\core\utils.cpp">
<Filter>proto\onnx\core</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\experiments\defs.cpp">
<Filter>proto\onnx\defs\experiments</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\generator\defs.cpp">
<Filter>proto\onnx\defs\generator</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\logical\defs.cpp">
<Filter>proto\onnx\defs\logical</Filter>
<ClCompile Include="proto\onnx\defs\tensor\defs.cpp">
<Filter>proto\onnx\defs\tensor</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\core\tensorutils.cpp">
<Filter>proto\onnx\core</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\traditionalml\defs.cpp">
<Filter>proto\onnx\defs\traditionalml</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\activation\defs.cpp">
<Filter>proto\onnx\defs\activation</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\math\defs.cpp">
<Filter>proto\onnx\defs\math</Filter>
@ -97,8 +103,8 @@
<ClCompile Include="proto\onnx\defs\rnn\defs.cpp">
<Filter>proto\onnx\defs\rnn</Filter>
</ClCompile>
<ClCompile Include="proto\onnx\defs\tensor\defs.cpp">
<Filter>proto\onnx\defs\tensor</Filter>
<ClCompile Include="proto\onnx\defs\logical\defs.cpp">
<Filter>proto\onnx\defs\logical</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
@ -169,6 +175,12 @@
<ClInclude Include="proto\onnx\core\utils.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
<ClInclude Include="proto\onnx\core\record.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
<ClInclude Include="proto\onnx\core\tensorutils.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="API">
@ -192,9 +204,6 @@
<Filter Include="proto\onnx\defs">
<UniqueIdentifier>{cb1e39c1-bd5e-4d7f-8c83-39de28e70307}</UniqueIdentifier>
</Filter>
<Filter Include="proto\onnx\defs\experiments">
<UniqueIdentifier>{6168d648-8e32-4ad7-b16d-78d6a2e7a461}</UniqueIdentifier>
</Filter>
<Filter Include="proto\onnx\defs\generator">
<UniqueIdentifier>{e52f27a1-b8c4-4d67-874f-51aa43b04815}</UniqueIdentifier>
</Filter>
@ -216,6 +225,12 @@
<Filter Include="proto\onnx\defs\tensor">
<UniqueIdentifier>{deb73515-13a1-4926-b5fc-e8c6e97f7784}</UniqueIdentifier>
</Filter>
<Filter Include="proto\onnx\defs\activation">
<UniqueIdentifier>{2b7c76e2-1171-4fdc-9c7e-3012124dae52}</UniqueIdentifier>
</Filter>
<Filter Include="proto\onnx\defs\traditionalml">
<UniqueIdentifier>{93c3d21c-112b-43df-b399-5e87b60020b1}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<Proto Include="proto\CNTK.proto">

View file

@ -6,6 +6,7 @@
#include "CNTKToONNX.h"
#include "proto/onnx/core/model.h"
#include "proto/onnx/core/graph.h"
#include "proto/onnx/core/status.h"
#include "Utils.h"
#include "Operators.h"
@ -166,9 +167,9 @@ std::unique_ptr<ONNXIR::Model> CNTKToONNX::CreateModel(const FunctionPtr& src)
std::unique_ptr<ONNXIR::Model> model(new ONNXIR::Model("CNTKGraph", true));
auto dstGraph = model->MainGraph();
CNTKToONNXHelper::Copy(src, dstGraph);
ONNXIR::Status status = dstGraph->Resolve();
ONNXIR::Common::Status status = dstGraph->Resolve();
if (!status.Ok())
LogicError("%s", status.ErrorMsg().c_str());
LogicError("%s", status.ErrorMessage().c_str());
model->SetIrVersion(static_cast<ONNXIR::VERSION>(CNTK_ONNX_MODEL_VERSION));
return model;
}
@ -242,7 +243,7 @@ int CNTKToONNXHelper::ToIndex(const Axis& axis)
if (axis.IsBatchAxis())
return 0;
return axis.StaticAxisIndex() + 1;
return axis.StaticAxisIndex();
}
ONNXIR::TypeProto CNTKToONNXHelper::ToTypeProto(const NDShape& shape, bool hasBatchAxis)
@ -670,7 +671,7 @@ void CNTKToONNXHelper::CopyAttributes(const FunctionPtr& src, ONNXIR::Node* node
node->AddAttribute(attributesMap[L"reductionKeepDimensions"], keepReducedDimensions);
node->AddAttribute("axes", ToINTS(reductionAxes));
}
else if (src->OpName() == L"Transpose")
else if (src->OpName() == L"TransposeAxes")
{
std::vector<Axis> perm = AsVector<Axis>(src->Attributes()[L"axisVec"].Value<std::vector<DictionaryValue>>());
node->AddAttribute(attributesMap[L"axisVec"], ToINTS(perm));
@ -705,8 +706,8 @@ void CNTKToONNXHelper::CopyAttributes(const FunctionPtr& src, ONNXIR::Node* node
}
node->AddAttribute(attributesMap[L"axes"], ToINTS(sliceAxes));
node->AddAttribute(attributesMap[L"starts"], ToINTS(beginIndex));
node->AddAttribute(attributesMap[L"ends"], ToINTS(endIndex));
node->AddAttribute(attributesMap[L"beginIndexVec"], ToINTS(beginIndex));
node->AddAttribute(attributesMap[L"endIndexVec"], ToINTS(endIndex));
}
else if (src->OpName() == L"Softmax")
{
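The Slice hunk above switches the AddAttribute lookups to CNTK-side attribute names; the rename table in the Operators.cpp diff further down maps them onto the ONNX names actually written to the node. A minimal sketch of that indirection, with the mappings taken from that table:

    // Sketch only: attributesMap is keyed by CNTK attribute names and yields
    // the ONNX attribute names emitted on the node (see Operators.cpp below).
    //   attributesMap[L"beginIndexVec"] -> "starts"
    //   attributesMap[L"endIndexVec"]   -> "ends"
    //   attributesMap[L"axisVec"]       -> "perm"   (TransposeAxes -> Transpose)
    node->AddAttribute(attributesMap[L"beginIndexVec"], ToINTS(beginIndex));
    node->AddAttribute(attributesMap[L"endIndexVec"], ToINTS(endIndex));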

View file

@ -10,6 +10,7 @@
#include "Utils.h"
#include <iostream>
#include <memory>
#include "ONNXToCNTK.h"
@ -54,22 +55,16 @@ void ONNXFormat::Save(const FunctionPtr& src, const std::wstring& filepath)
FunctionPtr ONNXFormat::Load(const std::wstring& filepath, const DeviceDescriptor& computeDevice)
{
ONNXIR::ModelProto modelProto;
std::shared_ptr<ONNXIR::Model> model;
#ifdef _WIN32
bool loadStatus = ONNXIR::Model::Load(filepath, &modelProto);
Status loadStatus = ONNXIR::Model::Load(filepath, &model);
#else
bool loadStatus = ONNXIR::Model::Load(ToString(filepath), &modelProto);
Status loadStatus = ONNXIR::Model::Load(ToString(filepath), &model);
#endif
loadStatus;
//if (!loadStatus)
// LogicError("Failed to load the model.");
if (!loadStatus.Ok())
LogicError("Failed to load the model.");
ONNXIR::Model model(modelProto);
auto status = model.MainGraph()->Resolve();
if (!status.Ok())
LogicError("%s", status.ErrorMsg().c_str());
FunctionPtr cntkFunction = ONNXToCNTK::CreateGraph(model.MainGraph(), computeDevice);
FunctionPtr cntkFunction = ONNXToCNTK::CreateGraph(model->MainGraph(), computeDevice);
return cntkFunction;
}
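The load path above now goes through the Status-returning Model::Load overloads declared in model.h (see that diff below). A caller-side sketch of the same pattern, assuming a narrow-string path:

    // Sketch only: mirrors the hunk above using the new ONNXIR Status API.
    std::shared_ptr<ONNXIR::Model> model;
    ONNXIR::Common::Status loadStatus = ONNXIR::Model::Load("model.onnx", &model);
    if (!loadStatus.Ok())
        LogicError("%s", loadStatus.ErrorMessage().c_str());
    FunctionPtr cntkFunction = ONNXToCNTK::CreateGraph(model->MainGraph(), computeDevice);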

View file

@ -73,6 +73,9 @@ private:
static size_t GetNamedAttributeAsInt64(const Node *node, const string &attributeName);
static size_t GetNamedAttributeAsInt64(const Node *node, const string &attributeName, size_t defaultValue);
static std::vector<int> VecInt64ToVecInt(const std::vector<int64_t> &vecInt64);
static std::vector<int64_t> VecIntToVecInt64(const std::vector<int> &vecInt);
static float GetNamedAttributeAsFloat(const Node *node, const string &attributeName);
static float GetNamedAttributeAsFloat(const Node *node, const string &attributeName, float defaultValue);
@ -101,12 +104,22 @@ private:
std::vector<Axis> ONNXToCNTKHelper::AttributeProtoToAxes(const AttributeProto &attributeProto)
{
std::vector<int64_t> ints(attributeProto.ints().begin(), attributeProto.ints().end());
std::vector<Axis> axes;
std::vector<int64_t> ints(attributeProto.ints().begin(), attributeProto.ints().end());
// axes may get saved as collection or a single
// int CNTKToONNXHelper::ToIndex(const Axis& axis) applies axis.StaticAxisIndex() + 1
// to get index for ONNX. Deduct by one to get index in CNTK
if (!ints.empty())
{
for (std::vector<int64_t>::const_iterator it = ints.begin(); it != ints.end(); it++)
{
axes.push_back(Axis((int)(*it)));
}
}
else
{
axes.push_back(Axis((int)(attributeProto.i())));
}
return axes;
}
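A worked example of the two encodings AttributeProtoToAxes now accepts:

    // Collection form: AttributeProto with ints = [1, 3]  ->  { Axis(1), Axis(3) }
    // Single-int form: AttributeProto with i = 2          ->  { Axis(2) }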
@ -172,7 +185,7 @@ DataType ONNXToCNTKHelper::FromONNXType(ONNXIR::TypeProto type)
}
}
// helpers copied from Lotus (Converter.cc). These functions will eventually
// helpers copied from ONNXIR (Converter.cc). These functions will eventually
// be replaced with functionalities of onnx core.
bool IsLittleEndianOrder()
{
@ -479,7 +492,7 @@ std::vector<Axis> ONNXToCNTKHelper::GetNamedAttributeAsAxis(const Node *node, co
std::vector<Axis> ONNXToCNTKHelper::GetNamedAttributeAsAxis(const Node *node, const string &attributeName,
const std::vector<Axis> &defaultAxes)
{
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, true);
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, false);
if (itValue == node->GetAttributes().end())
{
return defaultAxes;
@ -582,7 +595,7 @@ float ONNXToCNTKHelper::GetNamedAttributeAsFloat(const Node *node, const string
string ONNXToCNTKHelper::GetNamedAttributeAsString(const Node *node, const string &attributeName)
{
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, false);
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, true);
const AttributeProto &attributeProto = itValue->second;
return attributeProto.s();
}
@ -600,7 +613,7 @@ string ONNXToCNTKHelper::GetNamedAttributeAsString(const Node *node, const strin
std::vector<int64_t> ONNXToCNTKHelper::GetNamedAttributeAsInt64Vec(const Node *node, const string &attributeName)
{
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, false);
NodeAttributes::const_iterator itValue = FindAttributeIterator(node, attributeName, true);
const AttributeProto &attributeProto = itValue->second;
std::vector<int64_t> intVector(attributeProto.ints().begin(), attributeProto.ints().end());
return intVector;
@ -619,11 +632,33 @@ std::vector<int64_t> ONNXToCNTKHelper::GetNamedAttributeAsInt64Vec(const Node *n
return intVector;
}
std::vector<int> ONNXToCNTKHelper::VecInt64ToVecInt(const std::vector<int64_t> &vecInt64)
{
std::vector<int> vecInt(vecInt64.size());
for (int i = 0; i < vecInt64.size(); i++)
{
vecInt[i] = static_cast<int>(vecInt64[i]);
}
return vecInt;
}
std::vector<int64_t> ONNXToCNTKHelper::VecIntToVecInt64(const std::vector<int> &vecInt)
{
std::vector<int64_t> vecInt64(vecInt.size());
for (int i = 0; i < vecInt.size(); i++)
{
vecInt64[i] = vecInt[i];
}
return vecInt64;
}
namespace CNTK
{
static void PrintGraph(FunctionPtr function, int spaces, bool useName = false)
{
if (function->Inputs().size() == 0)
if (function->Inputs().empty())
{
cout << string(spaces, '.') + "(" + ToString(useName ? function->Name() : function->Uid()) + ")" + ToString(function->AsString()) << std::endl;
return;
@ -1134,9 +1169,9 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
else if (onnxOpName == "Concat")
{
std::vector<Axis> axes;
if (HasNamedAttribute(node, "axes"))
if (HasNamedAttribute(node, "axis"))
{
axes = GetNamedAttributeAsAxis(node, "axes");
axes = GetNamedAttributeAsAxis(node, "axis");
}
else
{
@ -1144,6 +1179,12 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
axes.push_back(Axis(inputs[0].Shape().Rank() - 1));
}
if (axes.empty())
{
// default axis
axes.push_back(Axis(0));
}
CheckForAxes(node->Name(), axes, 1);
FunctionPtr cntkFunction = Splice(inputs, axes[0], ToWString(node->Name()));
return cntkFunction;
@ -1151,10 +1192,31 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
// { L"", "Split)
else if (onnxOpName == "Slice")
{
std::vector<Axis> axes = GetNamedAttributeAsAxis(node, "axes");
std::vector<int> beginIndex;
std::vector<int> endIndex;
FunctionPtr cntkFunction = Slice(inputs[0], axes, beginIndex, endIndex, ToWString(node->Name()));
// axes is optional so provide a default
std::vector<Axis> axes;
axes = GetNamedAttributeAsAxis(node, "axes", axes);
std::vector<int64_t> starts64 = GetNamedAttributeAsInt64Vec(node, "starts");
std::vector<int64_t> ends64 = GetNamedAttributeAsInt64Vec(node, "ends");
if (starts64.size() != ends64.size())
{
LogicError("starts (of size %d) and ends (of size %d) attributes of Slice operation must be the same size.",
(int)starts64.size(), (int)ends64.size());
}
std::vector<int> starts = VecInt64ToVecInt(starts64);
std::vector<int> ends = VecInt64ToVecInt(ends64);
if (axes.empty())
{
for (int i = 0; i < starts.size(); i++)
{
Axis axis(i);
axes.push_back(axis);
}
}
FunctionPtr cntkFunction = Slice(inputs[0], axes, starts, ends, ToWString(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "Transpose")
@ -1348,7 +1410,7 @@ FunctionPtr ONNXToCNTK::CreateGraph(ONNXIR::Graph* src, const DeviceDescriptor&
functions.push_back(constructedFunctions[*it]);
}
if (functions.size() == 0)
if (functions.empty())
{
return nullptr;
}
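The Slice rewrite above makes the ONNX "axes" attribute optional: when absent, one axis per starts/ends pair is assumed, counting from 0. A worked example of the defaulting:

    // Sketch only. Given an ONNX Slice node with
    //   starts = [0, 1], ends = [2, 3]      (no "axes" attribute)
    // the code above synthesizes axes = [Axis(0), Axis(1)] and calls
    //   Slice(inputs[0], {Axis(0), Axis(1)}, {0, 1}, {2, 3}, name);
    // i.e. elements 0..1 along axis 0 and 1..2 along axis 1 (ends exclusive in CNTK).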

View file

@ -256,12 +256,12 @@ namespace ONNX
{ L"beginIndexVec", "starts" },
{ L"endIndexVec", "ends" },
} } },
{ L"Transpose", { {
{ L"Transpose", "Transpose" },
{ L"TransposeAxes", { {
{ L"TransposeAxes", "Transpose" },
{ L"axisVec", "perm" },
} } },
{ L"GatherOp", { {
{ L"GatherOp", "Gather" },
{ L"Gather", { {
{ L"Gather", "Gather" },
} } },
// { L"", "Squeeze" },
};

View file

@ -1,23 +1,45 @@
#include "constants.h"
#include "utils.h"
namespace ONNXIR
{
TypesWrapper& TypesWrapper::GetTypesWrapper()
{
static TypesWrapper* types = new TypesWrapper();
return *types;
static TypesWrapper types;
return types;
}
std::unordered_set<std::string>& TypesWrapper::GetAllowedDataTypes()
{
static std::unordered_set<std::string>* allowedDataTypes =
new std::unordered_set<std::string>({
static std::unordered_set<std::string> allowedDataTypes = {
c_float16, c_float, c_double,
c_int8, c_int16, c_int32, c_int64,
c_uint8, c_uint16, c_uint32, c_uint64,
c_complex64, c_complex128,
c_string, c_bool });
return *allowedDataTypes;
c_string, c_bool };
return allowedDataTypes;
}
TypeStringsInitializer& TypeStringsInitializer::InitializeTypeStrings()
{
static TypeStringsInitializer initTypes;
return initTypes;
}
TypeStringsInitializer::TypeStringsInitializer()
{
// Initialize TypeStrToProtoMap using common type strings.
for (const auto& t : m_commonTypeStrings)
{
Utils::OpUtils::ToType(t);
}
}
// This ensures all static objects related to type strings get initialized.
// TypeStringsInitializer constructor populates TypeStrToProtoMap with common type strings.
// TypesWrapper() gets instantiated via call to OpUtils::FromString()
// which calls GetTypesWrapper().
// Note: due to non-deterministic static initialization order, some of the type strings
// may have already been added via Op Registrations which use those type strings.
static TypeStringsInitializer& _typeStrings = TypeStringsInitializer::InitializeTypeStrings();
}
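Both accessors above move from heap-allocated static pointers to function-local statics, i.e. the standard Meyers singleton: constructed on first use and initialized thread-safely under C++11. A self-contained sketch of the pattern (Registry and Names are illustrative names, not from the commit):

    #include <string>
    #include <unordered_set>

    class Registry
    {
    public:
        // Constructed once, on first call; C++11 guarantees thread-safe init.
        static Registry& Get()
        {
            static Registry instance;
            return instance;
        }
        std::unordered_set<std::string>& Names() { return m_names; }
    private:
        Registry() = default;
        std::unordered_set<std::string> m_names;
    };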

View file

@ -1,5 +1,6 @@
#include <string>
#include <unordered_set>
#include <vector>
namespace ONNXIR
{
@ -15,7 +16,6 @@ namespace ONNXIR
{
public:
static TypesWrapper& GetTypesWrapper();
// DataType strings. These should match the DataTypes defined in Data.proto
const std::string c_float16 = "float16";
const std::string c_float = "float";
@ -32,6 +32,7 @@ namespace ONNXIR
const std::string c_complex128 = "complex128";
const std::string c_string = "string";
const std::string c_bool = "bool";
std::unordered_set<std::string>& GetAllowedDataTypes();
~TypesWrapper() = default;
TypesWrapper(const TypesWrapper&) = delete;
@ -39,4 +40,26 @@ namespace ONNXIR
private:
TypesWrapper() = default;
};
// Singleton class used to help initialize static objects related to type strings.
// This is not strictly needed but allows common rich type strings to be defined here along
// side the data type strings above in TypesWrapper.
class TypeStringsInitializer
{
public:
static TypeStringsInitializer& InitializeTypeStrings();
~TypeStringsInitializer() = default;
TypeStringsInitializer(const TypeStringsInitializer&) = delete;
void operator=(const TypeStringsInitializer&) = delete;
private:
TypeStringsInitializer();
// Common string representations of TypeProto. These are used to pre-initialize
// typeStringToProto map. Note: some of these strings may have already been initialized in
// the map via op registration depending on static initialization order.
const std::vector<std::string> m_commonTypeStrings = { "tensor(float16)", "tensor(float)",
"tensor(double)", "tensor(int8)", "tensor(int16)", "tensor(int32)",
"tensor(int64)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)",
"tensor(uint64)", "tensor(complex64)", "tensor(complex128)", "tensor(string)",
"tensor(bool)" };
};
}
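TypeStringsInitializer pairs the Meyers singleton with a namespace-scope reference (the _typeStrings definition in constants.cpp above) so the common type strings are registered during static initialization even if nothing else calls into the wrapper first. A standalone sketch of that trigger idiom, with illustrative names:

    #include <string>
    #include <unordered_map>

    std::unordered_map<std::string, int>& Cache()
    {
        static std::unordered_map<std::string, int> cache; // Meyers singleton
        return cache;
    }

    struct CacheSeeder
    {
        CacheSeeder() { Cache().emplace("seeded", 1); } // pre-populate on construction
        static CacheSeeder& Instance()
        {
            static CacheSeeder seeder;
            return seeder;
        }
    };

    // Namespace-scope reference: evaluating it at static-initialization time
    // forces the seeding, mirroring the _typeStrings definition above.
    static CacheSeeder& s_seeder = CacheSeeder::Instance();

As the comment in the commit notes, registrations from other translation units may run first, so the seeding must tolerate entries that already exist.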

View file

@ -1,9 +1,7 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456)
#include <fstream>
#include <iostream>
#include <numeric>
#include <stack>
#include "graph.h"
#include "op.h"
@ -13,10 +11,20 @@ using namespace ONNXIR::Utils;
namespace ONNXIR
{
#define NO_CHANGE_ON_SYNC_FLAG(...) \
do { \
bool syncNeeded = m_graphProtoSyncNeeded; \
{__VA_ARGS__;} \
m_graphProtoSyncNeeded = syncNeeded; \
} while (0) \
NodeArg::NodeArg(const std::string& p_name,
const TypeProto* p_nodeArgType)
{
m_nodeArgInfo.set_name(p_name);
// If the name is empty, it means the arg does not exist.
m_exist = !(p_name.empty());
if (nullptr != p_nodeArgType)
{
(*m_nodeArgInfo.mutable_type()) = *p_nodeArgType;
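The NO_CHANGE_ON_SYNC_FLAG macro introduced above lets Resolve-time verification run code that would otherwise mark the graph as needing a proto re-sync, then restores the flag. A sketch of what its one use later in this file (wrapping InferAndVerifyTypeMatch) expands to:

    // Sketch of the expansion; the do { } while (0) wrapper makes the macro
    // behave as a single statement, e.g. after an unbraced if.
    do {
        bool syncNeeded = m_graphProtoSyncNeeded;  // capture the current flag
        { RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, &op, p_outputArgs)); }
        m_graphProtoSyncNeeded = syncNeeded;       // restore: no spurious re-sync
    } while (0);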
@ -48,9 +56,7 @@ namespace ONNXIR
return &(m_nodeArgInfo.type().tensor_type().shape());
case ONNXIR::TypeProto::kSparseTensorType:
return &(m_nodeArgInfo.type().sparse_tensor_type().shape());
case ONNXIR::TypeProto::kHandleType:
case ONNXIR::TypeProto::kTupleType:
case ONNXIR::TypeProto::kSeqType:
case ONNXIR::TypeProto::kSequenceType:
case ONNXIR::TypeProto::kMapType:
case ONNXIR::TypeProto::VALUE_NOT_SET:
default:
@ -74,9 +80,7 @@ namespace ONNXIR
case ONNXIR::TypeProto::kSparseTensorType:
*(m_nodeArgInfo.mutable_type()->mutable_sparse_tensor_type()->mutable_shape()) = p_shape;
break;
case ONNXIR::TypeProto::kHandleType:
case ONNXIR::TypeProto::kTupleType:
case ONNXIR::TypeProto::kSeqType:
case ONNXIR::TypeProto::kSequenceType:
case ONNXIR::TypeProto::kMapType:
case ONNXIR::TypeProto::VALUE_NOT_SET:
default:
@ -107,6 +111,11 @@ namespace ONNXIR
*(m_nodeArgInfo.mutable_type()) = p_typeProto;
}
bool NodeArg::Exist() const
{
return m_exist;
}
Function::Function(Node* p_node,
const FunctionDefProto& p_funcProto)
{
@ -301,19 +310,6 @@ namespace ONNXIR
// Set doc string.
p_proto.set_doc_string(m_description);
// Set control inputs.
p_proto.clear_control_input();
for (auto& control_input : m_controlInputs)
{
if (0 == m_graph->SourceNode()->Name().compare(control_input))
{
// Source node will be removed during serialization.
continue;
}
*p_proto.add_control_input() = control_input;
}
// Set attributes.
p_proto.clear_attribute();
for (auto attribute : m_attributes)
@ -330,13 +326,6 @@ namespace ONNXIR
*input = inputDef.Name();
}
// Set input arg count.
p_proto.clear_input_arg_count();
for (auto argCount : m_inputArgCount)
{
*(p_proto.mutable_input_arg_count()->Add()) = argCount;
}
// Set outputs' definitions.
p_proto.clear_output();
for (auto& outputDef : m_outputDefs)
@ -367,18 +356,11 @@ namespace ONNXIR
m_inputDefs.push_back(NodeArg(p_nodeProto.input(i), type));
}
for (auto argCount : p_nodeProto.input_arg_count())
{
m_inputArgCount.push_back(argCount);
}
if (0 == m_inputArgCount.size())
{
// Input arg count information is not provided in proto file.
// Set each arg count as 1 by default.
// It could be adjusted when resolving the node with its operator
// information.
// Set input arg count as 1:1 maping with input defs.
// NOTE: it may be refined per operator definition.
// There will be cases having arg count as, 1, 1, ..., 1, N.
// It means that the last operator input is variadic.
m_inputArgCount.assign(m_inputDefs.size(), 1);
}
for (int i = 0; i < p_nodeProto.output().size(); ++i)
{
@ -395,11 +377,6 @@ namespace ONNXIR
m_outputDefs.push_back(NodeArg(p_nodeProto.output(i), type));
}
for (auto control_input : p_nodeProto.control_input())
{
m_controlInputs.insert(control_input);
}
for (int i = 0; i < p_nodeProto.attribute_size(); ++i)
{
auto& attr = p_nodeProto.attribute(i);
@ -534,16 +511,12 @@ namespace ONNXIR
ADD_BASIC_ATTR_IMPL(int64_t, i)
ADD_BASIC_ATTR_IMPL(std::string, s)
ADD_ATTR_IMPL(TensorProto, t)
ADD_ATTR_IMPL(TypeProto::TensorShapeProto, shape)
ADD_ATTR_IMPL(GraphProto, g)
ADD_ATTR_IMPL(TypeProto, type)
ADD_LIST_ATTR_IMPL(float, floats)
ADD_LIST_ATTR_IMPL(int64_t, ints)
ADD_LIST_ATTR_IMPL(std::string, strings)
ADD_LIST_ATTR_IMPL(TensorProto, tensors)
ADD_LIST_ATTR_IMPL(TypeProto::TensorShapeProto, shapes)
ADD_LIST_ATTR_IMPL(GraphProto, graphs)
ADD_LIST_ATTR_IMPL(TypeProto, types)
bool Node::ClearAttribute(const std::string& p_attrName)
{
@ -591,25 +564,13 @@ namespace ONNXIR
Graph::Graph(const GraphProto& p_graphProto)
: m_graphProto(p_graphProto),
m_graphProtoSyncNeeded(false),
m_graphResolveNeeded(true)
m_graphResolveNeeded(true),
m_numOfNodes(0)
{
// This is a main graph, and strict type checking needed..
m_graphType |= Type::Main;
if (m_graphProto.node_size() > 0
&& m_graphProto.node(0).input_arg_count_size() > 0)
{
// The condition above now is used to judge
// whether 1) node input arg count is specified or not,
// to determin whether strict type checking needed or not.
m_graphType |= Type::Strict;
}
// Copy function definitions to a map.
for (auto funcDef : p_graphProto.function())
{
m_funcDefMap[funcDef.name()] = funcDef;
}
// TODO: add Type::Strict back.
// Copy initial tensors to a map.
for (auto tensor : p_graphProto.initializer())
@ -649,13 +610,13 @@ namespace ONNXIR
{
AddNode(nodeProto, nameToTypeMap);
}
}
Graph::Graph(Node* p_node,
const FunctionDefProto& p_functionProto)
: m_graphProtoSyncNeeded(false),
m_graphResolveNeeded(true)
m_graphResolveNeeded(true),
m_numOfNodes(0)
{
// This is a function (subgraph).
m_graphType |= (Type::Main | Type::Strict);
@ -690,7 +651,8 @@ namespace ONNXIR
Graph::Graph(const std::string& p_name, bool p_isONNX)
: m_graphProtoSyncNeeded(false),
m_graphResolveNeeded(true)
m_graphResolveNeeded(true),
m_numOfNodes(0)
{
m_graphProto.set_name(p_name);
m_graphType |= Type::Main;
@ -705,7 +667,8 @@ namespace ONNXIR
Graph::Graph(const std::string& p_name,
const std::string& p_docString)
: m_graphProtoSyncNeeded(false),
m_graphResolveNeeded(true)
m_graphResolveNeeded(true),
m_numOfNodes(0)
{
m_graphProto.set_name(p_name);
m_graphProto.set_doc_string(p_docString);
@ -732,20 +695,22 @@ namespace ONNXIR
&& p_nodeNameToIndex.end() != p_nodeNameToIndex.find(nodeName))
{
// The node has name and its name was used by another node.
Status status(false,
Status status(ONNX,
FAIL,
"Error: two nodes with same node name (" + nodeName + ").");
return status;
}
p_nodeNameToIndex[nodeName] = (*nodeIter)->Index();
// Verify node outputs' name should be unique.
for (auto& outputDef : (*nodeIter)->Mutable_OutputDefs())
for (auto& outputDef : (*nodeIter)->OutputDefs())
{
std::string outputArgname = outputDef.Name();
if (p_outputArgs.end() != p_outputArgs.find(outputArgname))
{
// Two outputs with same name.
Status status(false,
Status status(ONNX,
FAIL,
"Error: two output args with same name ("
+ outputArgname + ").");
return status;
@ -790,7 +755,7 @@ namespace ONNXIR
auto nameToIndexIter = p_nodeNameToIndex.find(controlInput);
if (p_nodeNameToIndex.end() == nameToIndexIter)
{
Status status(false,
Status status(ONNX, FAIL,
"The control input (" + controlInput + ") of Node ("
+ (*nodeIter)->Name() + ") does not exist in the graph.");
return status;
@ -811,6 +776,12 @@ namespace ONNXIR
for (auto& inputArg : inputArgs)
{
if (!inputArg.Exist())
{
// This input could be optional and it does not exist in this case.
continue;
}
auto outputArgIter = p_outputArgs.find(inputArg.Name());
if (p_outputArgs.end()
== outputArgIter)
@ -876,45 +847,74 @@ namespace ONNXIR
Status Graph::CheckIsAcyclic(
std::vector<NODEINDEX>& p_nodesInTopologicalOrder)
{
p_nodesInTopologicalOrder.clear();
// nodes that have been processed and added to p_nodesInTopologicalOrder.
std::unordered_set<NODEINDEX> visitedNodes;
std::unordered_set<NODEINDEX> ancestorNodes;
return DepthFirstAccess(ancestorNodes,
m_sinkNodeIndex,
visitedNodes,
p_nodesInTopologicalOrder);
// tracks nodes whose child nodes have been processed.
std::unordered_set<NODEINDEX> childrenVisitedNodes;
std::stack<NODEINDEX> stack;
stack.push(m_sinkNodeIndex);
while (!stack.empty())
{
NODEINDEX current = stack.top();
stack.pop();
if (visitedNodes.end() != visitedNodes.find(current))
{
// The node has been visited before
continue;
}
Status Graph::DepthFirstAccess(std::unordered_set<NODEINDEX> p_ancestors,
NODEINDEX p_current,
std::unordered_set<NODEINDEX>& p_visitedNodes,
std::vector<NODEINDEX>& p_nodesInTopologicalOrder)
if (childrenVisitedNodes.end() != childrenVisitedNodes.find(current))
{
if (p_visitedNodes.end() != p_visitedNodes.find(p_current))
{
// The node has been visited before.
return Status::OK();
// children are done so we mark this one complete.
visitedNodes.insert(current);
p_nodesInTopologicalOrder.push_back(current);
ancestorNodes.erase(current);
continue;
}
p_ancestors.insert(p_current);
for (auto iter = m_nodes[p_current]->InputNodes_begin();
iter != m_nodes[p_current]->InputNodes_end();
if (m_nodes[current]->InputNodes_begin() ==
m_nodes[current]->InputNodes_end())
{
// no children
childrenVisitedNodes.insert(current);
visitedNodes.insert(current);
p_nodesInTopologicalOrder.push_back(current);
ancestorNodes.erase(current);
continue;
}
stack.push(current);
// mark as children done. by the time the node is popped off the stack again,
// its children will have been processed
childrenVisitedNodes.insert(current);
ancestorNodes.insert(current);
// check children
for (auto iter = m_nodes[current]->InputNodes_begin();
iter != m_nodes[current]->InputNodes_end();
++iter)
{
if (p_ancestors.end() != p_ancestors.find((*iter)->Index()))
NODEINDEX idx = (*iter)->Index();
if (ancestorNodes.end() != ancestorNodes.find(idx))
{
Status status(false,
Status status(ONNX, FAIL,
"Error: the graph is not acyclic.");
return status;
}
RETURN_IF_ERROR(DepthFirstAccess(p_ancestors,
(*iter)->Index(),
p_visitedNodes,
p_nodesInTopologicalOrder));
// avoid re-processing nodes
if (childrenVisitedNodes.end() == childrenVisitedNodes.find(idx))
{
stack.push(idx);
}
}
}
p_visitedNodes.insert(p_current);
p_nodesInTopologicalOrder.push_back(p_current);
return Status::OK();
}
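The recursive DepthFirstAccess is replaced above by an explicit-stack DFS: a node is pushed once to expand its children and emitted to the topological order only when popped again with all children done; meeting a child still in the ancestor set means a back edge, i.e. a cycle. A self-contained sketch of the same scheme over a plain adjacency list:

    #include <cstddef>
    #include <stack>
    #include <unordered_set>
    #include <vector>

    // Sketch only: returns false on a cycle, otherwise fills p_order with a
    // post-order (topological) ordering of the nodes reachable from p_root.
    bool TopoSortFrom(const std::vector<std::vector<size_t>>& p_children,
                      size_t p_root,
                      std::vector<size_t>& p_order)
    {
        std::unordered_set<size_t> visited;    // emitted to p_order
        std::unordered_set<size_t> expanded;   // children already pushed
        std::unordered_set<size_t> ancestors;  // on the current DFS path
        std::stack<size_t> stack;
        stack.push(p_root);
        while (!stack.empty())
        {
            size_t current = stack.top();
            stack.pop();
            if (visited.count(current) != 0)
                continue;                      // finished earlier via another path
            if (expanded.count(current) != 0)
            {
                visited.insert(current);       // all children done: emit post-order
                p_order.push_back(current);
                ancestors.erase(current);
                continue;
            }
            stack.push(current);               // revisit after the children
            expanded.insert(current);
            ancestors.insert(current);
            for (size_t child : p_children[current])
            {
                if (ancestors.count(child) != 0)
                    return false;              // back edge: graph is cyclic
                if (expanded.count(child) == 0)
                    stack.push(child);
            }
        }
        return true;
    }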
@ -960,12 +960,20 @@ namespace ONNXIR
initialTensorType.mutable_tensor_type()->set_elem_type(
initialTensorIter->second.data_type());
inputDef.SetType(OpUtils::ToType(initialTensorType));
// Set shape accordingly.
TypeProto_TensorShapeProto shape;
for (auto dim : initialTensorIter->second.dims())
{
shape.add_dim()->set_dim_value(dim);
}
inputDef.SetShape(shape);
}
else if (!inputDef.m_nodeArgInfo.has_type())
{
// This input is fed by callers and its type has to be specified.
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") input arg ("
+ inputDef.Name()
+ ") does not have type information.");
@ -989,7 +997,7 @@ namespace ONNXIR
auto iter = opFormalParameter.GetTypes().find(inputDef.Type());
if (opFormalParameter.GetTypes().end() == iter)
{
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") input arg ("
+ inputDef.Name() + ") type does not match operator ("
+ p_op->GetName() + ") definition.");
@ -1003,12 +1011,15 @@ namespace ONNXIR
= inputDef.Type();
}
else if (paramToTypeIter->second != inputDef.Type())
else if (paramToTypeIter->second != inputDef.Type() && argCount == 1)
{
// This is the case.
// An operator's inputs' type is "T", and T"s allowed value set is "float, int32".
// However, one input is specified as "float", and another one is specified as "int".
Status status(false,
// NOTE: for variadic arguments (argCount > 1), this verification rule is not applicable.
// Different types are allowed for variadic arguments although there's only one type "T"
// specified in op definition.
Status status(ONNX, FAIL,
"Node (" + nodeName + ") has different input"
" types (" + *(paramToTypeIter->second) + ","
+ *(inputDef.Type()) + ") matching to same "
@ -1047,7 +1058,7 @@ namespace ONNXIR
= p_node->GetAttributes().find(c_constantValue);
if (p_node->GetAttributes().end() == nodeAttributesIter)
{
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") output arg value should"
"be specified via node attribute '" + c_constantValue + "'.");
return status;
@ -1055,7 +1066,7 @@ namespace ONNXIR
AttrType attrType;
RETURN_IF_ERROR(TypeUtils::GetType(nodeAttributesIter->second, attrType));
if (AttrType::TENSOR == attrType)
if (AttrType::AttributeProto_AttributeType_TENSOR == attrType)
{
auto& tensor = nodeAttributesIter->second.t();
TypeProto typeProto;
@ -1064,7 +1075,7 @@ namespace ONNXIR
}
else
{
Status status(false,
Status status(ONNX, FAIL,
"For attribute " + c_constantValue + " , only Tensor type"
"is allowed. The attribute type in this model is "
+ ONNXIR::c_attrTypeStr[(int)attrType] + ".");
@ -1082,7 +1093,7 @@ namespace ONNXIR
auto iter = opFormalParameter.GetTypes().find(outputDef.Type());
if (opFormalParameter.GetTypes().end() == iter)
{
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") output arg ("
+ outputDef.Name() + ") type does not match operator ("
+ p_op->GetName() + ") definition.");
@ -1103,7 +1114,7 @@ namespace ONNXIR
// Output arg has no type information, and there're
// multiple allowed types defined in operator definition.
// Type inference fails in this case.
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") output arg ("
+ outputDef.Name() + ") type inference failed");
return status;
@ -1137,27 +1148,16 @@ namespace ONNXIR
// The node refers to a primitive operator.
// Infer and verify node input arg type information.
size_t totalArgCount = std::accumulate(node->InputArgCount().begin(),
auto totalArgCount = std::accumulate(node->InputArgCount().begin(),
node->InputArgCount().end(), 0);
if (totalArgCount != node->InputDefs().size())
{
Status status(false,
Status status(ONNX, FAIL,
"The sum of input arg count is not equal to size of"
"input defs in node (" + nodeName + ").");
return status;
}
if (totalArgCount > op.GetOnnxMaxInput() ||
totalArgCount < op.GetOnnxMinInput() ||
!op.GetOnnxNumInputsAllowedFunc()(totalArgCount))
{
// Number of inputs do not match.
Status status(false, "Error: node (" + nodeName
+ ")'s number of inputs do not match its operator ("
+ op_type + ") specification.");
return status;
}
// Verify size of node arg count is same as input number in
// operator definition.
if (op.GetInputs().size() != node->InputArgCount().size())
@ -1202,7 +1202,7 @@ namespace ONNXIR
else
{
// Number of inputs do not match.
Status status(false, "Error: node (" + nodeName
Status status(ONNX, FAIL, "Error: node (" + nodeName
+ ")'s number of inputs do not match its operator ("
+ op_type + ") specification.");
return status;
@ -1210,29 +1210,14 @@ namespace ONNXIR
}
// Verify node outputs have same size with operator definition.
size_t outputCount = node->OutputDefs().size();
if (op.GetOutputs().size() != node->OutputDefs().size())
{
if (0 != (m_graphType & Type::Strict))
{
// Number of outputs do not match.
Status status(false, "Error: node (" + nodeName
Status status(ONNX, FAIL, "Error: node (" + nodeName
+ ")'s number of outputs does not match its operator ("
+ op_type + ") specification.");
if (0 == (m_graphType & Type::Strict))
{
// It's ONNX case.
// TODO: more understanding is still needed about ONNX
// on how to distributing the output args to output formal
// parameter (same as input?)
if (outputCount > op.GetOnnxMaxOutput() ||
outputCount < op.GetOnnxMinOutput() ||
!op.GetOnnxNumOutputsAllowedFunc()(outputCount))
{
return status;
}
}
else
{
return status;
}
}
@ -1240,8 +1225,7 @@ namespace ONNXIR
if (0 != (m_graphType & Type::Strict))
{
// Strict type checking needed.
RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, &op, p_outputArgs));
NO_CHANGE_ON_SYNC_FLAG(RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, &op, p_outputArgs)));
}
// Attribute verification and fill node attribute with
@ -1279,7 +1263,7 @@ namespace ONNXIR
RETURN_IF_ERROR(TypeUtils::GetType(nodeAttrIter->second, nodeAttrType));
if (nodeAttrType != attrDef.GetType())
{
Status status(false,
Status status(ONNX, FAIL,
"Node (" + nodeName + ") attribute ("
+ nodeAttrIter->first + ") type does not match operator definition.");
return status;
@ -1294,9 +1278,9 @@ namespace ONNXIR
if (m_funcDefMap.end() == funcIter)
{
// A op_type refers to nothing.
Status status(false,
Status status(ONNX, FAIL,
"Error: the operator or function (" + op_type
+ ") referred by node (" + nodeName
+ ") refered by node (" + nodeName
+ ") does not exist.");
return status;
}
@ -1309,7 +1293,7 @@ namespace ONNXIR
!= node->InputDefs().size())
{
// Number of inputs do not match.
Status status(false, "Error: node (" + nodeName
Status status(ONNX, FAIL, "Error: node (" + nodeName
+ ")'s number of inputs do not match its function ("
+ op_type + ") specification.");
return status;
@ -1320,7 +1304,7 @@ namespace ONNXIR
!= node->OutputDefs().size())
{
// Number of outputs do not match.
Status status(false, "Error: node (" + nodeName
Status status(ONNX, FAIL, "Error: node (" + nodeName
+ ")'s number of outputs do not match its function ("
+ op_type + ") specification.");
return status;
@ -1342,16 +1326,14 @@ namespace ONNXIR
std::unordered_map<std::string, NODEINDEX> nodeNameToIndex;
RETURN_IF_ERROR(VerifyNoDuplicateName(outputArgs, nodeNameToIndex));
RETURN_IF_ERROR(BuildConnections(outputArgs, nodeNameToIndex));
RETURN_IF_ERROR(CheckIsAcyclic(m_nodesInTopologicalOrder));
/* TODO: Uncomment
std::set<std::string> funcDefNames;
RETURN_IF_ERROR(VerifyNodeAndOpMatch(m_nodesInTopologicalOrder,
outputArgs,
funcDefNames));
CleanFunctionDefMap(funcDefNames);
*/
SetGraphInputsOutputs();
m_graphResolveNeeded = false;
return Status::OK();
@ -1421,6 +1403,21 @@ namespace ONNXIR
return m_nameToInitialTensor;
}
const std::vector<const NodeArg*>& Graph::GetInputs() const
{
return m_graphInputs;
}
const std::vector<const NodeArg*>& Graph::GetOutputs() const
{
return m_graphOutputs;
}
const std::vector<const NodeArg*>& Graph::GetValueInfo() const
{
return m_valueInfo;
}
bool Graph::AddFunctionDef(const FunctionDefProto& p_funcDef)
{
auto funcDefName = p_funcDef.name();
@ -1489,6 +1486,10 @@ namespace ONNXIR
{
auto node = AllocateNode();
node->Init(p_name, p_opType, p_description, p_inputArgs, p_outputArgs);
if (0 != p_opType.compare(c_noOp))
{
m_graphProtoSyncNeeded = true;
}
return node;
}
@ -1506,6 +1507,7 @@ namespace ONNXIR
p_inputArgs,
p_inputArgCount,
p_outputArgs);
m_graphProtoSyncNeeded = true;
return node;
}
@ -1519,6 +1521,7 @@ namespace ONNXIR
p_opType,
p_description,
p_outputArgs);
m_graphProtoSyncNeeded = true;
return node;
}
@ -1526,6 +1529,7 @@ namespace ONNXIR
{
auto node = AllocateNode();
*node = p_other;
m_graphProtoSyncNeeded = true;
return node;
}
@ -1568,8 +1572,12 @@ namespace ONNXIR
m_nodes[p_dstNodeIndex]->
m_controlInputs.insert(m_nodes[p_srcNodeIndex]->Name());
if (!IsSourceNode(p_srcNodeIndex)
&& !IsSinkNode(p_dstNodeIndex))
{
m_graphProtoSyncNeeded = true;
m_graphResolveNeeded = true;
}
return true;
}
@ -1614,24 +1622,17 @@ namespace ONNXIR
// Nodes.
m_graphProto.clear_node();
for (auto& node : m_nodes)
// Nodes must be sorted in Topological Order in the GraphProto per ONNX spec.
for (auto& nodeIdx : m_nodesInTopologicalOrder)
{
if (nullptr == node
|| IsSourceNode(node->Index())
|| IsSinkNode(node->Index()))
if (IsSourceNode(nodeIdx)
|| IsSinkNode(nodeIdx))
{
continue;
}
auto nodeProto = m_graphProto.add_node();
node->ToProto(*nodeProto);
}
// Functions.
m_graphProto.clear_function();
for (auto& func : m_funcDefMap)
{
auto funcDef = m_graphProto.add_function();
(*funcDef) = func.second;
m_nodes[nodeIdx]->ToProto(*nodeProto);
}
// Initial tensors;
@ -1642,37 +1643,56 @@ namespace ONNXIR
*tensor = item.second;
}
// Set graph inputs/outputs.
// Set graph value_info.
SetGraphInputsOutputs();
// Sync graph inputs/outputs/valueInfo.
SyncGraphInputsOutputs();
m_graphProtoSyncNeeded = false;
return m_graphProto;
}
void Graph::SetGraphInputsOutputs()
void Graph::SyncGraphInputsOutputs()
{
m_graphProto.clear_input();
m_graphProto.clear_output();
m_graphProto.clear_value_info();
std::unordered_map<std::string, Node::EdgeEnd> allOutputArgs;
for (auto inputArg : m_graphInputs)
{
*(m_graphProto.mutable_input()->Add()) = inputArg->ToProto();
}
for (auto outputArg : m_graphOutputs)
{
*(m_graphProto.mutable_output()->Add()) = outputArg->ToProto();
}
for (auto valueInfo : m_valueInfo)
{
*(m_graphProto.mutable_value_info()->Add()) = valueInfo->ToProto();
}
}
void Graph::SetGraphInputsOutputs()
{
// Reset graphInputs/graphOutputs/valueInfo state.
m_graphInputs.clear();
m_graphOutputs.clear();
m_valueInfo.clear();
std::unordered_map<std::string, const NodeArg*> outputNameToNodeArg;
for (auto nodeIter = Nodes_begin();
nodeIter != Nodes_end();
++nodeIter)
{
for (auto& outputDef : (*nodeIter)->OutputDefs())
{
auto& outputArgname = outputDef.Name();
allOutputArgs.insert(
{ outputArgname, Node::EdgeEnd(*(*nodeIter), outputDef) });
outputNameToNodeArg.insert({ outputDef.Name(), &outputDef });
}
}
// Init graph output args with all node output args.
auto graphOutputArgs = allOutputArgs;
auto graphOutputArgs = outputNameToNodeArg;
std::unordered_set<Node*> innerNodes;
for (auto nodeIter = Nodes_begin();
@ -1688,14 +1708,14 @@ namespace ONNXIR
// Go thru all node's inputs.
for (auto& inputArg : (*nodeIter)->InputDefs())
{
auto outputArgIter = allOutputArgs.find(inputArg.Name());
if (allOutputArgs.end()
auto outputArgIter = outputNameToNodeArg.find(inputArg.Name());
if (outputNameToNodeArg.end()
== outputArgIter)
{
// No such outputArg matching this inputArg.
// This input arg should be fed when running evaluation.
// it should be a graph input or initializer (say, weight).
*(m_graphProto.mutable_input()->Add()) = inputArg.ToProto();
m_graphInputs.push_back(&inputArg);
continue;
}
@ -1703,7 +1723,7 @@ namespace ONNXIR
// feeding another node as the node's input.
if (graphOutputArgs.erase(outputArgIter->first) >= 1)
{
*(m_graphProto.mutable_value_info()->Add()) = inputArg.ToProto();
m_valueInfo.push_back(&inputArg);
}
}
}
@ -1711,7 +1731,7 @@ namespace ONNXIR
// Set graph outputs.
for (auto& outputArg : graphOutputArgs)
{
*(m_graphProto.mutable_output()->Add()) = outputArg.second.GetNodeArg()->ToProto();
m_graphOutputs.push_back(outputArg.second);
}
}
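The rewritten SetGraphInputsOutputs classifies every NodeArg name by who produces and who consumes it: unproduced inputs become graph inputs, unconsumed outputs become graph outputs, and names produced and consumed internally land in value_info. A standalone sketch of that classification over plain name sets (GraphIo and Classify are illustrative names):

    #include <string>
    #include <unordered_set>
    #include <vector>

    struct GraphIo
    {
        std::vector<std::string> inputs;     // fed from outside the graph
        std::vector<std::string> outputs;    // produced but never consumed
        std::vector<std::string> valueInfo;  // produced and consumed internally
    };

    // Sketch only: classifies arg names the way SetGraphInputsOutputs does.
    GraphIo Classify(const std::vector<std::string>& p_produced,
                     const std::vector<std::string>& p_consumed)
    {
        std::unordered_set<std::string> produced(p_produced.begin(), p_produced.end());
        std::unordered_set<std::string> candidateOutputs = produced;
        std::unordered_set<std::string> seenInputs;
        GraphIo io;
        for (const auto& name : p_consumed)
        {
            if (produced.count(name) == 0)
            {
                if (seenInputs.insert(name).second)
                    io.inputs.push_back(name);  // graph input (or initializer)
            }
            else if (candidateOutputs.erase(name) > 0)
            {
                io.valueInfo.push_back(name);   // intermediate value
            }
        }
        io.outputs.assign(candidateOutputs.begin(), candidateOutputs.end());
        return io;
    }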
@ -1756,7 +1776,6 @@ namespace ONNXIR
std::unique_ptr<Node> node(new Node(MaxNodeIndex(), this));
m_nodes.push_back(std::move(node));
m_numOfNodes++;
m_graphProtoSyncNeeded = true;
m_graphResolveNeeded = true;
return m_nodes.back().get();
}
@ -1769,5 +1788,3 @@ namespace ONNXIR
m_graphResolveNeeded = true;
}
}
#pragma warning(pop)

View file

@ -1,6 +1,3 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456)
#ifndef CORE_GRAPH_GRAPH_H
#define CORE_GRAPH_GRAPH_H
@ -9,10 +6,17 @@
#include <unordered_set>
#include "constants.h"
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456 4189 4996)
#include "proto/onnx/protobuf/graph.pb.h"
#pragma warning(pop)
#include "status.h"
#include "utils.h"
using namespace ONNXIR::Common;
namespace ONNXIR
{
typedef size_t NODEINDEX;
@ -70,6 +74,11 @@ namespace ONNXIR
// Get node arg info proto.
const NodeArgInfo& ToProto() const;
// Indicates whether <*this> node arg exists or not.
// Optional inputs are allowed in ONNX. Empty arg name represents
// a non-existing input argument.
bool Exist() const;
private:
friend class Node;
@ -83,6 +92,9 @@ namespace ONNXIR
// Node arg name, type and shape.
NodeArgInfo m_nodeArgInfo;
// Flag indicates whether <*this> node arg exists or not.
bool m_exist;
};
// Function representation.
@ -239,6 +251,9 @@ namespace ONNXIR
ADD_ATTR_INTERFACES(TypeProto)
ADD_ATTR_INTERFACES(TypeProto::TensorShapeProto)
// ValueProto doesn't have a vector interface
bool AddAttribute(const std::string& p_attrName, const ValueProto& p_value);
// Clear specified node attribute.
bool ClearAttribute(const std::string& p_attrName);
@ -392,7 +407,7 @@ namespace ONNXIR
// a. Node name and node output's names should be unique.
// b. Attribute match between node and op definition.
// c. Input/Output match between node and op definition.
// d. Graph is acyclic.
// d. Graph is acyclic and sort nodes in topological order.
// 2. Check & Setup inner nodes' dependency.
// 3. Cleanup function definition lists.
// Returns resolving status.
@ -409,6 +424,11 @@ namespace ONNXIR
TensorProto& p_value) const;
const InitialTensorSet& GetAllInitialTensors() const;
// Get graph inputs/outputs.
const std::vector<const NodeArg*>& GetInputs() const;
const std::vector<const NodeArg*>& GetOutputs() const;
const std::vector<const NodeArg*>& GetValueInfo() const;
// Add or Remove a function definition.
bool AddFunctionDef(const FunctionDefProto& p_function);
void RemoveFunctionDef(const std::string& p_functionName);
@ -522,17 +542,6 @@ namespace ONNXIR
Status CheckIsAcyclic(
/*out*/std::vector<NODEINDEX>& p_nodesInToplogicalOrder);
// Depth-first graph access.
// <p_ancestors> specifies all ancestor nodes of <p_current> node.
// <p_current> specifies current node being accessed.
// <p_visitedNodes> specifies nodes already visited.
// <p_nodesInToplogicalOrder> returns nodes' indexes in toplogical
// order if the graph is acyclic.
Status DepthFirstAccess(std::unordered_set<NODEINDEX> p_ancestors,
NODEINDEX p_current,
/*in | out*/std::unordered_set<NODEINDEX>& p_visitedNodes,
/*out*/std::vector<NODEINDEX>& p_nodesInToplogicalOrder);
// Given nodes in toplogical order, infer and set type information
// across <*this> graph if needed, and verify type/attribute
// information match between node and op.
@ -546,15 +555,18 @@ namespace ONNXIR
const std::unordered_map<std::string, Node::EdgeEnd>& p_outputArgs);
// Clean function definition map.
// Remove function definitions not referred by any node.
// Remove function definitions not refered by any node.
void CleanFunctionDefMap(const std::set<std::string>& p_funcDefNames);
// Add source/sink nodes to <*this> graph.
void AddSourceSinkNodes();
// Set graph inputs/outputs when serializing to proto.
// Set graph inputs/outputs when resolving a graph..
void SetGraphInputsOutputs();
// Sync graph inputs/outputs when serializing to proto.
void SyncGraphInputsOutputs();
// Graph nodes.
// Element in <m_nodes> may be nullptr due to graph optimization.
std::vector<std::unique_ptr<Node>> m_nodes;
@ -594,11 +606,18 @@ namespace ONNXIR
int m_graphType = 0;
// the topologic order of node index
// The topologic order of node index.
std::vector<NODEINDEX> m_nodesInTopologicalOrder;
// Graph inputs.
std::vector<const NodeArg*> m_graphInputs;
// Graph outputs.
std::vector<const NodeArg*> m_graphOutputs;
// Graph value_info.
std::vector<const NodeArg*> m_valueInfo;
};
}
#endif // CORE_GRAPH_GRAPH_H
#pragma warning(pop)

View file

@ -1,9 +1,14 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456 4189 4996)
#include <fcntl.h>
#include <fstream>
#ifdef _MSC_VER
#pragma warning(push)
// 'type' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning(disable: 4800)
#endif
#include <google/protobuf/io/coded_stream.h>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#include <google/protobuf/io/zero_copy_stream_impl.h>
#ifdef _WIN32
#include <io.h>
@ -16,41 +21,68 @@
namespace
{
#ifdef _WIN32
inline int FileOpenRd(const std::wstring& p_path)
inline Status FileOpenRd(const std::wstring& p_path, /*out*/ int* p_fd)
{
int fd = -1;
bool err = _wsopen_s(&fd, p_path.c_str(), _O_RDONLY | _O_SEQUENTIAL | _O_BINARY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
return fd;
_wsopen_s(p_fd, p_path.c_str(), _O_RDONLY | _O_SEQUENTIAL | _O_BINARY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
if (0 > *p_fd)
{
return Status(SYSTEM, errno);
}
return Status::OK();
}
inline int FileOpenWr(const std::wstring& p_path)
inline Status FileOpenWr(const std::wstring& p_path, /*out*/ int* p_fd)
{
int fd = -1;
_wsopen_s(&fd, p_path.c_str(), _O_CREAT | _O_SEQUENTIAL | _O_BINARY | _O_WRONLY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
return fd;
_wsopen_s(p_fd, p_path.c_str(), _O_CREAT | _O_SEQUENTIAL | _O_BINARY | _O_WRONLY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
if (0 > *p_fd)
{
return Status(SYSTEM, errno);
}
return Status::OK();
}
#endif
inline int FileOpenRd(const std::string& p_path)
inline Status FileOpenRd(const std::string& p_path, /*out*/ int* p_fd)
{
#ifdef _WIN32
int fd = -1;
_sopen_s(&fd, p_path.c_str(), _O_RDONLY | _O_SEQUENTIAL | _O_BINARY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
return fd;
_sopen_s(p_fd, p_path.c_str(), _O_RDONLY | _O_SEQUENTIAL | _O_BINARY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
#else
return open(p_path.c_str(), O_RDONLY);
*p_fd = open(p_path.c_str(), O_RDONLY);
#endif
if (0 > *p_fd)
{
return Status(SYSTEM, errno);
}
return Status::OK();
}
inline int FileOpenWr(const std::string& p_path)
inline Status FileOpenWr(const std::string& p_path, /*out*/ int* p_fd)
{
#ifdef _WIN32
int fd = -1;
_sopen_s(&fd, p_path.c_str(), _O_CREAT | _O_SEQUENTIAL | _O_BINARY | _O_WRONLY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
return fd;
_sopen_s(p_fd, p_path.c_str(), _O_CREAT | _O_SEQUENTIAL | _O_BINARY | _O_WRONLY, _SH_DENYWR, _S_IREAD | _S_IWRITE);
#else
return open(p_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
*p_fd = open(p_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
#endif
if (0 > *p_fd)
{
return Status(SYSTEM, errno);
}
return Status::OK();
}
inline Status FileClose(int fd)
{
int ret = 0;
#ifdef _WIN32
ret = _close(fd);
#else
ret = close(fd);
#endif
if (0 != ret)
{
return Status(SYSTEM, errno);
}
return Status::OK();
}
}
@ -74,9 +106,7 @@ namespace ONNXIR
const std::string& p_producerVersion,
const std::string& p_domain,
VERSION p_modelVersion,
const std::string& p_docString,
const std::string& p_modelAuthor,
const std::string& p_modelLicense)
const std::string& p_docString)
{
m_graph.reset(new Graph(p_graphName, p_graphDocString));
m_modelProto.set_ir_version(p_irVersion);
@ -85,8 +115,6 @@ namespace ONNXIR
m_modelProto.set_domain(p_domain);
m_modelProto.set_model_version(p_modelVersion);
m_modelProto.set_doc_string(p_docString);
m_modelProto.set_model_author(p_modelAuthor);
m_modelProto.set_model_license(p_modelLicense);
}
Model::Model(const ModelProto& p_modelProto)
@ -166,31 +194,16 @@ namespace ONNXIR
m_modelProto.set_doc_string(p_docString);
}
const std::string& Model::ModelAuthor() const
{
return m_modelProto.model_author();
}
void Model::SetModelAuthor(const std::string& p_modelAuthor)
{
m_modelProto.set_model_author(p_modelAuthor);
}
const std::string& Model::ModelLicense() const
{
return m_modelProto.model_license();
}
void Model::SetModelLicense(const std::string& p_modelLicense)
{
m_modelProto.set_model_license(p_modelLicense);
}
Graph* Model::MainGraph()
{
return m_graph.get();
}
const Graph* Model::MainGraph() const
{
return m_graph.get();
}
const ModelProto& Model::ToProto()
{
*(m_modelProto.mutable_graph()) = m_graph->ToGraphProto();
@ -198,91 +211,92 @@ namespace ONNXIR
}
#ifdef _WIN32
bool Model::Load(const std::wstring& p_filePath, /*out*/ ModelProto* p_modelProto)
Status Model::Load(const std::wstring& p_filePath, std::shared_ptr<Model>* p_model)
{
return Load(FileOpenRd(p_filePath), p_modelProto);
int fd;
RETURN_IF_ERROR(FileOpenRd(p_filePath, &fd));
auto status = Load(fd, p_model);
RETURN_IF_ERROR(FileClose(fd));
return status;
}
std::shared_ptr<Model> Model::Load(const std::wstring& p_filePath)
Status Model::Save(Model& p_model, const std::wstring& p_filePath)
{
return Load(FileOpenRd(p_filePath));
}
bool Model::Save(Model& p_model, const std::wstring& p_filePath)
{
return Save(p_model.ToProto(), FileOpenWr(p_filePath));
}
bool Model::Save(const ModelProto& p_modelProto, const std::wstring& p_filePath)
{
return Save(p_modelProto, FileOpenWr(p_filePath));
int fd;
RETURN_IF_ERROR(FileOpenWr(p_filePath, &fd));
auto status = Save(p_model, fd);
RETURN_IF_ERROR(FileClose(fd));
return status;
}
#endif
bool Model::Load(const std::string& p_filePath, /*out*/ ModelProto* p_modelProto)
Status Model::Load(const std::string& p_filePath, std::shared_ptr<Model>* p_model)
{
return Load(FileOpenRd(p_filePath), p_modelProto);
int fd;
RETURN_IF_ERROR(FileOpenRd(p_filePath, &fd));
auto status = Load(fd, p_model);
RETURN_IF_ERROR(FileClose(fd));
return status;
}
std::shared_ptr<Model> Model::Load(const std::string& p_filePath)
Status Model::Save(Model& p_model, const std::string& p_filePath)
{
return Load(FileOpenRd(p_filePath));
}
bool Model::Save(Model& p_model, const std::string& p_filePath)
{
return Save(p_model.ToProto(), FileOpenWr(p_filePath));
}
bool Model::Save(const ModelProto& p_modelProto, const std::string& p_filePath)
{
return Save(p_modelProto, FileOpenWr(p_filePath));
int fd;
RETURN_IF_ERROR(FileOpenWr(p_filePath, &fd));
auto status = Save(p_model, fd);
RETURN_IF_ERROR(FileClose(fd));
return status;
}
using ::google::protobuf::io::ZeroCopyInputStream;
using ::google::protobuf::io::FileInputStream;
using ::google::protobuf::io::CodedInputStream;
bool Model::Load(int p_fd, /*out*/ ModelProto* p_modelProto)
Status Model::Load(int p_fd, std::shared_ptr<Model>* p_model)
{
if (nullptr == p_modelProto || p_fd < 0)
if (p_fd < 0 || nullptr == p_model)
{
return false;
return Status(ONNX, INVALID_ARGUMENT, "<p_fd> less than 0 or <p_model> is nullptr.");
}
std::unique_ptr<ZeroCopyInputStream> raw_input(new FileInputStream(p_fd));
std::unique_ptr<CodedInputStream> coded_input(
new CodedInputStream(raw_input.get()));
// Allows protobuf library versions < 3.2.0 to parse messages greater than 64MB.
coded_input->SetTotalBytesLimit(INT_MAX, INT_MAX);
bool result = p_modelProto->ParseFromCodedStream(coded_input.get());
ModelProto modelProto;
bool result = modelProto.ParseFromCodedStream(coded_input.get());
coded_input.reset();
raw_input.reset();
close(p_fd);
return result;
if (!result)
{
return Status(ONNX, INVALID_PROTOBUF, "Protobuf parsing failed.");
}
std::shared_ptr<Model> Model::Load(int p_fd)
{
ModelProto modelProto;
bool result = Load(p_fd, &modelProto);
if (!result || p_fd < 0)
{
return nullptr;
}
auto model = std::shared_ptr<Model>(new Model(modelProto));
auto status = model->MainGraph()->Resolve();
(*p_model).reset(new Model(modelProto));
RETURN_IF_ERROR((*p_model)->MainGraph()->Resolve());
close(p_fd);
if (status.Ok())
{
return model;
}
return nullptr;
return Status::OK();
}
bool Model::Save(const ModelProto& p_modelProto, int p_fd)
Status Model::Save(Model& p_model, int p_fd)
{
if (p_fd < 0)
{
return false;
}
bool result = p_modelProto.SerializeToFileDescriptor(p_fd);
close(p_fd);
return result;
}
return Status(ONNX, INVALID_ARGUMENT, "<p_fd> is less than 0.");
}
#pragma warning(pop)
RETURN_IF_ERROR(p_model.MainGraph()->Resolve());
auto& modelProto = p_model.ToProto();
bool result = modelProto.SerializeToFileDescriptor(p_fd);
if (result)
{
return Status::OK();
}
else
{
return Status(ONNX, INVALID_PROTOBUF, "Protobuf serialization failed.");
}
}
}

View file

@ -29,9 +29,7 @@ namespace ONNXIR
const std::string& p_producerVersion,
const std::string& p_domain,
VERSION p_modelVersion,
const std::string& p_modelDocString,
const std::string& p_modelAuthor,
const std::string& p_modelLicense);
const std::string& p_modelDocString);
Model(const ModelProto& p_modelProto);
@ -71,42 +69,27 @@ namespace ONNXIR
// Set models' doc string.
void SetDocString(const std::string& p_docString);
// Get model's author.
// Return null pointer if not specified.
const std::string& ModelAuthor() const;
// Set models' author.
void SetModelAuthor(const std::string& p_modelAuthor);
// Get model's license.
// Return null pointer if not specified.
const std::string& ModelLicense() const;
// Set models' license.
void SetModelLicense(const std::string& p_modelLicense);
// Get model's main graph.
// The return pointer is owned by <*this> model.
Graph* MainGraph();
const Graph* MainGraph() const;
// Get model's serlization proto data.
const ModelProto& ToProto();
#ifdef _WIN32
// wstring versions for Windows only.
static bool Save(const ModelProto& p_modelProto, const std::wstring& p_filePath);
static bool Save(Model& p_model, const std::wstring& p_filePath);
// Load a ModelProto from a file.
static bool Load(const std::wstring& p_filePath, /*out*/ ModelProto* p_modelProto);
static std::shared_ptr<Model> Load(const std::wstring& p_filePath);
static Status Save(Model& p_model, const std::wstring& p_filePath);
static Status Load(const std::wstring& p_filePath, /*out*/ std::shared_ptr<Model>* p_model);
#endif
// Save a ModelProto to a file.
static bool Save(const ModelProto& p_modelProto, const std::string& p_filePath);
static bool Save(Model& p_model, const std::string& p_filePath);
static bool Save(const ModelProto& p_modelProto, int p_fd);
// Load a ModelProto from a file.
static bool Load(const std::string& p_filePath, /*out*/ ModelProto* p_modelProto);
static std::shared_ptr<Model> Load(const std::string& p_filePath);
static bool Load(int p_fd, /*out*/ ModelProto* p_modelProto);
static std::shared_ptr<Model> Load(int p_fd);
static Status Save(Model& p_model, const std::string& p_filePath);
static Status Save(Model& p_model, int p_fd);
static Status Load(const std::string& p_filePath, /*out*/ std::shared_ptr<Model>* p_model);
static Status Load(int p_fd, /*out*/ std::shared_ptr<Model>* p_model);
private:

View file

@ -1,6 +1,3 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456)
#include "op.h"
#include "opsignature.h"
#include "utils.h"
@ -44,10 +41,12 @@ namespace ONNXIR
return *this;
}
#pragma warning(disable : 4100) // unused p_optional
OperatorSchemaSetter&
OperatorSchemaSetter::Input(const std::string& p_inputName,
const std::string& p_description,
const std::string& p_type)
const std::string& p_type,
bool p_optional) /* TODO: add logic for this */
{
m_inputs.push_back(std::make_tuple(p_inputName, p_description, p_type));
return *this;
@ -65,11 +64,10 @@ namespace ONNXIR
OperatorSchemaSetter&
OperatorSchemaSetter::Attr(const std::string& p_attrName,
const std::string& p_description,
AttrType p_attrType, bool required)
AttrType p_attrType, bool /*required*/)
{
m_opSchema.m_opSignature.m_attributes.push_back(
OpSignature::Attribute(p_attrName, p_attrType, p_description));
return *this;
}
@ -147,6 +145,14 @@ namespace ONNXIR
return *this;
}
OperatorSchemaSetter& OperatorSchemaSetter::FillUsing(std::function<void(OperatorSchemaSetter&)> populator)
{
if (populator) {
populator(*this);
}
return *this;
}
OperatorSchemaRegistry::RegisterOnce::RegisterOnce(
OperatorSchemaSetter& p_opSchemaSetter)
{
@ -198,28 +204,6 @@ namespace ONNXIR
opSchema.m_opSignature.m_typeConstraintMap));
}
auto& opSignature = p_opSchemaSetter.m_opSchema.m_opSignature;
if (0 == opSignature.m_inputs.size())
{
for (int i = 0; i < opSignature.m_onnxMinInput; ++i)
{
std::string name = "p" + std::to_string(i);
std::string desc = "Input Parameter " + std::to_string(i);
opSignature.m_inputs.push_back(
OpSignature::FormalParameter(name, "", desc, opSignature.m_typeConstraintMap));
}
}
if (0 == opSignature.m_outputs.size())
{
for (int i = 0; i < opSignature.m_onnxMinOutput; ++i)
{
std::string name = "p" + std::to_string(i);
std::string desc = "Output Result " + std::to_string(i);
opSignature.m_outputs.push_back(
OpSignature::FormalParameter(name, "", desc, opSignature.m_typeConstraintMap));
}
}
OperatorSchemaRegistry::Get()->Register(p_opSchemaSetter.m_opSchema);
}
@@ -246,9 +230,10 @@ namespace ONNXIR
auto iter = m_opNameToOpSchemaMap.find(p_opSchema.GetName());
if (m_opNameToOpSchemaMap.end() != iter)
{
Status status(false,
Status status(ONNX, FAIL,
"Error: operator schema with same name ("
+ p_opSchema.GetName() + ") exists.");
assert(false);
return status;
}
else
@@ -269,86 +254,57 @@ namespace ONNXIR
{
if (!OpSignature::IsValidAttribute(p_attr))
{
return Status(false, "Invalid AttributeProto.");
return Status(ONNX, FAIL, "Invalid AttributeProto.");
}
p_type = p_attr.type();
if (AttrType::AttributeProto_AttributeType_UNDEFINED == p_type)
{
if (p_attr.has_f())
{
p_type = AttrType::FLOAT;
p_type = AttrType::AttributeProto_AttributeType_FLOAT;
}
else if (p_attr.has_i())
{
p_type = AttrType::INT;
p_type = AttrType::AttributeProto_AttributeType_INT;
}
else if (p_attr.has_s())
{
p_type = AttrType::STRING;
p_type = AttrType::AttributeProto_AttributeType_STRING;
}
else if (p_attr.has_t())
{
p_type = AttrType::TENSOR;
p_type = AttrType::AttributeProto_AttributeType_TENSOR;
}
else if (p_attr.has_g())
{
p_type = AttrType::GRAPH;
p_type = AttrType::AttributeProto_AttributeType_GRAPH;
}
else if (p_attr.floats_size())
{
p_type = AttrType::FLOATS;
p_type = AttrType::AttributeProto_AttributeType_FLOATS;
}
else if (p_attr.ints_size())
{
p_type = AttrType::INTS;
p_type = AttrType::AttributeProto_AttributeType_INTS;
}
else if (p_attr.strings_size())
{
p_type = AttrType::STRINGS;
p_type = AttrType::AttributeProto_AttributeType_STRINGS;
}
else if (p_attr.tensors_size())
{
p_type = AttrType::TENSORS;
p_type = AttrType::AttributeProto_AttributeType_TENSORS;
}
else if (p_attr.graphs_size())
{
p_type = AttrType::GRAPHS;
}
else if (p_attr.has_type())
{
p_type = AttrType::TYPE;
}
else if (p_attr.types_size())
{
p_type = AttrType::TYPES;
}
else if (p_attr.has_shape())
{
p_type = AttrType::SHAPE;
}
else if (p_attr.has_shape())
{
p_type = AttrType::SHAPES;
p_type = AttrType::AttributeProto_AttributeType_GRAPHS;
}
else
{
p_type = AttrType::NONE;
return Status(false, "Invalid AttributeProto.");
return Status(ONNX, FAIL, "Invalid AttributeProto.");
}
}
return Status::OK();
}
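// Illustrative sketch of the inference above (the enclosing helper is
// presumed to be TypeUtils::GetType declared in op.h; the name and signature
// are an assumption, not confirmed by this hunk): an attribute whose type
// field is UNDEFINED is resolved from whichever value field is set.
//
//   AttributeProto attr;
//   attr.set_name("alpha");
//   attr.set_f(0.5f);   // only the float field is set
//   AttrType type = AttrType::AttributeProto_AttributeType_UNDEFINED;
//   Status s = TypeUtils::GetType(attr, type);
//   // s.Ok() == true, type == AttributeProto_AttributeType_FLOAT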
size_t ReplaceAll(std::string& s, const char* from, const char* to)
{
size_t numReplaced = 0;
std::string::size_type lenFrom = std::strlen(from);
std::string::size_type lenTo = std::strlen(to);
for (std::string::size_type pos = s.find(from); pos != std::string::npos;
pos = s.find(from, pos + lenTo)) {
s.replace(pos, lenFrom, to);
numReplaced++;
}
return numReplaced;
}
}
#pragma warning(pop)


@@ -10,8 +10,6 @@
namespace ONNXIR
{
class OpSignature;
class OperatorSchemaSetter;
typedef OperatorSchemaSetter OpSchema;
class TypeUtils
{
@@ -78,13 +76,28 @@ namespace ONNXIR
OperatorSchemaSetter& Description(const std::string& p_description);
// Grammar for type strings used in Input(), Output(), AttrWithRichType(), and TypeConstraint() api's
// <type> ::= <data_type> |
// tensor(<data_type>) |
// sparse(<data_type>) |
// seq(<type>) |
// map(<data_type>, <type>) |
// record(<name_type_list>) |
// union(<name_type_list>)
// <name_type_list> :: = <name>:<type>{ ,<name_type_list> }
// <data_type> :: = float | uint8 | ... (see data_type strings defined in constants.h)
OperatorSchemaSetter& Input(const std::string& p_inputName,
const std::string& p_description,
const std::string& p_type = "");
const std::string& p_type = "",
bool p_optional = false);
OperatorSchemaSetter& Output(const std::string& p_outputName,
const std::string& p_description,
const std::string& p_type = "");
const std::string& p_type = ""); // see grammar above.
OperatorSchemaSetter& TypeConstraint(const std::string& p_typeName,
const std::vector<std::string>& p_constraints, // see grammar above.
const std::string& p_description);
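// Illustrative example of composite type strings built from the grammar
// above; the operator, docs, and type strings below are hypothetical:
//
//   REGISTER_OPERATOR_SCHEMA(LookupValues)
//       .Description("Looks up float vectors by int64 key.")
//       .Input("table", "key/value table", "map(int64, seq(tensor(float)))")
//       .Input("keys", "query keys", "tensor(int64)")
//       .Output("values", "matched values", "seq(tensor(float))");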
OperatorSchemaSetter& Attr(const std::string& p_attrName,
const std::string& p_description,
@@ -98,10 +111,6 @@ namespace ONNXIR
ATTR_SETTER_INTERFACE(TypeProto)
ATTR_SETTER_INTERFACE(TypeProto::TensorShapeProto)
OperatorSchemaSetter& TypeConstraint(const std::string& p_typeName,
const std::vector<std::string>& p_constraints,
const std::string& p_description);
// Shape inference function will be used to infer outputs' shape with
// inputs' shape.
OperatorSchemaSetter& SetShapeInferenceFunc(
@ -112,87 +121,8 @@ namespace ONNXIR
OperatorSchemaSetter& SetAttributeParser(
AttributeParser p_attrParser);
enum class SupportType {
COMMON,
EXPERIMENTAL,
};
// Methods added for compatibility with ONNX OpSchema registration API
OpSchema& NumInputs(int n)
{
return NumInputs(n, n);
}
OpSchema& NumInputs(int min, int max)
{
m_opSchema.m_opSignature.m_onnxMinInput = min;
m_opSchema.m_opSignature.m_onnxMaxInput = max;
return *this;
}
OpSchema& NumInputs(std::set<int> allowed_input_nums)
{
return NumInputs([allowed_input_nums](int n)-> bool {
return allowed_input_nums.count(n) > 0;
});
}
OpSchema& NumInputs(std::function<bool(int)> func)
{
m_opSchema.m_opSignature.m_onnxNumInputsAllowed = func;
return *this;
}
OpSchema& NumOutputs(int n) {
return NumOutputs(n, n);
}
OpSchema& NumOutputs(int min, int max)
{
m_opSchema.m_opSignature.m_onnxMinOutput = min;
m_opSchema.m_opSignature.m_onnxMaxOutput = max;
return *this;
}
OpSchema& NumOutputs(std::set<int> allowed_output_nums)
{
return NumOutputs([allowed_output_nums](int n)-> bool {
return allowed_output_nums.count(n) > 0;
});
}
OpSchema& NumOutputs(std::function<bool(int)> func)
{
m_opSchema.m_opSignature.m_onnxNumOutputsAllowed = func;
return *this;
}
OpSchema& NumInputsOutputs(std::function<bool(int, int)> func)
{
m_opSchema.m_opSignature.m_onnxNumInputsOutputsAllowed = func;
return *this;
}
OpSchema& OutputCalculator(std::function<int(int)> calc) { return *this; }
OpSchema& SameNumberOfOutput() { return *this; }
OpSchema& AllowConsumed(std::function<std::pair<bool, int>(int)> inplace) { return *this; }
OpSchema& AllowConsumed(std::unordered_map<int, int> inplace) { return *this; }
OpSchema& AllowOneToOneConsumed() { return *this; }
OpSchema& EnforceConsumed(std::function<std::pair<bool, int>(int)> inplace) { return *this; }
OpSchema& EnforceConsumed(std::unordered_map<int, int> inplace) { return *this; }
OpSchema& EnforceOneToOneConsumed() { return *this; }
OpSchema& SetSupportLevel(SupportType) { return *this; }
OpSchema& AllowUncheckedAttributes() { return *this; }
OpSchema& FillUsing(std::function<void(OpSchema&)> populator)
{
if (populator)
{
populator(*this);
}
return *this;
}
OpSchema& Input(const int, const char* name, const char* description)
{
return Input(name, description);
}
OpSchema& Output(const int, const char* name, const char* description)
{
return Output(name, description);
}
OpSchema& SetDoc(const std::string& doc)
{
return Description(doc);
}
// Adds docs for templated/macro ops.
OperatorSchemaSetter& FillUsing(std::function<void(OperatorSchemaSetter&)> populator);
private:
@@ -244,9 +174,6 @@ namespace ONNXIR
std::unordered_map<std::string, OperatorSchema> m_opNameToOpSchemaMap;
};
// utility function used by ONNX v1 op registration defs.
size_t ReplaceAll(std::string& s, const char* from, const char* to);
#define REGISTER_OPERATOR_SCHEMA(OpName) OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, OpName)
#define OPERATOR_SCHEMA_UNIQ_HELPER(Counter, OpName) OPERATOR_SCHEMA_UNIQ(Counter, OpName)
#define OPERATOR_SCHEMA_UNIQ(Counter, OpName) \
@@ -254,11 +181,11 @@ namespace ONNXIR
= OperatorSchemaSetter().Name(#OpName)
// Operator registration example.
// OPERATOR_DEFINITION(Add).Description("An operator to sum two float numbers.")
// REGISTER_OPERATOR_SCHEMA(Add).Description("An operator to sum two float numbers.")
// .Input("input_1", "docstr for input_1.", "T")
// .Input("input_2", "docstr for input_2.", "T")
// .Output("output_1", "docstr for output_1.", "T")
// .TypeConstraint("T", { "float16", "float32", "float64" }, "Constrain input and output types to floats.");
// .TypeConstraint("T", { "float16", "float", "double" }, "Constrain input and output types to floats.");
}
#endif


@@ -126,6 +126,8 @@ namespace ONNXIR
return false;
}
if (p_attr.type() == AttributeProto_AttributeType_UNDEFINED)
{
int num_fields =
p_attr.has_f() +
p_attr.has_i() +
@@ -136,16 +138,13 @@
(p_attr.ints_size() > 0) +
(p_attr.strings_size() > 0) +
(p_attr.tensors_size() > 0) +
(p_attr.graphs_size() > 0) +
p_attr.has_type() +
(p_attr.types_size() > 0) +
p_attr.has_shape() +
(p_attr.shapes_size() > 0);
(p_attr.graphs_size() > 0);
if (num_fields == 1)
if (num_fields != 1)
{
return true;
}
return false;
}
}
return true;
}
}


@@ -1,34 +1,19 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456)
#ifndef CORE_GRAPH_OPSCHEMA_H
#define CORE_GRAPH_OPSCHEMA_H
#include <functional>
#include <unordered_map>
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456 4189 4996)
#include "proto/onnx/protobuf/graph.pb.h"
#pragma warning(pop)
#include "utils.h"
namespace ONNXIR
{
enum class AttrType {
NONE,
FLOAT,
INT,
STRING,
GRAPH,
TENSOR,
TYPE,
SHAPE,
FLOATS,
INTS,
STRINGS,
GRAPHS,
TENSORS,
TYPES,
SHAPES
};
typedef AttributeProto_AttributeType AttrType;
// This string array should exactly match the AttrType defined above.
static const std::string c_attrTypeStr[14] =
@@ -38,15 +23,11 @@ namespace ONNXIR
"STRING",
"GRAPH",
"TENSOR",
"TYPE",
"SHAPE",
"FLOATS",
"INTS",
"STRINGS",
"GRAPHS",
"TENSORS",
"TYPES",
"SHAPES"
"TENSORS"
};
typedef std::unordered_set<PTYPE> DataTypeSet;
@@ -178,25 +159,6 @@ namespace ONNXIR
// Get type constraint map.
const TypeConstraintMap& GetTypeConstraintMap() const;
// To support ONNX variable input/output compatibility.
// Min and Max num arguments of last input/output.
int GetOnnxMinInput() const { return m_onnxMinInput; }
int GetOnnxMaxInput() const { return m_onnxMaxInput; }
int GetOnnxMinOutput() const { return m_onnxMinOutput; }
int GetOnnxMaxOutput() const { return m_onnxMaxOutput; }
std::function<bool(int)> GetOnnxNumInputsAllowedFunc() const
{
return m_onnxNumInputsAllowed;
}
std::function<bool(int)> GetOnnxNumOutputsAllowedFunc() const
{
return m_onnxNumOutputsAllowed;
}
std::function<bool(int, int)> GetOnnxNumInputsOutputsAllowedFunc() const
{
return m_onnxNumInputsOutputsAllowed;
}
private:
friend class OperatorSchemaSetter;
@@ -219,21 +181,6 @@ namespace ONNXIR
// Map from constraint name to DataTypeSet
TypeConstraintMap m_typeConstraintMap;
// To support ONNX variable input/output compatibility.
// Min and Max num arguments of last input/output.
int m_onnxMinInput = 0;
int m_onnxMaxInput = std::numeric_limits<int>::max();
int m_onnxMinOutput = 0;
int m_onnxMaxOutput = std::numeric_limits<int>::max();
std::function<bool(int)> m_onnxNumInputsAllowed =
[](int) { return true; };
std::function<bool(int)> m_onnxNumOutputsAllowed =
[](int) { return true; };
std::function<bool(int, int)> m_onnxNumInputsOutputsAllowed =
[](int, int) { return true; };
};
}
#endif
#pragma warning(pop)


@@ -1,32 +1,114 @@
#include "status.h"
#include <string>
namespace ONNXIR
{
Status::Status(bool p_ok, const std::string& p_errMsg)
namespace Common
{
m_ok = p_ok;
m_errMsg = p_errMsg;
Status::Status(StatusCategory p_category, int p_code, const std::string& p_msg)
{
m_state.reset(new State());
m_state->m_category = p_category;
m_state->m_code = p_code;
m_state->m_msg = p_msg;
}
Status::Status(const Status& p_other)
Status::Status(StatusCategory p_category, int p_code)
: Status(p_category, p_code, EmptyString())
{
m_ok = p_other.m_ok;
m_errMsg = p_other.m_errMsg;
}
bool Status::Ok() const
{
return m_ok;
return (m_state == NULL);
}
const std::string& Status::ErrorMsg() const
StatusCategory Status::Category() const
{
return m_errMsg;
return Ok() ? StatusCategory::NONE : m_state->m_category;
}
Status Status::OK()
int Status::Code() const
{
static Status ok(true, "");
return ok;
return Ok() ? static_cast<int>(StatusCode::OK) : m_state->m_code;
}
const std::string& Status::ErrorMessage() const
{
return Ok() ? EmptyString() : m_state->m_msg;
}
std::string Status::ToString() const
{
if (m_state == nullptr)
{
return std::string("OK");
}
std::string result;
if (StatusCategory::SYSTEM == m_state->m_category)
{
result += "SystemError";
result += " : ";
result += std::to_string(errno);
}
else if (StatusCategory::ONNX == m_state->m_category)
{
result += "[ONNXError]";
result += " : ";
result += std::to_string(static_cast<int>(Code()));
std::string msg;
switch (static_cast<StatusCode>(Code()))
{
case INVALID_ARGUMENT:
msg = "INVALID_ARGUMENT";
break;
case NO_SUCHFILE:
msg = "NO_SUCHFILE";
break;
case NO_MODEL:
msg = "NO_MODEL";
break;
case ENGINE_ERROR:
msg = "ENGINE_ERROR";
break;
case RUNTIME_EXCEPTION:
msg = "RUNTIME_EXCEPTION";
break;
case INVALID_PROTOBUF:
msg = "INVALID_PROTOBUF";
break;
case MODEL_LOADED:
msg = "MODEL_LOADED";
break;
case ONNX_NOT_IMPLEMENTED:
msg = "NOT_IMPLEMENTED";
break;
default:
msg = "GENERAL ERROR";
break;
}
result += " : ";
result += msg;
result += " : ";
result += m_state->m_msg;
}
return result;
}
const Status& Status::OK()
{
static Status s_ok;
return s_ok;
}
const std::string& Status::EmptyString()
{
static std::string s_emptyStr = "";
return s_emptyStr;
}
}
}


@@ -1,41 +1,105 @@
#ifndef CORE_GRAPH_STATUS_H
#define CORE_GRAPH_STATUS_H
#include <memory>
#include <string>
namespace ONNXIR
{
namespace Common
{
#define RETURN_IF_ERROR(expr) \
do { \
auto status = (expr); \
if ((!status.Ok())) return status; \
auto _status = (expr); \
if ((!_status.Ok())) return _status; \
} while (0)
enum StatusCategory
{
NONE = 0,
SYSTEM = 1,
ONNX = 2,
};
// Error code for ONNX.
enum StatusCode
{
OK = 0,
FAIL = 1,
INVALID_ARGUMENT = 2,
NO_SUCHFILE = 3,
NO_MODEL = 4,
ENGINE_ERROR = 5,
RUNTIME_EXCEPTION = 6,
INVALID_PROTOBUF = 7,
MODEL_LOADED = 8,
ONNX_NOT_IMPLEMENTED = 9,
};
class Status
{
public:
Status() = delete;
// Constructor.
Status(bool p_ok, const std::string& p_errMsg);
Status() {}
// Copy constructor.
Status(const Status& p_other);
Status(StatusCategory p_category, int p_code, const std::string& p_msg);
Status(StatusCategory p_category, int p_code);
inline Status(const Status& p_other)
: m_state((p_other.m_state == NULL) ? NULL : new State(*p_other.m_state)) {}
// Getter of <m_ok>.
bool Ok() const;
// Getter of <m_errMsg>.
const std::string& ErrorMsg() const;
int Code() const;
static Status OK();
StatusCategory Category() const;
const std::string& ErrorMessage() const;
std::string ToString() const;
inline void operator=(const Status& p_other)
{
if (nullptr == p_other.m_state)
{
m_state.reset();
}
else if (m_state != p_other.m_state)
{
m_state.reset(new State(*p_other.m_state));
}
}
inline bool operator==(const Status& p_other) const
{
return (this->m_state == p_other.m_state) || (ToString() == p_other.ToString());
}
inline bool operator!=(const Status& p_other) const
{
return !(*this == p_other);
}
static const Status& OK();
private:
bool m_ok;
std::string m_errMsg;
static const std::string& EmptyString();
struct State
{
StatusCategory m_category;
int m_code;
std::string m_msg;
};
// As long as Code() is OK, m_state == NULL.
std::unique_ptr<State> m_state;
};
}
}
#endif // !CORE_GRAPH_STATUS_H
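// Minimal usage sketch (illustrative, not part of this header): construct a
// failure Status with a category and code, and propagate it with
// RETURN_IF_ERROR.
//
//   using namespace ONNXIR::Common;
//
//   Status CheckNonNegative(int n)
//   {
//       if (n < 0)
//       {
//           return Status(ONNX, INVALID_ARGUMENT, "n must be non-negative.");
//       }
//       return Status::OK();
//   }
//
//   Status Process(int n)
//   {
//       RETURN_IF_ERROR(CheckNonNegative(n));  // early-return on failure
//       return Status::OK();                   // Ok() <=> m_state == NULL
//   }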


@@ -0,0 +1,119 @@
#include "tensorutils.h"
namespace ONNXIR
{
using namespace Common;
namespace Utils
{
bool TensorUtils::IsLittleEndianOrder()
{
int n = 1;
return (*(char*)&n == 1);
}
Status TensorUtils::UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<std::string>* p_data)
{
if (ONNXIR::TensorProto_DataType_STRING != p_tensor.data_type()
|| nullptr == p_data)
{
return Status(StatusCategory::ONNX, StatusCode::INVALID_ARGUMENT);
}
p_data->clear();
for (auto& elem : p_tensor.string_data())
{
p_data->push_back(elem);
}
return Status::OK();
}
Status TensorUtils::UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<float>* p_data)
{
if (ONNXIR::TensorProto_DataType_FLOAT != p_tensor.data_type()
|| nullptr == p_data)
{
return Status(StatusCategory::ONNX, StatusCode::INVALID_ARGUMENT);
}
p_data->clear();
if (p_tensor.has_raw_data())
{
UnpackTensorWithRawData(p_tensor, p_data);
return Status::OK();
}
for (auto elem : p_tensor.float_data())
{
p_data->push_back(elem);
}
return Status::OK();
}
Status TensorUtils::UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<int32_t>* p_data)
{
if (ONNXIR::TensorProto_DataType_INT32 != p_tensor.data_type()
|| nullptr == p_data)
{
return Status(StatusCategory::ONNX, StatusCode::INVALID_ARGUMENT);
}
p_data->clear();
if (p_tensor.has_raw_data())
{
UnpackTensorWithRawData(p_tensor, p_data);
return Status::OK();
}
for (auto elem : p_tensor.int32_data())
{
p_data->push_back(elem);
}
return Status::OK();
}
Status TensorUtils::UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<bool>* p_data)
{
if (ONNXIR::TensorProto_DataType_BOOL != p_tensor.data_type()
|| nullptr == p_data)
{
return Status(StatusCategory::ONNX, StatusCode::INVALID_ARGUMENT);
}
p_data->clear();
if (p_tensor.has_raw_data())
{
UnpackTensorWithRawData(p_tensor, p_data);
return Status::OK();
}
for (auto elem : p_tensor.int32_data())
{
p_data->push_back(elem != 0);
}
return Status::OK();
}
Status TensorUtils::UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<int64_t>* p_data)
{
if (ONNXIR::TensorProto_DataType_INT64 != p_tensor.data_type()
|| nullptr == p_data)
{
return Status(StatusCategory::ONNX, StatusCode::INVALID_ARGUMENT);
}
p_data->clear();
if (p_tensor.has_raw_data())
{
UnpackTensorWithRawData(p_tensor, p_data);
return Status::OK();
}
for (auto elem : p_tensor.int64_data())
{
p_data->push_back(elem);
}
return Status::OK();
}
}
}


@@ -0,0 +1,64 @@
#ifndef ONNXIR_CORE_GRAPH_TENSORUTILS_H
#define ONNXIR_CORE_GRAPH_TENSORUTILS_H
#include <vector>
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456 4189 4996)
#include "proto/onnx/protobuf/graph.pb.h"
#pragma warning(pop)
#include "status.h"
namespace ONNXIR
{
namespace Utils
{
class TensorUtils
{
public:
static Common::Status UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<std::string>* p_data);
static Common::Status UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<float>* p_data);
static Common::Status UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<int32_t>* p_data);
static Common::Status UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<bool>* p_data);
static Common::Status UnpackTensor(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<int64_t>* p_data);
private:
static bool IsLittleEndianOrder();
template <typename T>
static void UnpackTensorWithRawData(const ONNXIR::TensorProto& p_tensor, /*out*/ std::vector<T>* p_data)
{
auto& raw_data = p_tensor.raw_data();
auto buff = raw_data.c_str();
size_t typeSize = sizeof(T);
for (size_t i = 0; i < raw_data.size(); i += typeSize, buff += typeSize)
{
T result;
if (IsLittleEndianOrder())
{
memcpy((void*)&result, (void*)buff, typeSize);
}
else
{
char* tempBytes = reinterpret_cast<char*>(&result);
for (size_t j = 0; j < typeSize; ++j)
{
// Reverse the byte order of each element on big-endian hosts. The index
// must be typeSize - 1 - j (not i): i is the element's byte offset within
// raw_data, not the offset within the element.
memcpy((void*)&tempBytes[j], (void*)&buff[typeSize - 1 - j], sizeof(char));
}
}
p_data->push_back(result);
}
}
};
}
}
#endif
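// Illustrative usage sketch (hypothetical tensor contents): unpack a small
// float tensor into a std::vector.
//
//   ONNXIR::TensorProto t;
//   t.set_data_type(ONNXIR::TensorProto_DataType_FLOAT);
//   t.add_dims(2);
//   t.add_float_data(1.0f);
//   t.add_float_data(2.0f);
//
//   std::vector<float> values;
//   auto status = ONNXIR::Utils::TensorUtils::UnpackTensor(t, &values);
//   // status.Ok() == true, values == {1.0f, 2.0f}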


@@ -1,13 +1,9 @@
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456)
#include <cctype>
#include <iterator>
#include <iostream>
#include <sstream>
#include "constants.h"
#include "proto/onnx/protobuf/graph.pb.h"
#include "utils.h"
namespace ONNXIR
@@ -16,14 +12,20 @@ namespace ONNXIR
{
std::unordered_map<std::string, TypeProto>& OpUtils::GetTypeStrToProtoMap()
{
static std::unordered_map<std::string, TypeProto>* typeStrToProtoMap =
new std::unordered_map<std::string, TypeProto>();
return *typeStrToProtoMap;
static std::unordered_map<std::string, TypeProto> map;
return map;
}
std::mutex& OpUtils::GetTypeStrLock()
{
static std::mutex lock;
return lock;
}
PTYPE OpUtils::ToType(const TypeProto& p_type)
{
auto typeStr = ToString(p_type);
std::lock_guard<std::mutex> lock(GetTypeStrLock());
if (GetTypeStrToProtoMap().find(typeStr) == GetTypeStrToProtoMap().end())
{
GetTypeStrToProtoMap()[typeStr] = p_type;
@@ -40,54 +42,69 @@ namespace ONNXIR
const TypeProto& OpUtils::ToTypeProto(const PTYPE& p_type)
{
std::lock_guard<std::mutex> lock(GetTypeStrLock());
auto it = GetTypeStrToProtoMap().find(*p_type);
if (it != GetTypeStrToProtoMap().end())
{
assert(it != GetTypeStrToProtoMap().end());
return it->second;
}
else
{
throw std::invalid_argument("PTYPE not found: " + *p_type);
}
}
std::string OpUtils::ToString(const TypeProto& p_type)
std::string OpUtils::ToString(const TypeProto& p_type, const std::string& left, const std::string& right)
{
switch (p_type.value_case())
{
case TypeProto::ValueCase::kTensorType:
return ToString(p_type.tensor_type().elem_type());
{
if (p_type.tensor_type().has_shape()
&& p_type.tensor_type().shape().dim_size() == 0)
{
// Scalar case.
return left + ToDataTypeString(p_type.tensor_type().elem_type()) + right;
}
else
{
return left + "tensor(" + ToDataTypeString(p_type.tensor_type().elem_type()) + ")" + right;
}
}
case TypeProto::ValueCase::kSparseTensorType:
return "sparse(" + ToString(p_type.sparse_tensor_type().elem_type()) + ")";
case TypeProto::ValueCase::kSeqType:
return "seq(" + ToString(p_type.seq_type().elem_type()) + ")";
case TypeProto::ValueCase::kTupleType:
{
int size = p_type.tuple_type().elem_type_size();
std::string tuple_str("tuple(");
for (int i = 0; i < size - 1; i++)
{
tuple_str = tuple_str + ToString(p_type.tuple_type().elem_type(i)) + ",";
}
tuple_str += ToString(p_type.tuple_type().elem_type(size - 1));
tuple_str += ")";
return tuple_str;
}
return left + "sparse(" + ToDataTypeString(p_type.sparse_tensor_type().elem_type()) + ")" + right;
case TypeProto::ValueCase::kSequenceType:
return ToString(p_type.sequence_type().elem_type(), left + "seq(", ")" + right);
case TypeProto::ValueCase::kMapType:
{
std::string map_str("map(");
map_str = map_str + ToString(p_type.map_type().key_type()) + ","
+ ToString(p_type.map_type().value_type()) + ")";
return map_str;
std::string map_str = "map(" + ToDataTypeString(p_type.map_type().key_type()) + ",";
return ToString(p_type.map_type().value_type(), left + map_str, ")" + right);
}
case TypeProto::ValueCase::kRecordType:
{
std::string record_str("record(");
int size = p_type.record_type().field_size();
for (int i = 0; i < size - 1; i++)
{
record_str = ToString(p_type.record_type().field(i).type(),
record_str + p_type.record_type().field(i).name() + ":" , ",");
}
record_str += p_type.record_type().field(size - 1).name() + ":";
return ToString(p_type.record_type().field(size - 1).type(), left + record_str, ")" + right);
}
case TypeProto::ValueCase::kUnionType:
{
std::string union_str("union(");
int size = p_type.union_type().choice_size();
for (int i = 0; i < size - 1; i++)
{
union_str = ToString(p_type.union_type().choice(i).type(),
union_str + p_type.union_type().choice(i).name() + ":", ",");
}
union_str += p_type.union_type().choice(size - 1).name() + ":";
return ToString(p_type.union_type().choice(size - 1).type(), left + union_str, ")" + right);
}
case TypeProto::ValueCase::kHandleType:
return "handle";
default:
throw std::invalid_argument("Unknown TypeProto");
assert(false);
return "";
}
}
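// For example (illustrative): the left/right accumulators let nested types
// render without extra recursion bookkeeping, so a TypeProto for a sequence
// of maps prints as "seq(map(int64,tensor(float)))", while a tensor whose
// shape has zero dimensions prints as the bare scalar name, e.g. "float".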
std::string OpUtils::ToString(const TensorProto::DataType& p_type)
std::string OpUtils::ToDataTypeString(const TensorProto::DataType& p_type)
{
TypesWrapper& t = TypesWrapper::GetTypesWrapper();
switch (p_type)
@@ -124,81 +141,226 @@ namespace ONNXIR
return t.c_complex128;
}
throw std::invalid_argument("Unknown DataType");
assert(false);
return "";
}
std::string OpUtils::ToAttrTypeString(const ValueProto& p_value, const std::string& left, const std::string& right)
{
switch (p_value.value_case())
{
case ValueProto::ValueCase::kDenseTensor:
{
if (p_value.dense_tensor().dims_size() == 0)
{
// Scalar case.
return left + ToDataTypeString(p_value.dense_tensor().data_type()) + right;
}
else
{
return left + "tensor(" + ToDataTypeString(p_value.dense_tensor().data_type()) + ")" + right;
}
}
case ValueProto::ValueCase::kSparseTensor:
return left + "sparse(" + ToDataTypeString(p_value.sparse_tensor().values().data_type()) + ")" + right;
case ValueProto::ValueCase::kSeq:
{
assert(p_value.seq().elems_size() > 0);
return ToAttrTypeString(p_value.seq().elems(0), left + "seq(", ")" + right);
}
case ValueProto::ValueCase::kScalarMap:
{
const int keys_size = p_value.scalar_map().keys_size();
const int values_size = p_value.scalar_map().values_size();
Ignore(keys_size, values_size);
assert(keys_size > 0);
assert(values_size > 0);
assert(keys_size == values_size);
std::string map_str = "map(" + ToDataTypeString(p_value.scalar_map().keys(0).data_type()) + ","
+ "tensor(" + ToDataTypeString(p_value.scalar_map().values(0).data_type()) + "))";
return map_str;
}
case ValueProto::ValueCase::kMap:
{
assert(p_value.map().key_value_pairs_size() > 0);
std::string map_str("map(");
std::string key_str;
TypesWrapper& t = TypesWrapper::GetTypesWrapper();
switch (p_value.map().key_value_pairs(0).key_case())
{
case ValueProto_KeyValuePairProto::KeyCase::kS:
key_str = t.c_string;
break;
case ValueProto_KeyValuePairProto::KeyCase::kI32:
key_str = t.c_int32;
break;
case ValueProto_KeyValuePairProto::KeyCase::kI64:
key_str = t.c_int64;
break;
case ValueProto_KeyValuePairProto::KeyCase::kUi64:
key_str = t.c_uint64;
break;
default:
assert(false);
}
map_str += key_str + ",";
return ToAttrTypeString(p_value.map().key_value_pairs(0).value(), left + map_str, ")" + right);
}
case ValueProto::ValueCase::kRecord:
{
int fields_size = p_value.record().fields_size();
assert(fields_size > 0);
std::string record_str("record(");
for (int i = 0; i < fields_size - 1; i++)
{
record_str = ToAttrTypeString(p_value.record().fields(i).value(),
record_str + p_value.record().fields(i).key() + ":", ",");
}
record_str += p_value.record().fields(fields_size - 1).key() + ":";
return ToAttrTypeString(p_value.record().fields(fields_size - 1).value(), left + record_str, ")" + right);
}
case ValueProto::ValueCase::kUnion:
{
assert(p_value.union_().has_choice());
std::string union_str = "union(" + p_value.union_().choice().key() + ":";
return ToAttrTypeString(p_value.union_().choice().value(), left + union_str, ")" + right);
}
default:
assert(false);
return "";
}
}
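// For example (illustrative): a ValueProto whose scalar_map holds int64 keys
// and float values is rendered as "map(int64,tensor(float))".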
void OpUtils::FromString(const std::string& p_src, TypeProto& p_type)
{
StringRange s(p_src);
s.LAndRStrip();
p_type.Clear();
if (s.LStrip("seq("))
if (s.LStrip("seq"))
{
s.RStrip(")");
FromString(std::string(s.Data(), s.Size()), *p_type.mutable_seq_type()->mutable_elem_type());
s.ParensWhitespaceStrip();
return FromString(std::string(s.Data(), s.Size()), *p_type.mutable_sequence_type()->mutable_elem_type());
}
else if (s.LStrip("tuple("))
{
s.RStrip(")");
std::istringstream types(std::string(s.Data(), s.Size()));
std::string type;
while (std::getline(types, type, ','))
{
FromString(type, *p_type.mutable_tuple_type()->mutable_elem_type()->Add());
}
}
else if (s.LStrip("map("))
else if (s.LStrip("map"))
{
s.ParensWhitespaceStrip();
size_t key_size = s.Find(',');
StringRange k(s.Data(), key_size);
std::string key = std::string(k.Data(), k.Size());
s.LStrip(key_size);
s.LStrip(",");
size_t val_size = s.Find(')');
StringRange v(s.Data(), val_size);
std::string val = std::string(v.Data(), v.Size());
StringRange v(s.Data(), s.Size());
TensorProto::DataType key_type;
FromString(key, key_type);
TensorProto::DataType val_type;
FromString(val, val_type);
FromDataTypeString(key, key_type);
p_type.mutable_map_type()->set_key_type(key_type);
p_type.mutable_map_type()->set_value_type(val_type);
return FromString(std::string(v.Data(), v.Size()), *p_type.mutable_map_type()->mutable_value_type());
}
else if (s.LStrip("handle"))
else if (s.LStrip("record"))
{
p_type.mutable_handle_type();
s.ParensWhitespaceStrip();
std::vector<StringRange> fields;
SplitStringTokens(s, fields);
for (auto& f : fields)
{
ValueInfoProto* valueinfo = p_type.mutable_record_type()->mutable_field()->Add();
size_t name_size = f.Find(':');
StringRange n(f.Data(), name_size);
std::string name = std::string(n.Data(), n.Size());
valueinfo->set_name(name);
f.LStrip(name_size);
f.LStrip(":");
FromString(std::string(f.Data(), f.Size()), *valueinfo->mutable_type());
}
else if (s.LStrip("sparse("))
}
else if (s.LStrip("union"))
{
s.RStrip(")");
s.ParensWhitespaceStrip();
std::vector<StringRange> choices;
SplitStringTokens(s, choices);
for (auto& c : choices)
{
ValueInfoProto* valueinfo = p_type.mutable_union_type()->mutable_choice()->Add();
size_t name_size = c.Find(':');
StringRange n(c.Data(), name_size);
std::string name = std::string(n.Data(), n.Size());
valueinfo->set_name(name);
c.LStrip(name_size);
c.LStrip(":");
FromString(std::string(c.Data(), c.Size()), *valueinfo->mutable_type());
}
}
else if (s.LStrip("sparse"))
{
s.ParensWhitespaceStrip();
TensorProto::DataType e;
FromString(std::string(s.Data(), s.Size()), e);
FromDataTypeString(std::string(s.Data(), s.Size()), e);
p_type.mutable_sparse_tensor_type()->set_elem_type(e);
}
else if (s.LStrip("tensor"))
{
s.ParensWhitespaceStrip();
TensorProto::DataType e;
FromDataTypeString(std::string(s.Data(), s.Size()), e);
p_type.mutable_tensor_type()->set_elem_type(e);
}
else
{
// dense tensor
// Scalar
TensorProto::DataType e;
FromString(std::string(s.Data(), s.Size()), e);
p_type.mutable_tensor_type()->set_elem_type(e);
FromDataTypeString(std::string(s.Data(), s.Size()), e);
TypeProto::TensorTypeProto* t = p_type.mutable_tensor_type();
t->set_elem_type(e);
// Call mutable_shape() to initialize a shape with no dimension.
t->mutable_shape();
}
}
bool OpUtils::IsValidDataTypeString(const std::string& p_dataType)
{
TypesWrapper& t = TypesWrapper::GetTypesWrapper();
return (t.GetAllowedDataTypes().find(p_dataType) != t.GetAllowedDataTypes().end());
const auto& allowedSet = t.GetAllowedDataTypes();
return (allowedSet.find(p_dataType) != allowedSet.end());
}
void OpUtils::FromString(const std::string& p_typeStr, TensorProto::DataType& p_type)
void OpUtils::SplitStringTokens(StringRange& p_src, std::vector<StringRange>& p_tokens)
{
if (!IsValidDataTypeString(p_typeStr))
int parens = 0;
p_src.RestartCapture();
while (p_src.Size() > 0)
{
throw std::invalid_argument("Unknown DataType: " + p_typeStr);
if (p_src.StartsWith(","))
{
if (parens == 0)
{
p_tokens.push_back(p_src.GetCaptured());
p_src.LStrip(",");
p_src.RestartCapture();
}
else
{
p_src.LStrip(",");
}
}
else if (p_src.LStrip("("))
{
parens++;
}
else if (p_src.LStrip(")"))
{
parens--;
}
else
{
p_src.LStrip(1);
}
}
p_tokens.push_back(p_src.GetCaptured());
}
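// Illustrative example: only top-level commas split tokens, since commas are
// skipped while the parenthesis counter is non-zero. For the hypothetical
// input
//
//   "a:int32,b:map(int64,tensor(float))"
//
// SplitStringTokens() yields the two tokens "a:int32" and
// "b:map(int64,tensor(float))".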
void OpUtils::FromDataTypeString(const std::string& p_typeStr, TensorProto::DataType& p_type)
{
assert(IsValidDataTypeString(p_typeStr));
TypesWrapper& t = TypesWrapper::GetTypesWrapper();
if (p_typeStr == t.c_bool)
@@ -263,25 +425,32 @@ namespace ONNXIR
}
else
{
p_type = TensorProto::DataType::TensorProto_DataType_UNDEFINED;
assert(false);
}
}
StringRange::StringRange()
: m_data(""), m_size(0)
: m_data(""), m_size(0), m_start(m_data), m_end(m_data)
{}
StringRange::StringRange(const char* p_data, size_t p_size)
: m_data(p_data), m_size(p_size)
{}
: m_data(p_data), m_size(p_size), m_start(m_data), m_end(m_data)
{
assert(p_data != nullptr);
LAndRStrip();
}
StringRange::StringRange(const std::string& p_str)
: m_data(p_str.data()), m_size(p_str.size())
{}
: m_data(p_str.data()), m_size(p_str.size()), m_start(m_data), m_end(m_data)
{
LAndRStrip();
}
StringRange::StringRange(const char* p_data)
: m_data(p_data), m_size(strlen(p_data))
{}
: m_data(p_data), m_size(strlen(p_data)), m_start(m_data), m_end(m_data)
{
LAndRStrip();
}
const char* StringRange::Data() const
{
@@ -307,18 +476,21 @@ namespace ONNXIR
{
m_data = "";
m_size = 0;
m_start = m_end = m_data;
}
void StringRange::Reset(const char* p_data, size_t p_size)
{
m_data = p_data;
m_size = p_size;
m_start = m_end = m_data;
}
void StringRange::Reset(const std::string& p_str)
{
m_data = p_str.data();
m_size = p_str.size();
m_start = m_end = m_data;
}
bool StringRange::StartsWith(const StringRange& p_str) const
@@ -346,12 +518,14 @@ namespace ONNXIR
}
return false;
}
bool StringRange::LStrip(size_t p_size)
{
if (p_size <= m_size)
{
m_data += p_size;
m_size -= p_size;
m_end += p_size;
return true;
}
return false;
@@ -400,7 +574,18 @@ namespace ONNXIR
bool StringRange::LAndRStrip()
{
return LStrip() || RStrip();
bool l = LStrip();
bool r = RStrip();
return l || r;
}
void StringRange::ParensWhitespaceStrip()
{
LStrip();
LStrip("(");
LAndRStrip();
RStrip(")");
RStrip();
}
size_t StringRange::Find(const char p_ch) const
@@ -416,7 +601,16 @@ namespace ONNXIR
}
return std::string::npos;
}
}
void StringRange::RestartCapture()
{
m_start = m_data;
m_end = m_data;
}
#pragma warning(pop)
StringRange StringRange::GetCaptured()
{
return StringRange(m_start, m_end - m_start);
}
}
}


@@ -1,34 +1,46 @@
#ifndef ONNXIR_UTILS_H
#define ONNXIR_UTILS_H
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <string>
class TensorProto;
class TypeProto;
#pragma warning(push)
#pragma warning(disable : 4800 4610 4512 4510 4267 4127 4125 4100 4456 4189 4996)
#include "proto/onnx/protobuf/graph.pb.h"
#pragma warning(pop)
namespace ONNXIR
{
typedef const std::string* PTYPE;
namespace Utils
{
class StringRange;
class OpUtils
{
public:
static PTYPE ToType(const TypeProto& p_type);
static PTYPE ToType(const std::string& p_type);
static const TypeProto& ToTypeProto(const PTYPE& p_type);
static std::string ToString(const TypeProto& p_type);
static std::string ToString(const TensorProto::DataType& p_type);
static std::string ToString(const TypeProto& p_type, const std::string& left = "", const std::string& right = "");
static std::string ToDataTypeString(const TensorProto::DataType& p_type);
static std::string ToAttrTypeString(const ValueProto& p_value, const std::string& left = "", const std::string& right = "");
static void FromString(const std::string& p_src, TypeProto& p_type);
static void FromString(const std::string& p_src, TensorProto::DataType& p_type);
static void FromDataTypeString(const std::string& p_src, TensorProto::DataType& p_type);
static bool IsValidDataTypeString(const std::string &p_dataType);
static void SplitStringTokens(StringRange& p_src, std::vector<StringRange>& p_tokens);
private:
static std::unordered_map<std::string, TypeProto>& GetTypeStrToProtoMap();
// Returns lock used for concurrent updates to TypeStrToProtoMap.
static std::mutex& GetTypeStrLock();
};
// Simple class which contains a pointer into an external string buffer and a
// size. It can be used to track a "valid" range/slice of that string.
// The caller should ensure the StringRange is not used after the external
// storage has been freed.
class StringRange
{
public:
@@ -52,12 +64,31 @@ namespace ONNXIR
bool RStrip(size_t p_size);
bool RStrip(StringRange p_str);
bool LAndRStrip();
void ParensWhitespaceStrip();
size_t Find(const char p_ch) const;
// These methods return the range of the string that has been consumed by
// LStrip() since the last RestartCapture(), i.e. they capture the
// stripped-off prefix.
StringRange GetCaptured();
void RestartCapture();
private:
// m_data + size tracks the "valid" range of the external string buffer.
const char* m_data;
size_t m_size;
// m_start and m_end track the captured range.
// m_end advances when LStrip() is called.
const char* m_start;
const char* m_end;
};
// Use this to avoid compiler warnings about unused variables. E.g., if
// a variable is only used in an assert when compiling in Release mode.
// Adapted from https://stackoverflow.com/questions/15763937/unused-parameter-in-c11
template<typename... Args>
void Ignore(Args&&...) {}
}
}


@@ -0,0 +1,242 @@
#include "proto/onnx/core/op.h"
namespace ONNXIR {
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Sigmoid)
.Description("Sigmoid takes one input data (Tensor<T>) and produces one output data "
"(Tensor<T>) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the "
"tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The sigmoid value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Tanh)
.Description("Calculates the hyperbolic tangent of the given input tensor element-wise. "
"This operation can be done in an in-place fashion too, by providing the same input "
"and output blobs.")
.Input("input", "input tensor", "T")
.Output("output", "The hyperbolic tangent value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Relu)
.Description("Relu takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the rectified linear function, y = max(0, x), is "
"applied to the tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The Relu value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(LeakyRelu)
.Description("LeakyRelu takes input data (Tensor<T>) and an argument alpha, "
"and produces one output data (Tensor<T>) where the function "
"`f(x) = alpha * x for x < 0`, `f(x) = x for x >= 0`, is applied to the data "
"tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The LeakyRelu value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha","Coefficient of leakage", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(PRelu)
.Description("PRelu takes input data (Tensor<T>) and slope tensor as input, "
"and produces one output data (Tensor<T>) where the function "
"`f(x) = slope * x for x < 0`, `f(x) = x for x >= 0`., is applied to the "
"data tensor elementwise.")
.Input("X", "Input tensor", "T")
.Input("Slope", "Slope tensor. If `Slope` is of size 1, the value is shared"
"across different channels", "T")
.Output("Y", "The PRelu value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Elu)
.Description("Elu takes one input data (Tensor<T>) and produces one output data"
"(Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x < 0`, "
"`f(x) = x for x >= 0`., is applied to the tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The elu value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Coefficient of ELU default to 1", AttrType::AttributeProto_AttributeType_FLOAT, float(1.0));
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Selu)
.Description("Selu takes one input data (Tensor<T>) and produces one output data "
"(Tensor<T>) where the scaled exponential linear unit function, "
"`y = gamma * (alpha * e^x - alpha) for x <= 0`, `f(x) = gamma * x for x > 0`, "
"is applied to the tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The selu value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Coefficient of SELU default to 1.6732.", AttrType::AttributeProto_AttributeType_FLOAT, float(1.6732))
.Attr("gamma", "Coefficient of SELU default to 1.0507.", AttrType::AttributeProto_AttributeType_FLOAT, float(1.0507));
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Softmax)
.Description("The operator computes the softmax normalized values for each layer in the batch "
"of the given input. The input is a 2-D tensor (Tensor<float>) of size "
"(batch_size x input_feature_dimensions). The output tensor has the same shape "
"and contains the softmax normalized values of the corresponding input. "
" "
"X does not need to explicitly be a 2D vector; rather, it will be "
"coerced into one. For an arbitrary n-dimensional tensor "
"X in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and k is "
"the axis provided, then X will be coerced into a 2-dimensional tensor with "
"dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default "
"case where axis=1, this means the X tensor will be coerced into a 2D tensor "
"of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size. "
"In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D. "
"Each of these dimensions must be matched correctly, or else the operator "
"will throw errors.")
.Input("input","The input tensor that's coerced into a 2D matrix of size (NxD) "
"as described above.", "T")
.Output("output", "The softmax normalized output values with the same "
"shape as input tensor.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "(int) default to 1; describes the axis of the inputs when coerced "
"to 2D; defaults to one because the 0th axis most likely describes "
"the batch_size", AttrType::AttributeProto_AttributeType_INT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Linear)
.Description("Linear takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the linear function, f(x)= alpha * x + beta is "
"applied to the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Scalar multiplication factor", AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("beta", "Scalar offset", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(HardSigmoid)
.Description("HardSigmoid takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the hard sigmoid function, f(x) = max(0,min(alpha*x+beta,1)), "
"is applied to the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Scaling value", AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("beta", "Scalar offset", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(ScaledTanh)
.Description("ScaledTanh takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the scaled hyperbolic tangent function, "
"f(x) = alpha*tanh(beta*x), is applied to the tensor elementwise.")
.Input("input", "Input tensor, typically 1-D.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Scaling value", AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("beta", "Scaling value", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(ThresholdedRelu)
.Description("Thresholded Relu takes input data (Tensor<T>) and threshold as input, and "
"produces one output data (Tensor<T>) where the function `f(x) = 0 for x < alpha, "
"x for x >= alpha`, is applied to the data tensor elementwise.")
.Input("input", "Input tensor, typically 1-D.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Scalar threshold value", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(LogSoftmax)
.Description("Log Softmax takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the function, y = log(1 / sum(exp(X)) * exp(x)), is applied "
"to the tensor elementwise.")
.Input("input", "The input tensor that's coerced into a 2D matrix of size (NxD) as "
"described above.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "(int) default to 1; describes the axis of the inputs when coerced "
"to 2D; defaults to one because the 0th axis most likely describes "
"the batch_size", AttrType::AttributeProto_AttributeType_INT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Hardmax)
.Description("Compute the hardmax normalized values for each layer in the batch "
"of the given input. The input is a 2-D tensor (Tensor<float>) of size "
"(batch_size x input_feature_dimensions). The output tensor has the same shape "
"and contains the softmax normalized values of the corresponding input. "
"\n"
"X does not need to explicitly be a 2D vector; rather, it will be coerced into "
"one. For an arbitrary n-dimensional tensor X in [a_0, a_1, ..., a_{k-1}, "
"a_k, ..., a_{n-1}] and k is the axis provided, then X will be coerced into a "
"2-dimensional tensor with dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. "
"For the default case where axis=1, this means the X tensor will be coerced into "
"a 2D tensor of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the "
"batch size. In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D. "
"Each of these dimensions must be matched correctly, or else the operator will "
"throw errors.")
.Input("input", "The input tensor that's coerced into a 2D matrix of size (NxD) as "
"described above.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "Default to 1; describes the axis of the inputs when coerced to 2D; "
"defaults to one because the 0th axis most likely describes the batch size.",
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Softsign)
.Description("Softsign takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the function, y = x / (1 + abs(x)), is applied to the "
"tensor elementwise.")
.Input("input", "Input tensor, typically 1-D.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Coefficient of SELU default to 1.6732.", AttrType::AttributeProto_AttributeType_FLOAT, float(1.6732));
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(Softplus)
.Description("Softplus takes one input data (Tensor<T>) and produces one output "
"data (Tensor<T>) where the function, y = ln(1 + exp(steepness * x)), is "
"applied to the tensor elementwise.")
.Input("input", "Input tensor, typically 1-D.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("steepness", "Steepness (default to 1, must be > 1)", AttrType::AttributeProto_AttributeType_FLOAT, float(1.0));
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(ParametericSoftplus)
.Description("Softplus takes input data (Tensor<T>) and parametric tensors, "
"producing one output data (Tensor<T>) where the function, "
"y = alpha * log(1 + exp(beta * x), is applied to the tensor elementwise.")
.Input("input", "Input tensor, typically 1-D.", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("alpha", "Alpha tensor. If `alpha` is of size 1, "
"the value is shared across different channels.", AttrType::AttributeProto_AttributeType_FLOAT, float(1.0))
.Attr("beta", "Beta tensor. If `beta` is of size 1, "
"the value is shared across different channels.", AttrType::AttributeProto_AttributeType_FLOAT, float(1.0));
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Identity)
.Description("Identity takes one input data (Tensor<T>) and produces one "
"output data (Tensor<T>) where the function, y = x, is applied to the "
"tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "output tensor", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
}


@@ -1,329 +0,0 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
using SupportType = ONNXIR::OpSchema::SupportType;
namespace ONNXIR {
REGISTER_OPERATOR_SCHEMA(ConstantFill)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(0, 1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
The operator fills the elements of the output tensor with a constant value
specified by the 'value' argument.
The data type is specified by the 'dtype' argument. The 'dtype' argument must
be one of the data types specified in the 'DataType' enum field in the
TensorProto message. If the 'dtype' argument is not provided, the data type of
'value' is used.
The output tensor shape is specified by the 'shape' argument. If the number of
inputs is 1, the shape will be identical to that of the input at run time, with
optional additional dimensions appended at the end as specified by the
'extra_shape' argument. In that case the 'shape' argument should not be set.
If input_as_shape is set to true, then the input should be a 1D tensor
containing the desired output shape (the dimensions specified in extra_shape
will also be appended).
NOTE: Currently, it supports the data types float, int32, int64, and bool.
)DOC")
.Attr("value",
"The value for the elements of the output tensor.",
AttrType::FLOAT)
.Attr(
"dtype",
"The data type for the elements of the output tensor."
"Strictly must be one of the types from DataType enum in TensorProto.",
AttrType::INT)
.Attr(
"shape",
"The shape of the output tensor."
"Cannot set the shape argument and pass in an input at the same time.",
AttrType::INTS)
.Attr(
"extra_shape",
"The additional dimensions appended at the end of the shape indicated"
"by the input blob."
"Cannot set the extra_shape argument when there is no input blob.",
AttrType::INTS)
.Attr(
"input_as_shape",
"1D tensor containing the desired output shape. First input must be in "
"CPU context.",
AttrType::INT)
.Input(0, "input", "Input tensor (optional) to provide shape information.")
.Output(
0,
"output",
"Output tensor of constant values specified by 'value'"
"argument and its type is specified by the 'dtype' argument");
REGISTER_OPERATOR_SCHEMA(Caffe2ConvTranspose)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(3)
.NumOutputs(1)
.SetDoc(R"DOC(
The transposed convolution consumes an input vector, the filter blob, and
the bias blob, and computes the output. Note that other parameters, such as
the stride and kernel size, or the pads' sizes in each direction are not
necessary for input because they are provided by the
ConvTransposeUnpoolOpBase operator. Various dimension checks are done
implicitly, and the sizes are specified in the Input docs for this operator.
As is expected, the filter is deconvolved with a subset of the
image and the bias is added; this is done throughout the image data and the
output is computed. As a side note on the implementation layout:
conv_transpose_op_impl.h is the templated implementation of the
conv_transpose_op.h file, which is why they are separate files.
)DOC")
.Input(
0,
"X",
"Input data blob from previous layer; has size "
"(N x C x H x W), where N is the batch size, C is the number of channels, and"
" H and W are the height and width. Note that this is for the NCHW usage. On "
"the other hand, the NHWC Op has a different set of dimension constraints.")
.Input(
1,
"filter",
"The filter blob that will be used in the transposed "
"convolution; has size (M x C x kH x kW), where C is the number of channels,"
" and kH and kW are the height and width of the kernel.")
.Input(
2,
"bias",
"The 1D bias blob that is added through the convolution;"
"has size (C)")
.Output(
0,
"Y",
"Output data blob that contains the result of the "
"transposed convolution. The output dimensions are functions of the kernel"
" size, stride size, and pad lengths.")
.Attr("pads", "", AttrType::INTS)
.Attr("kernel_shape", "", AttrType::INTS)
.Attr("dilations", "", AttrType::INTS)
.Attr("group", "", AttrType::INT)
.Attr("strides", "", AttrType::INTS);
REGISTER_OPERATOR_SCHEMA(SpatialBN)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(5)
.NumOutputs({ 1, 5 })
.EnforceConsumed({ {3, 1}, {4, 2} })
.SetDoc(R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:
Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)
)DOC")
.Attr("is_test",
"If set to nonzero, run spatial batch normalization in test mode.",
AttrType::INT)
.Attr("epsilon",
"The epsilon value to use to avoid division by zero.",
AttrType::FLOAT)
.Attr("momentum",
"Factor used in computing the running mean and variance."
"e.g., running_mean = running_mean * momentum + mean * (1 - momentum)",
AttrType::FLOAT)
.Input(0,
"X",
"The input 4-dimensional tensor of shape NCHW.")
.Input(1,
"scale",
"The scale as a 1-dimensional tensor of size C to be applied to the "
"output.")
.Input(2,
"bias",
"The bias as a 1-dimensional tensor of size C to be applied to the "
"output.")
.Input(3,
"mean",
"The running mean (training) or the estimated mean (testing) "
"as a 1-dimensional tensor of size C.")
.Input(4,
"var",
"The running variance (training) or the estimated "
"variance (testing) as a 1-dimensional tensor of size C.")
.Output(0, "Y", "The output 4-dimensional tensor of the same shape as X.")
.Output(1,
"mean",
"The running mean after the spatial BN operator. Must be in-place "
"with the input mean. Should not be used for testing.")
.Output(2,
"var",
"The running variance after the spatial BN operator. Must be "
"in-place with the input var. Should not be used for testing.")
.Output(3,
"saved_mean",
"Saved mean used during training to speed up gradient "
"computation. Should not be used for testing.")
.Output(4,
"saved_var",
"Saved variance used during training to speed up "
"gradient computation. Should not be used for testing.");
REGISTER_OPERATOR_SCHEMA(LRN)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(1)
.NumOutputs(1, 2)
.Attr("size", "", AttrType::INT)
.Attr("alpha", "", AttrType::FLOAT)
.Attr("beta", "", AttrType::FLOAT)
.Attr("bias", "", AttrType::FLOAT);
REGISTER_OPERATOR_SCHEMA(GivenTensorFill)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(0, 1)
.NumOutputs(1)
.Input(0, "shape", "The shape of filled tensor")
.Output(0, "X", "The filled tensor")
.Attr("values", "", AttrType::FLOATS)
.Attr("shape", "", AttrType::INTS)
.Attr("input_as_shape", "", AttrType::INT)
.Attr("extra_shape", "", AttrType::INTS)
.AllowConsumed({ {0, 0} });
REGISTER_OPERATOR_SCHEMA(FC)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(3)
.NumOutputs(1)
.SetDoc(R"DOC(
Computes the result of passing an input vector X into a fully
connected layer with 2D weight matrix W and 1D bias vector b. That is,
the layer computes Y = X * W^T + b, where X has size (M x K),
W has size (N x K), b has size (N), and Y has size (M x N),
where M is often the batch size.
NOTE: X does not need to explicitly be a 2D vector; rather, it will be
coerced into one. For an arbitrary n-dimensional tensor
X \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}] where a_i \in N+ and k is
the axis provided, then X will be coerced into a 2-dimensional tensor with
dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default
case where axis=1, this means the X tensor will be coerced into a 2D tensor
of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size.
In this situation, we must have a_0 = M and a_1 * ... * a_{n-1} = K.
Lastly, even though b is a 1D vector of size N, it is copied/resized to
be size (M x N) implicitly and added to each vector in the batch.
Each of these dimensions must be matched correctly, or else the operator
will throw errors.
)DOC")
.Attr(
"axis",
"(int32_t) default to 1; describes the axis of the inputs; "
"defaults to one because the 0th axis most likely describes "
"the batch_size",
AttrType::INT)
.Attr(
"axis_w",
"(int32_t) default to 1; describes the axis of the weights; "
"defaults to one because the 0th axis most likely describes "
"the batch_size",
AttrType::INT)
.Input(
0,
"X",
"input tensor that's coerced into a 2D matrix of size (MxK) "
"as described above")
.Input(
1,
"W",
"2D blob of size (KxN) containing fully connected weight "
"matrix")
.Input(2, "b", "1D blob containing bias vector")
.Output(0, "Y", "2D output tensor");
REGISTER_OPERATOR_SCHEMA(Normalize)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Given a matrix, apply L2-normalization along the last dimension.
)DOC");
REGISTER_OPERATOR_SCHEMA(Scale)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Scale takes one input data (Tensor<float>) and produces one output data
(Tensor<float>) whose value is the input data tensor scaled element-wise.
)DOC")
.Attr("scale",
"(float, default 1.0) the scale to apply.",
AttrType::FLOAT);
REGISTER_OPERATOR_SCHEMA(ChannelShuffle)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(1)
.NumOutputs(1)
.Attr("kernel_shape",
"The size of the kernel along each axis",
AttrType::INTS)
.Attr("group",
"Number of channel groups",
AttrType::INT);
REGISTER_OPERATOR_SCHEMA(RecurrentNetwork)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(1, INT_MAX)
.NumOutputs(2, INT_MAX)
.SetDoc(R"DOC(
Run the input network in a recurrent fashion. This can be used to
implement fairly general recurrent neural networks (RNNs).
The operator proceeds as follows.
- First, initialize the states from the input recurrent states
- For each timestep T, apply the links (that map offsets from input/output
tensors into the inputs/outputs for the `step` network)
- Finally, alias the recurrent states to the specified output blobs.
This is a fairly special-case meta-operator, and so the implementation
is somewhat complex. It trades off generality (and frankly usability)
against performance and control (compared to e.g. TF
dynamic_rnn, Theano scan, etc.).
See the usage examples for a flavor of how to use it.
)DOC");
REGISTER_OPERATOR_SCHEMA(GRUUnit)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.NumInputs(4)
.NumOutputs(1)
.SetDoc(R"DOC(
GRUUnit computes the activations of a standard GRU,
in a sequence-length aware fashion.
Concretely, given the (fused) inputs X (TxNxD), the previous hidden
state (NxD), and the sequence lengths (N), computes the GRU
activations, avoiding computation if the input is invalid (as in, the
value at X[t][n] >= seqLengths[n]).
)DOC")
.Attr(
"drop_states",
"Bool to determine if hidden state is zeroes or passed "
"along for timesteps past the given sequence_length.",
AttrType::INT)
.Input(0, "hidden_prev", "The previous GRU hidden state.")
.Input(
1,
"gates",
"Unactivated gate outputs from forget, update, "
"and output gates, pre-activation.")
.Input(
2,
"seq_lengths",
"Array of sequence lengths. "
"len(seq_lengths) should equal batch size N.")
.Input(3, "t", "The timestep for this operation.")
.Output(0, "hidden", "The new GRU hidden state calculated by this op.");
REGISTER_OPERATOR_SCHEMA(ATen)
.SetSupportLevel(SupportType::EXPERIMENTAL)
.AllowUncheckedAttributes()
.SetDoc(R"DOC(
Experimental operator allowing ATen operations to be accessed directly from Caffe2
for quick prototyping when ONNX is missing standard versions of
an op.)DOC");
}

View file

@ -6,163 +6,14 @@
namespace ONNXIR
{
REGISTER_OPERATOR_SCHEMA(Constant)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(A constant tensor.)DOC")
.Attr("value",
.Description("A constant tensor.")
.Attr(
"value",
"The value for the elements of the output tensor.",
AttrType::TENSOR)
.Output(
0,
"output",
"Output tensor containing the same value of the provided tensor.");
REGISTER_OPERATOR_SCHEMA(RandomUniform)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Generate a tensor with random values drawn from a uniform distribution. The shape
of the tensor is specified by the `shape` argument and the range by `low` and `high`.
The data type is specified by the 'dtype' argument. The 'dtype' argument must
be one of the data types specified in the 'DataType' enum field in the
TensorProto message.
)DOC")
.Attr(
"low",
"Lower boundary of the output values.",
AttrType::FLOAT)
.Attr(
"high",
"Upper boundary of the output values.",
AttrType::FLOAT)
.Attr(
"seed",
"(Optional) Seed to the random generator, if not specified we will auto generate one.",
AttrType::FLOAT)
.Attr(
"dtype",
"The data type for the elements of the output tensor.",
AttrType::INT)
.Attr(
"shape",
"The shape of the output tensor.",
AttrType::INTS)
.Output(
0,
"output",
"Output tensor of random values drawn from uniform distribution");
REGISTER_OPERATOR_SCHEMA(RandomNormal)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Generate a tensor with random values drawn from a normal distribution. The shape
of the tensor is specified by the `shape` argument and the parameter of the normal distribution
specified by `mean` and `scale`.
The data type is specified by the 'dtype' argument. The 'dtype' argument must
be one of the data types specified in the 'DataType' enum field in the
TensorProto message.
)DOC")
.Attr(
"mean",
"The mean of the normal distribution.",
AttrType::FLOAT)
.Attr(
"scale",
"The standard deviation of the normal distribution.",
AttrType::FLOAT)
.Attr(
"seed",
"(Optional) Seed to the random generator, if not specified we will auto generate one.",
AttrType::FLOAT)
.Attr(
"dtype",
"The data type for the elements of the output tensor.",
AttrType::INT)
.Attr(
"shape",
"The shape of the output tensor.",
AttrType::INTS)
.Output(
0,
"output",
"Output tensor of random values drawn from normal distribution");
REGISTER_OPERATOR_SCHEMA(RandomUniformLike)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Generate a tensor with random values drawn from a uniform distribution. The shape
of the tensor is computed from the input argument and the range by `low` and `high`.
The data type is specified by the 'dtype' argument. The 'dtype' argument must
be one of the data types specified in the 'DataType' enum field in the
TensorProto message.
)DOC")
.Attr(
"low",
"Lower boundary of the output values.",
AttrType::FLOAT)
.Attr(
"high",
"Upper boundary of the output values.",
AttrType::FLOAT)
.Attr(
"seed",
"(Optional) Seed to the random generator, if not specified we will auto generate one.",
AttrType::FLOAT)
.Attr(
"dtype",
"(Optional) The data type for the elements of the output tensor, if not specified, we will use"
"the data type of the input tensor.",
AttrType::INT)
.Input(
0,
"input",
"Input tensor to provide shape information.")
.Output(
0,
"output",
"Output tensor of random values drawn from uniform distribution");
REGISTER_OPERATOR_SCHEMA(RandomNormalLike)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Generate a tensor with random values drawn from a normal distribution. The shape
of the tensor is computed from the input argument and the parameter of the normal distribution
specified by `mean` and `scale`.
The data type is specified by the 'dtype' argument. The 'dtype' argument must
be one of the data types specified in the 'DataType' enum field in the
TensorProto message.
)DOC")
.Attr(
"mean",
"The mean of the normal distribution.",
AttrType::FLOAT)
.Attr(
"scale",
"The standard deviation of the normal distribution.",
AttrType::FLOAT)
.Attr(
"seed",
"(Optional) Seed to the random generator, if not specified we will auto generate one.",
AttrType::FLOAT)
.Attr(
"dtype",
"(Optional) The data type for the elements of the output tensor, if not specified, we will use"
"the data type of the input tensor.",
AttrType::INT)
.Input(
0,
"input",
"Input tensor to provide shape information.")
.Output(
0,
"output",
"Output tensor of random values drawn from normal distribution");
AttrType::AttributeProto_AttributeType_TENSOR)
.Output("output",
"Output tensor containing the same value of the provided tensor.",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
}

View file

@ -1,23 +1,42 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
namespace ONNXIR {
std::function<void(OpSchema&)> BinaryLogicDocGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
Computes the `{name} than` elementwise logical operation between `left` and `right` input tensors.
The result is a tensor of type integer in which `0` means false and `1` means true.)DOC";
ReplaceAll(doc, "{name}", name);
schema.NumInputs(2);
schema.NumOutputs(1);
schema.SetDoc(doc);
schema.Input(0, "left", "Left input tensor for the logical operator.");
schema.Input(1, "right", "Right input tensor for the logical operator.");
schema.Output(0, "output", "Result tensor of type `int`, 0 means False and 1 means True.");
};
}
#define REGISTER_BINARY_COMPARISON_OPERATOR_SCHEMA(OpName) \
REGISTER_OPERATOR_SCHEMA(OpName) \
.Description("Computes the elementwise comparison `"#OpName"` between " \
"`left` and `right` input tensor. The result is a tensor of type integer " \
"in which `0` mean false and `1` mean true.") \
.Input("left", "Left input tensor for the operator.", "T1") \
.Input("right", "Right input tensor for the operator.", "T1") \
.Output("output", "Result tensor of type `int`, 0 mean False and 1 mean True.", "T2") \
.TypeConstraint("T1", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input to float tensors.") \
.TypeConstraint("T2", { "tensor(int32)" }, "Constrain output types to int tensor.");
} // namespace ONNXIR
//GREATER, LESS, EQUALS,
REGISTER_BINARY_COMPARISON_OPERATOR_SCHEMA(GT)
REGISTER_BINARY_COMPARISON_OPERATOR_SCHEMA(LE)
REGISTER_BINARY_COMPARISON_OPERATOR_SCHEMA(EQ)
#define REGISTER_BINARY_LOGIC_OPERATOR_SCHEMA(OpName) \
REGISTER_OPERATOR_SCHEMA(OpName) \
.Description("Computes the elementwise logical operation '"#OpName"' between " \
"`left` and `right` input tensor. The result is a tensor of type integer " \
"in which `0` mean false and `1` mean true.") \
.Input("left", "Left input tensor for the logical operator.", "T") \
.Input("right", "Right input tensor for the logical operator.", "T") \
.Output("output", "Result tensor of type `int`, 0 mean False and 1 mean True.", "T") \
.TypeConstraint("T", { "tensor(int32)" }, "Constrain input and output types to int tensor.");
// AND, OR, XOR
REGISTER_BINARY_LOGIC_OPERATOR_SCHEMA(And)
REGISTER_BINARY_LOGIC_OPERATOR_SCHEMA(Or)
REGISTER_BINARY_LOGIC_OPERATOR_SCHEMA(Xor)
REGISTER_OPERATOR_SCHEMA(Not)
.Description("Performs element-wise negation.")
.Input("X", "Input tensor of type bool.", "T")
.Output("Y", " Output tensor of type bool.", "T")
.TypeConstraint("T", { "tensor(int32)" }, "Constrain input and output types to int tensor.");
}
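To make the documented 0/1 semantics concrete, here is a minimal sketch of one of the comparisons (GT) over float inputs, producing the int32 output the schemas constrain to; names are illustrative:

#include <cstdint>
#include <vector>

// Elementwise greater-than: 0 means false, 1 means true, as documented above.
std::vector<int32_t> ElementwiseGreater(const std::vector<float>& left,
                                        const std::vector<float>& right)
{
    std::vector<int32_t> out(left.size());
    for (size_t i = 0; i < left.size(); ++i)
        out[i] = left[i] > right[i] ? 1 : 0;
    return out;
}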

View file

@ -1,385 +1,170 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
#include <functional>
using AttrType = ONNXIR::AttrType;
namespace ONNXIR {
const char* kBroadcastDoc = R"DOC(
If necessary the right-hand-side argument will be broadcasted to match the
shape of left-hand-side argument. When broadcasting is specified, the second
tensor can either be of size 1 (a scalar value), or having its shape as a
contiguous subset of the first tensor's shape. The starting of the mutually
equal shape is specified by the argument "axis", and if it is not set, suffix
matching is assumed. 1-dim expansion doesn't work yet.
#define REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(OpName) \
REGISTER_OPERATOR_SCHEMA(OpName) \
.Description("Elementwise "#OpName" takes one or more input data (Tensor<T>) and produces one " \
"output data (Tensor<T>) where the declared function is applied to the input " \
"tensors elementwise.") \
.Input("data_0", "First of the input tensors. Can be inplace.", "T") \
.Output("output", "Output tensor. Same dimension as inputs.", "T") \
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, \
"Constrain input and output types to float tensors.");
For example, the following tensor shapes are supported (with broadcast=1):
shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar
shape(A) = (2, 3, 4, 5), shape(B) = (5,)
shape(A) = (2, 3, 4, 5), shape(B) = (4, 5)
shape(A) = (2, 3, 4, 5), shape(B) = (3, 4), with axis=1
shape(A) = (2, 3, 4, 5), shape(B) = (2), with axis=0
Attribute `broadcast=1` needs to be passed to enable broadcasting.
)DOC";
std::function<void(OpSchema&)> MathDocGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
Performs element-wise binary {name} (with limited broadcast support).
{broadcast_doc})DOC";
ReplaceAll(doc, "{name}", name);
ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc);
schema.SetDoc(doc);
schema.Attr("broadcast",
"Pass 1 to enable broadcasting",
AttrType::INT);
schema.Attr("axis",
"If set, defines the broadcast dimensions. See doc for details.",
AttrType::INT);
schema.Input(
0,
"A",
"First operand, should share the type with the second operand.");
schema.Input(
1,
"B",
"Second operand. With broadcasting can be of smaller size than A. "
"If broadcasting is disabled it should be of the same size.");
schema.Output(0, "C", "Result, has same dimensions and type as A");
};
}
REGISTER_OPERATOR_SCHEMA(Add)
.NumInputs(2)
.NumOutputs(1)
.AllowConsumed({ {0, 0}, {1, 0} })
.FillUsing(MathDocGenerator("addition"));
REGISTER_OPERATOR_SCHEMA(Sub)
.NumInputs(2)
.NumOutputs(1)
.AllowConsumed({ {0, 0}, {1, 0} })
.FillUsing(MathDocGenerator("subtraction"));
REGISTER_OPERATOR_SCHEMA(Mul)
.NumInputs(2)
.NumOutputs(1)
.AllowConsumed({ {0, 0}, {1, 0} })
.FillUsing(MathDocGenerator("multiplication"));
REGISTER_OPERATOR_SCHEMA(Div)
.NumInputs(2)
.NumOutputs(1)
.AllowConsumed({ {0, 0}, {1, 0} })
.FillUsing(MathDocGenerator("division"));
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Add)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Sub)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Mul)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Div)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Max)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Min)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Sum)
REGISTER_ELEMENTWISE_OPERATOR_SCHEMA(Mean)
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Neg)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Neg takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the negation, y = -x, is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
.Description("Neg takes one input data (Tensor<T>) and produces one output data \
(Tensor<T>) where the negation, y = -x, is applied to \
the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Abs)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Absolute takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the absolute value, y = abs(x), is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
.Description("Absolute takes one input data (Tensor<T>) and produces one output data "
"(Tensor<T>) where the absolute is, y = abs(x), is applied to "
"the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Reciprocal)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Reciprocal takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the reciprocal, y = 1/x, is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
.Description("Reciprocal takes one input data (Tensor<T>) and produces one output data "
"(Tensor<T>) where the reciprocal is, y = 1/x, is applied to "
"the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Floor)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Floor takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the floor, y = floor(x), is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
.Description("Floor takes one input data (Tensor<T>) and produces one output data "
"(Tensor<T>) where the floor is, y = floor(x), is applied to "
"the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Ceil)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Ceil takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the ceil, y = ceil(x), is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
.Description("Ceil takes one input data (Tensor<T>) and produces one output data"
"(Tensor<T>) where the ceil is, y = ceil(x), is applied to"
"the tensor elementwise.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(Clip)
.Description("Clip operator limits the given input within an interval. "
"The interval is specified with arguments 'min' and 'max'. They default to "
"numeric_limits::lowest() and numeric_limits::max() respectively. The clipping "
"operation can be done in in-place fashion too, where the input and output blobs "
"are the same.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("min", "Minimum value, under which element is replaced by min", AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("max", "Maximum value, under which element is replaced by max", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Sqrt)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Square root takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the square root, y = x^0.5, is applied to
the tensor elementwise. If x is negative, then it will return NaN.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
REGISTER_OPERATOR_SCHEMA(Relu)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Relu takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the rectified linear function, y = max(0, x), is applied to
the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
REGISTER_OPERATOR_SCHEMA(LeakyRelu)
.NumInputs(1)
.NumOutputs(1)
.Attr("alpha",
"Coefficient of leakage",
AttrType::FLOAT)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
LeakyRelu takes input data (Tensor<T>) and an argument alpha, and produces one
output data (Tensor<T>) where the function `f(x) = alpha * x for x < 0`,
`f(x) = x for x >= 0`, is applied to the data tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
REGISTER_OPERATOR_SCHEMA(Selu)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.Attr("alpha",
"Coefficient of SELU default to 1.6732.",
AttrType::FLOAT)
.Attr("gamma",
"Coefficient of SELU default to 1.0507.",
AttrType::FLOAT)
.SetDoc(R"DOC(
Selu takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the scaled exponential linear unit function,
`y = gamma * (alpha * e^x - alpha) for x <= 0`, `f(x) = gamma * x for x > 0`,
is applied to the tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
REGISTER_OPERATOR_SCHEMA(Elu)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Elu takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x <
0`, `f(x) = x for x >= 0`, is applied to the tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Output(0, "Y", "1D input tensor");
.Description("Square root takes one input data (Tensor<T>) and produces one output "
"data Tensor<T>) where the square root is, y = x^0.5, is applied to "
"the tensor elementwise. If x is negative, then it will return NaN.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Exp)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Calculates the exponential of the given input tensor, element-wise. This
operation can be done in an in-place fashion too, by providing the same input
and output blobs.
)DOC")
.Input(0, "input", "Input tensor")
.Output(
0,
"output",
"The exponential of the input tensor computed "
"element-wise");
.Description("Calculates the exponential of the given input tensor, element-wise. "
"This operation can be done in an in-place fashion too, by providing the same "
"input and output blobs.")
.Input("input", "input tensor", "T")
.Output("output", "The exponential of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Log)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Calculates the natural log of the given input tensor, element-wise. This
operation can be done in an in-place fashion too, by providing the same input
and output blobs.
)DOC")
.Input(0, "input", "Input tensor")
.Output(
0,
"output",
"The natural log of the input tensor computed "
"element-wise");
REGISTER_OPERATOR_SCHEMA(Tanh)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Calculates the hyperbolic tangent of the given input tensor element-wise. This
operation can be done in an in-place fashion too, by providing the same input
and output blobs.
)DOC")
.Input(0, "input", "1-D input tensor")
.Output(0, "output", "The hyperbolic tangent values of the input tensor "
"computed element-wise");
.Description("Calculates the natural log of the given input tensor, element-wise. "
"This operation can be done in an in-place fashion too, by providing the same "
"input and output blobs.")
.Input("input", "input tensor", "T")
.Output("output", "The natural log of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Pow)
.NumInputs(1)
.NumOutputs(1)
.Attr("exponent",
"The exponent of the power function.",
AttrType::FLOAT)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Pow takes input data (Tensor<T>) and an argument exponent, and
produces one output data (Tensor<T>) where the function `f(x) = x^exponent`,
is applied to the data tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor of any shape")
.Output(0, "Y", "Output tensor (same size as X)");
.Description("Pow takes input data (Tensor<T>) and an argument exponent, and "
"produces one output data (Tensor<T>) where the function `f(x) = x^exponent`, "
"is applied to the data tensor elementwise.")
.Input("input", "input tensor", "T")
.Output("output", "The x^exponent value of the input tensor computed element-wise", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("exponent", "The exponent of the power function.", AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Dot)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Apply dot product between 2 tensors. Similar to numpy implementation:
https://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html
)DOC")
.Input(0, "X", "Input tensor of any shape")
.Input(1, "Y", "Input tensor of any shape")
.Output(0, "Z", "Output tensor the dot product between X and Y.");
REGISTER_OPERATOR_SCHEMA(PRelu)
.NumInputs(2)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
`f(x) = x for x >= 0`, is applied to the data tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Input(
1,
"Slope",
"Slope tensor. If `Slope` is of size 1, the value is shared"
"across different channels")
.Output(0, "Y", "Input tensor");
REGISTER_OPERATOR_SCHEMA(Sigmoid)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Sigmoid takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the
tensor elementwise.
)DOC")
.Input(0, "X", "Input tensor")
.Output(0, "Y", "Output tensor");
REGISTER_OPERATOR_SCHEMA(Max)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Element-wise max of each of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the max will be done in
place and results will be accumulated in input0. All inputs and outputs must
have the same shape and data type.
)DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.")
.Output(0, "max", "Output tensor. Same dimension as inputs.");
REGISTER_OPERATOR_SCHEMA(Min)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Element-wise min of each of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the min will be done in
place and results will be accumulated in input0. All inputs and outputs must
have the same shape and data type.
)DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.")
.Output(0, "max", "Output tensor. Same dimension as inputs.");
REGISTER_OPERATOR_SCHEMA(Sum)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Element-wise sum of each of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the sum will be done in
place and results will be accumulated in input0. All inputs and outputs must
have the same shape and data type.
)DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.")
.Output(0, "sum", "Output tensor. Same dimension as inputs.");
REGISTER_OPERATOR_SCHEMA(Softmax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
The operator computes the softmax normalized values for each layer in the batch
of the given input. The input is a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions). The output tensor has the same shape
and contains the softmax normalized values of the corresponding input.
X does not need to explicitly be a 2D vector; rather, it will be
coerced into one. For an arbitrary n-dimensional tensor
X \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and k is
the axis provided, then X will be coerced into a 2-dimensional tensor with
dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default
case where axis=1, this means the X tensor will be coerced into a 2D tensor
of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size.
In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D.
Each of these dimensions must be matched correctly, or else the operator
will throw errors.
)DOC")
.Attr("axis",
"(int) default to 1; describes the axis of the inputs when coerced "
"to 2D; defaults to one because the 0th axis most likely describes "
"the batch_size",
AttrType::INT)
.Input(0, "input",
"The input tensor that's coerced into a 2D matrix of size (NxD) "
"as described above.")
.Output(0, "output", "The softmax normalized output values with the same "
"shape as input tensor.");
.Description("Apply dot product between 2 tensors. Similar to numpy implementation: "
"https://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html")
.Input("X", "Input tensor of any shape", "T")
.Input("Y", "Input tensor of any shape", "T")
.Output("output", "Output tensor the dot product between X and Y.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Gemm)
.Description("(General Matrix multiplication: "
"https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3 "
"Compute Y = alpha * A * B + beta * C, where input tensor A has dimension (M X K), "
"input tensor B has dimension (K X N), input tensor C and output tensor Y have "
"dimension (M X N). Input tensor C can be used inplace as the output tensor Y. "
"If attribute broadcast is non-zero, input tensor C will be broadcasted to match the "
"dimension requirement. If A can be transposed before doing the computation if "
"attribute transA is non-zero, same for B and transB. ")
.Input("A", "Input tensor A", "T")
.Input("B", "Input tensor B", "T")
.Input("C", "Input tensor C, can be inplace.", "T")
.Output("Y", "Output tensor.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("transA",
"Whether A should be transposed",
AttrType::AttributeProto_AttributeType_INT)
.Attr("transB",
"Whether B should be transposed",
AttrType::AttributeProto_AttributeType_INT)
.Attr("broadcast",
"Whether C should be broadcasted",
AttrType::AttributeProto_AttributeType_INT)
.Attr("alpha",
"Scalar multiplier for the product of input tensors A * B",
AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("beta",
"Scalar multiplier for input tensor C",
AttrType::AttributeProto_AttributeType_FLOAT);
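The Gemm contract above reduces to the textbook triple loop; a reference sketch with transA/transB/broadcast omitted and row-major storage assumed:

#include <vector>

// Y = alpha * A * B + beta * C, with A (M x K), B (K x N), C and Y (M x N).
void GemmReference(const std::vector<float>& A, const std::vector<float>& B,
                   const std::vector<float>& C, std::vector<float>& Y,
                   int M, int K, int N, float alpha, float beta)
{
    for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n)
        {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc += A[m * K + k] * B[k * N + n];
            Y[m * N + n] = alpha * acc + beta * C[m * N + n];
        }
}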
}

View file

@ -1,309 +1,514 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
namespace ONNXIR {
std::function<void(OpSchema&)> AveragePoolOpSchemaGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
{name} consumes an input tensor X and applies average pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
Average pooling consists of averaging all values of a subset of the
input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing.)DOC";
ReplaceAll(doc, "{name}", name);
schema.SetDoc(doc);
schema.NumInputs(1);
schema.NumOutputs(1);
schema.Attr("kernel_shape",
"The size of the kernel along each axis.",
AttrType::INTS);
schema.Attr("strides",
"Stride along each axis.",
AttrType::INTS);
schema.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::INTS);
schema.Input(0,
"X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.");
schema.Output(0,
"Y",
"Output data tensor from average pooling across the input "
"tensor. Dimensions will vary based on various kernel, stride, and pad "
"sizes.");
};
}
REGISTER_OPERATOR_SCHEMA(AveragePool)
.FillUsing(AveragePoolOpSchemaGenerator("AveragePool"));
std::function<void(OpSchema&)> MaxPoolOpSchemaGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
{name} consumes an input tensor X and applies max pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
Max pooling consists of taking the maximum of all values of a subset of the
input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing.)DOC";
ReplaceAll(doc, "{name}", name);
schema.SetDoc(doc);
schema.NumInputs(1);
schema.NumOutputs(1);
schema.Attr("kernel_shape",
"The size of the kernel along each axis.",
AttrType::INTS);
schema.Attr("strides",
"Stride along each axis.",
AttrType::INTS);
schema.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::INTS);
schema.Attr("dilations",
"Dilaton along each axis, 1 mean no dilation.",
AttrType::INTS);
schema.Input(0,
"X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.");
schema.Output(0,
"Y",
"Output data tensor from max pooling across the input "
"tensor. Dimensions will vary based on various kernel, stride, and pad "
"sizes.");
};
}
REGISTER_OPERATOR_SCHEMA(MaxPool)
.FillUsing(MaxPoolOpSchemaGenerator("MaxPool"));
std::function<void(OpSchema&)> ConvOpSchemaGenerator(const char* filter_desc) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
The convolution operator consumes an input tensor and {filter_desc}, and
computes the output.)DOC";
ReplaceAll(doc, "{filter_desc}", filter_desc);
schema.SetDoc(doc);
schema.NumInputs(2, 3);
schema.NumOutputs(1);
schema.Input(0,
"X",
"Input data tensor from previous layer; has size (N x C x H x W)"
", where N is the batch size, C is the number of channels, and"
" H and W are the height and width. Note that this is for the 2D image."
"Otherwise the size is (N x D1 x D2 ... x Dn)");
schema.Input(1,
"filter",
"The filter blob that will be used in the convolutions; "
"has size (M x C x kH x kW), where C is the number of channels, "
"and kH and kW are the height and width of the kernel.");
schema.Output(0,
"Y",
"Output data tensor that contains the result of the convolution. The "
"output dimensions are functions of the kernel size, stride size, "
"and pad lengths.");
schema.Attr("kernel_shape",
"The shape of the convolution kernel.",
AttrType::INTS);
schema.Attr("dilations",
"dilation value along each axis of the filter.",
AttrType::INTS);
schema.Attr("strides",
"stride along each axis.",
AttrType::INTS);
schema.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::INTS);
schema.Attr("group",
"number of groups input channels and output channels are divided into",
AttrType::INT);
};
}
REGISTER_OPERATOR_SCHEMA(FC)
.Description("Computes the result of passing an input vector X into a fully"
"connected layer with 2D weight matrix W and 1D bias vector b.That is, "
"the layer computes Y = X * W^T + b, where X has size(M x K), "
"W has size(N x K), b has size(N), and Y has size(M x N), "
"where M is often the batch size.")
.Input("X", "input tensor that's coerced into a 2D matrix of size (MxK) ", "T")
.Input("W", "A tensor that is coerced into a 2D blob of size (KxN) containing fully connected weight matrix", "T")
.Input("b", "1D blob containing bias vector", "T")
.Output("Y", "output tensor", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and output types to float tensors.")
.Attr("axis",
"(int32_t) default to 1; describes the axis of the inputs; "
"defaults to one because the 0th axis most likely describes the batch_size",
AttrType::AttributeProto_AttributeType_INT, int64_t(1))
.Attr("axis_w",
"(int32_t) default to 1; describes the axis of the weight matrix W; "
"defaults to one because the 0th axis most likely describes the batch_size",
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Conv)
.FillUsing(ConvOpSchemaGenerator("a filter"));
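The doc above says the output dimensions are functions of the kernel size, stride, pads, and dilations; the commonly used arithmetic is sketched below (an assumption of the usual convention, not text from this file):

// Per-axis output size with explicit begin/end padding and dilation.
int ConvOutputDim(int inDim, int kernel, int stride,
                  int padBegin, int padEnd, int dilation)
{
    int effectiveKernel = dilation * (kernel - 1) + 1;
    return (inDim + padBegin + padEnd - effectiveKernel) / stride + 1;
}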
std::function<void(OpSchema&)> ConvTransposeOpSchemaGenerator(const char* filter_desc) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
The convolution transpose operator consumes an input tensor and {filter_desc},
and computes the output.)DOC";
ReplaceAll(doc, "{filter_desc}", filter_desc);
schema.SetDoc(doc);
schema.NumInputs(2);
schema.NumOutputs(1);
schema.Input(0,
"X",
.Description("The convolution operator consumes an input tensor and a filter, and"
"computes the output.")
.Input("X",
"Input data tensor from previous layer; has size (N x C x H x W)"
", where N is the batch size, C is the number of channels, and"
" H and W are the height and width. Note that this is for the 2D image."
"Otherwise the size is (N x D1 x D2 ... x Dn)");
schema.Input(1,
"filter",
"The filter blob that will be used in the convolutions; "
"has size (M x C x kH x kW), where C is the number of channels, "
"and kH and kW are the height and width of the kernel.");
schema.Output(0,
"Y",
"Otherwise the size is (N x D1 x D2 ... x Dn)",
"T")
.Input("weights",
"The weight tensor that will be used in the convolutions; has size (M x C x kH x kW), "
"where C is the number of channels, and kH and kW are the height and width of the kernel, "
"and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be "
"(M x C x k1 x k2 x ... x kn), where is the dimension of the kernel",
"T")
.Input("bias",
"Optional 1D bias to be added to the convolution, has size of M.",
"T")
.Output("Y",
"Output data tensor that contains the result of the convolution. The "
"output dimensions are functions of the kernel size, stride size, "
"and pad lengths.");
schema.Attr("kernel_shape",
"and pad lengths.",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("auto_pad",
"auto_pad must be either SAME_UPPER, SAME_LOWER or VALID. Where SAME_UPPER "
"or SAME_LOWER mean pad the input so that the ouput size match the input. "
"In case of odd number add the extra padding at the end for SAME_UPPER and "
"at the begining for SAME_LOWER. VALID mean no padding, therefore, read the "
"pixel values from the pads attribute.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_shape",
"The shape of the convolution kernel.",
AttrType::INTS);
schema.Attr("output_shape",
"The shape of the output.",
AttrType::INTS);
schema.Attr("dilations",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("dilations",
"dilation value along each axis of the filter.",
AttrType::INTS);
schema.Attr("strides",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("strides",
"stride along each axis.",
AttrType::INTS);
schema.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::INTS);
};
}
AttrType::AttributeProto_AttributeType_INTS)
.Attr("pads",
"Padding for lower and upper side along each axis, it can take any value greater "
"than or equal to 0. The value represent the number of pixels added to the lower "
"and upper part of the corresponding axis. So `pads` will have two values per axis, "
"first value corresponding to the number of pixels added to the begining of the "
"axis and the second value corresponding to the number of pixels add at the end "
"of the axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("group",
"number of groups input channels and output channels are divided into",
AttrType::AttributeProto_AttributeType_INT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(ConvTranspose)
.FillUsing(ConvTransposeOpSchemaGenerator("a filter"));
std::function<void(OpSchema&)> GlobalPoolingOpSchemaGenerator(const char* op_type, const char* op) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
Global{op_type} consumes an input tensor X and applies {op} pooling across the
the values in the same channel. This is equivalent to {op_type} with kernel size
equal to the spatial dimension of input tensor.)DOC";
ReplaceAll(doc, "{op_type}", op_type);
ReplaceAll(doc, "{op}", op);
schema.SetDoc(doc);
schema.NumInputs(1);
schema.NumOutputs(1);
schema.Input(0,
"X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.");
schema.Output(0,
"Y",
"Output data tensor from pooling across the input "
"tensor. Dimensions will be N x C x 1 x 1");
schema.SetDoc(doc);
};
}
REGISTER_OPERATOR_SCHEMA(GlobalAveragePool)
.FillUsing(GlobalPoolingOpSchemaGenerator("AveragePool", "average"));
REGISTER_OPERATOR_SCHEMA(GlobalMaxPool)
.FillUsing(GlobalPoolingOpSchemaGenerator("MaxPool", "max"));
REGISTER_OPERATOR_SCHEMA(BatchNormalization)
.NumInputs(5)
.NumOutputs({ 1, 5 })
.EnforceConsumed({ {3, 1}, {4, 2} })
.SetDoc(R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:
Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)
)DOC")
.Attr("spatial",
"Compute the mean and variance across all spatial elements or per feature.",
AttrType::INT)
.Attr("is_test",
"If set to nonzero, run spatial batch normalization in test mode.",
AttrType::INT)
.Attr("epsilon",
"The epsilon value to use to avoid division by zero.",
AttrType::FLOAT)
.Attr("momentum",
"Factor used in computing the running mean and variance."
"e.g., running_mean = running_mean * momentum + mean * (1 - momentum)",
AttrType::FLOAT)
.Input(0,
"X",
"The input 4-dimensional tensor of shape NCHW or NHWC depending "
"on the order parameter.")
.Input(1,
"scale",
"The scale as a 1-dimensional tensor of size C to be applied to the "
"output.")
.Input(2,
"bias",
"The bias as a 1-dimensional tensor of size C to be applied to the "
"output.")
.Input(3,
"mean",
"The running mean (training) or the estimated mean (testing) "
"as a 1-dimensional tensor of size C.")
.Input(4,
"var",
"The running variance (training) or the estimated "
"variance (testing) as a 1-dimensional tensor of size C.")
.Output(0, "Y", "The output 4-dimensional tensor of the same shape as X.")
.Output(1,
"mean",
"The running mean after the BatchNormalization operator. Must be in-place "
"with the input mean. Should not be used for testing.")
.Output(2,
"var",
"The running variance after the BatchNormalization operator. Must be "
"in-place with the input var. Should not be used for testing.")
.Output(3,
"saved_mean",
"Saved mean used during training to speed up gradient "
"computation. Should not be used for testing.")
.Output(4,
"saved_var",
"Saved variance used during training to speed up "
"gradient computation. Should not be used for testing.");
.Description("The convolution transpose operator consumes an input tensor and a filter,"
"and computes the output.")
.Input("X",
"Input data tensor from previous layer; has size (N x C x H x W)"
", where N is the batch size, C is the number of channels, and"
" H and W are the height and width. Note that this is for the 2D image."
"Otherwise the size is (N x D1 x D2 ... x Dn)",
"T")
.Input("weights",
"The weight tensor that will be used in the convolutions; has size (C x M x kH x kW), "
"where C is the number of channels, and kH and kW are the height and width of the kernel, "
"and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be "
"(M x C x k1 x k2 x ... x kn), where is the dimension of the kernel",
"T")
.Input("bias",
"Optional 1D bias to be added to the convolution, has size of C.",
"T")
.Output("Y",
"Output data tensor that contains the result of the convolution. The "
"output dimensions are functions of the kernel size, stride size, "
"and pad lengths.",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("auto_pad",
"auto_pad must be either SAME_UPPER, SAME_LOWER or VALID. Where SAME_UPPER "
"or SAME_LOWER mean pad the input so that the ouput size match the input. "
"In case of odd number add the extra padding at the end for SAME_UPPER and "
"at the begining for SAME_LOWER. VALID mean no padding, therefore, read the "
"pixel values from the pads attribute.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_shape",
"The shape of the convolution kernel.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("output_shape",
"The shape of the output.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("dilations",
"dilation value along each axis of the filter.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("strides",
"stride along each axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("pads",
"Padding for lower and upper side along each axis, it can take any value greater "
"than or equal to 0. The value represent the number of pixels added to the lower "
"and upper part of the corresponding axis. So `pads` will have two values per axis, "
"first value corresponding to the number of pixels added to the begining of the "
"axis and the second value corresponding to the number of pixels add at the end "
"of the axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("group",
"number of groups input channels and output channels are divided into",
AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(Dropout)
.NumInputs(1)
.NumOutputs({ 1,2 })
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Dropout takes one input data (Tensor<float>) and produces two Tensor outputs,
output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in
test mode or not, the output Y will either be a random dropout, or a simple
copy of the input. Note that our implementation of Dropout does scaling in
the training phase, so during testing nothing needs to be done.
)DOC")
.Description("Dropout takes one input data (Tensor<float>) and produces two Tensor outputs, "
"output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in "
"test mode or not, the output Y will either be a random dropout, or a simple "
"copy of the input. Note that our implementation of Dropout does scaling in "
"the training phase, so during testing nothing needs to be done.")
.Input("data", "The input data as Tensor.", "T")
.Output("output", "The output.", "T")
.Output("mask",
"The output mask. If is_test is nonzero, this output is not filled.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("ratio",
"(float, default 0.5) the ratio of random dropout",
AttrType::FLOAT)
AttrType::AttributeProto_AttributeType_FLOAT, float(0.5))
.Attr("is_test",
"(int, default 0) if nonzero, run dropout in test mode where "
"the output is simply Y = X.",
AttrType::INT)
.Input(0, "data", "The input data as Tensor.")
.Output(0, "output", "The output.")
.Output(1, "mask",
"The output mask. If is_test is nonzero, this output is not filled.");
AttrType::AttributeProto_AttributeType_INT, int64_t(0));
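The note that Dropout "does scaling in the training phase, so during testing nothing needs to be done" corresponds to inverted dropout; a sketch with illustrative names (mask is assumed pre-sized to match data):

#include <random>
#include <vector>

// Train-time pass: dropped values become 0, kept values are scaled by
// 1 / (1 - ratio), so the test-time pass is the identity.
void DropoutTrain(std::vector<float>& data, std::vector<bool>& mask,
                  float ratio, std::mt19937& rng)
{
    std::bernoulli_distribution drop(ratio);
    for (size_t i = 0; i < data.size(); ++i)
    {
        bool dropped = drop(rng);
        mask[i] = !dropped;
        data[i] = dropped ? 0.0f : data[i] / (1.0f - ratio);
    }
}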
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(AveragePool)
.Description("AveragePool consumes an input tensor X and applies average pooling across the"
"the tensor according to kernel sizes, stride sizes, and pad lengths."
"Average pooling consisting of averaging all values of a subset of the"
"input tensor according to the kernel size and downsampling the"
"data into the output tensor Y for further processing.")
.Input("X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.",
"T")
.Output("Y",
"Output data tensor from average pooling across the input tensor. "
"Dimensions will vary based on various kernel, stride, and pad sizes.")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("auto_pad",
"auto_pad must be either SAME_UPPER, SAME_LOWER or VALID. Where SAME_UPPER "
"or SAME_LOWER mean pad the input so that the ouput size match the input. "
"In case of odd number add the extra padding at the end for SAME_UPPER and "
"at the begining for SAME_LOWER. VALID mean no padding, therefore, read the "
"pixel values from the pads attribute.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_shape",
"The size of the kernel along each axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("strides",
"Stride along each axis.",
AttrType::AttributeProto_AttributeType_INTS);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(GlobalAveragePool)
.Description("GlobalAveragePool consumes an input tensor X and applies average "
"pooling across the values in the same channel. This is equivalent to "
"AveragePool with kernel size equal to the spatial dimension of input tensor.")
.Input("X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.",
"T")
.Output("Y",
"Output data tensor from pooling across the input tensor. Dimensions will "
"be N x C x 1 x 1")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(MaxPool)
.Description("MaxPool consumes an input tensor X and applies max pooling across the"
"the tensor according to kernel sizes, stride sizes, and pad lengths."
"Average pooling consisting of averaging all values of a subset of the"
"input tensor according to the kernel size and downsampling the"
"data into the output tensor Y for further processing.")
.Input("X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.",
"T")
.Output("Y",
"Output data tensor from max pooling across the input tensor. "
"Dimensions will vary based on various kernel, stride, and pad sizes.",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("auto_pad",
"auto_pad must be either SAME_UPPER, SAME_LOWER or VALID. Where SAME_UPPER "
"or SAME_LOWER mean pad the input so that the ouput size match the input. "
"In case of odd number add the extra padding at the end for SAME_UPPER and "
"at the begining for SAME_LOWER. VALID mean no padding, therefore, read the "
"pixel values from the pads attribute.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_shape",
"The size of the kernel along each axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("strides",
"Stride along each axis.",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("pads",
"Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("dilations",
"Dilaton along each axis, 1 mean no dilation.",
AttrType::AttributeProto_AttributeType_INTS);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(GlobalMaxPool)
.Description("GlobalMaxPool consumes an input tensor X and applies max pooling "
"across the values in the same channel. This is equivalent to MaxPool "
"with kernel size equal to the spatial dimension of input tensor.")
.Input("X",
"Input data tensor from the previous operator; dimensions for image case "
"are (N x C x H x W), where N is the batch size, C is the number of channels, "
"and H and W are the height and the width of the data. For non image case, the "
"dimension are in the form of (N x D1 x D2 ... Dn), where N is the batch size.",
"T")
.Output("Y",
"Output data tensor from pooling across the input tensor. Dimensions will "
"be N x C x 1 x 1")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(BatchNormalization)
.Description("Carries out batch normalization as described in the paper"
"https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,"
"there are multiple cases for the number of outputs, which we list below:"
""
"Output case #1: Y, mean, var, saved_mean, saved_var (training mode)"
"Output case #2: Y (test mode)")
.Input("X",
"The input 4-dimensional tensor of shape NCHW or NHWC depending "
"on the order parameter.",
"T")
.Input("scale",
"The scale as a 1-dimensional tensor of size C to be applied to the "
"output.",
"T")
.Input("bias",
"The bias as a 1-dimensional tensor of size C to be applied to the "
"output.",
"T")
.Input("mean",
"The running mean (training) or the estimated mean (testing) "
"as a 1-dimensional tensor of size C.",
"T")
.Input("var",
"The running variance (training) or the estimated "
"variance (testing) as a 1-dimensional tensor of size C.",
"T")
.Output("Y", "The output 4-dimensional tensor of the same shape as X.",
"T")
.Output("mean",
"The running mean after the BatchNormalization operator. Must be in-place "
"with the input mean. Should not be used for testing.",
"T")
.Output("var",
"The running variance after the BatchNormalization operator. Must be "
"in-place with the input var. Should not be used for testing.",
"T")
.Output("saved_mean",
"Saved mean used during training to speed up gradient "
"computation. Should not be used for testing.",
"T")
.Output("saved_var",
"Saved variance used during training to speed up "
"gradient computation. Should not be used for testing.",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("epsilon",
"The epsilon value to use to avoid division by zero.",
AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("is_test",
"If set to nonzero, run spatial batch normalization in test mode.",
AttrType::AttributeProto_AttributeType_INT)
.Attr("momentum",
"Factor used in computing the running mean and variance."
"e.g., running_mean = running_mean * momentum + mean * (1 - momentum)",
AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("spatial",
"Compute the mean and variance across all spatial elements or per feature.",
AttrType::AttributeProto_AttributeType_INT);
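In test mode the five inputs combine per channel as in the cited paper; a sketch of the math only (layout handling omitted):

#include <cmath>

// y = scale[c] * (x - mean[c]) / sqrt(var[c] + epsilon) + bias[c]
float BatchNormValue(float x, float mean, float var,
                     float scale, float bias, float epsilon)
{
    return scale * (x - mean) / std::sqrt(var + epsilon) + bias;
}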
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(RoIPool)
.Description("Carries out ROI Pooling for Faster-RCNN. Depending on the mode, "
"there are multiple output cases: "
"Output case #1: Y, argmaxes (train mode)"
"Output case #2: Y (test mode)")
.Input("X", "The input 4-D tensor of data. Only NCHW order is currently supported.", "T")
.Input("rois", "RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of "
"shape (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].", "T")
.Output("Y", "RoI pooled output 4-D tensor of shape "
"(num_rois, channels, pooled_h, pooled_w).", "T")
.Output("argmaxes", "Argmaxes corresponding to indices in X used for gradient "
"computation. Only output if arg “is_test” is false.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("is_test", "If set, run in test mode and skip computation of argmaxes (used "
"for gradient computation). Only one output tensor is produced. (Default: false).",
AttrType::AttributeProto_AttributeType_INT, int64_t(0))
.Attr("spatial_scale", "Multiplicative spatial scale factor to translate ROI "
"coordinates from their input scale to the scale used when pooling (Default: 1.0).",
AttrType::AttributeProto_AttributeType_FLOAT, float(1.0))
.Attr("pooled_h", "The pooled output height (Default: 1).",
AttrType::AttributeProto_AttributeType_FLOAT, float(1.0))
.Attr("pooled_w", "The pooled output width (Default: 1).",
AttrType::AttributeProto_AttributeType_FLOAT, float(1.0));
REGISTER_OPERATOR_SCHEMA(LpPool)
.Description("LpPool consumes an input blob X and applies L-p pooling across the "
"blob according to kernel sizes, stride sizes, and pad lengths defined by the "
"ConvPoolOpBase operator. L-p pooling consisting of taking the L-p norm of a "
"subset of the input tensor according to the kernel size and downsampling the "
"data into the output blob Y for further processing.")
.Input("X", "X Input data tensor from the previous operator; dimensions depend on "
"whether the NCHW or NHWC operators are being used. For example, in the former, "
"the input has size (N x C x H x W), where N is the batch size, C is the number "
"of channels, and H and W are the height and the width of the data. The "
"corresponding permutation of dimensions is used in the latter case.", "T")
.Output("Y", "Y Output data tensor from L-p pooling across the input tensor. "
"Dimensions will vary based on various kernel, stride, and pad sizes.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("auto_pad",
"auto_pad must be either SAME_UPPER, SAME_LOWER or VALID. Where SAME_UPPER "
"or SAME_LOWER mean pad the input so that the ouput size match the input. "
"In case of odd number add the extra padding at the end for SAME_UPPER and "
"at the begining for SAME_LOWER. VALID mean no padding, therefore, read the "
"pixel values from the pads attribute.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_shape", "The size of the kernel along each axis.", AttrType::AttributeProto_AttributeType_INTS)
.Attr("strides", "Stride along each axis.", AttrType::AttributeProto_AttributeType_INTS)
.Attr("pads", "Padding along each axis, can take the value 0 (False) or non 0 (True)",
AttrType::AttributeProto_AttributeType_INTS)
.Attr("p", "Value of p, default 2.0.", AttrType::AttributeProto_AttributeType_FLOAT, float(2.0));
REGISTER_OPERATOR_SCHEMA(GlobalLpPool)
.Description("GlobalLpPool consumes an input tensor X and applies lp-pool across the "
"values in the same channel. This is equivalent to LpPool with kernel size equal "
"to the spatial dimension of input tensor.")
.Input("X", "X Input data tensor from the previous operator; dimensions depend on "
"whether the NCHW or NHWC operators are being used. For example, in the former, "
"the input has size (N x C x H x W), where N is the batch size, C is the number "
"of channels, and H and W are the height and the width of the data. The "
"corresponding permutation of dimensions is used in the latter case.", "T")
.Output("Y", "Y Output data tensor from L-p pooling across the input tensor. Dimensions will "
"vary based on various kernel, stride, and pad sizes.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and output types to float tensors.")
.Attr("p", "Value of p, default 2.0.", AttrType::AttributeProto_AttributeType_FLOAT, float(2.0));
REGISTER_OPERATOR_SCHEMA(LRN)
.Description("Perform local response normalization. "
"NOTE: Only supports Caffe across channel mode. ")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and output "
" types to float tensors.")
.Attr("size", "[default 5]: the number of channels to sum over (for cross "
"channel LRN) or the side length of the square region to sum over (for within "
"channel LRN)", AttrType::AttributeProto_AttributeType_INT, int64_t(5))
.Attr("alpha", "Scalar scaling factor. Default is 0.0001", AttrType::AttributeProto_AttributeType_FLOAT, float(0.0001))
.Attr("beta", "Scalar exponent in the LRN. Default is 0.5.", AttrType::AttributeProto_AttributeType_FLOAT, float(0.5))
.Attr("bias", "An offset (must be positive to avoid dividing by 0). Defaults to 1.0.",
AttrType::AttributeProto_AttributeType_FLOAT, float(1.0));
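// Sketch of the cross-channel computation, assuming a window of `size` channels
// centered at c (Caffe convention):
//   Y[n][c][h][w] = X[n][c][h][w] /
//       pow(bias + (alpha / size) * sum over c' in the window of X[n][c'][h][w]^2, beta)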
REGISTER_OPERATOR_SCHEMA(MVN)
.Description("Perform mean variance normalization.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and output "
"types to float tensors.")
.Attr("across_channels", "If true, mean and variance are computed across channels. "
"Default is false.", AttrType::AttributeProto_AttributeType_INT, int64_t(0))
.Attr("normalize_variance", "If false, normalize the mean only. Default is true.",
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
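// Sketch of the computation: with normalize_variance == 1 (default),
//   Y = (X - mean(X)) / sqrt(var(X));
// with normalize_variance == 0, only the mean is subtracted: Y = X - mean(X).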
REGISTER_OPERATOR_SCHEMA(L2Normalization)
.Description("Perform L2 normalization Divide each element by the square root of the "
"sum of squares of all elements in the input tensor.")
.Input("input", "Input tensor of any shape", "T")
.Output("output", "Output tensor of same shape and type as input X.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(float)" }, "Constrain input and output "
"types to float tensors.")
.Attr("Axis", "Axis along which to perform normalization.", AttrType::AttributeProto_AttributeType_INT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Embedding)
.Description("Turns positive integers (indexes) into dense vectors of fixed size. "
"eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] "
"TODO: Omits use of CoreML bias parameter.")
.Input("input", "1-D tensor of integers representing indices in the embedding "
"dictionary with length [N] and values [0, input_dim -1]", "T1")
.Output("output", "Output tensor of computed features [N, O].", "T2")
.TypeConstraint("T1", { "tensor(uint64)" }, "Constrain input types to ints.")
.TypeConstraint("T2", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain output types to float tensors.")
.Attr("input_dim", "Size of the input vocabulary.", AttrType::AttributeProto_AttributeType_INT)
.Attr("output_dim", "Dimension of the embedding output vectors.", AttrType::AttributeProto_AttributeType_INT)
.Attr("weights", "2-D tensor of weights [O,I]", AttrType::AttributeProto_AttributeType_FLOATS);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(ImageScaler)
.Description("Alteration of image by scaling its individual values. "
"NOTE: The current definition assumes that the bias values are stored in the "
"same ordering as the image pixel format.")
.Input("input", "Input tensor of shape [N,C,H,W]", "T")
.Output("output", "Result, has same shape and type as X", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and "
"output types to float tensors.")
.Attr("bias", "Bias values for each channel, of shape [C]", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("scale", "Scalar channel factor, elementwise mutliplied into every value in [C,H,W]",
AttrType::AttributeProto_AttributeType_FLOAT);
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(Upsample)
.Description("Scale up spatial dimensions. Use interpolation to fill in values")
.Input("input", "Input tensor of shape [N,C,H,W]", "T")
.Output("output", "Result, has same shape and type as X", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and "
"output types to float tensors.")
.Attr("mode", "enum {'NN', 'BILINEAR' }, Nearest neighbor or bilinear upsampling.",
AttrType::AttributeProto_AttributeType_STRING)
.Attr("width_scale", "Scale along width dimension", AttrType::AttributeProto_AttributeType_INT)
.Attr("height_scale", "Scale along height dimension", AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(Crop)
.Description("Crop and image to the specified spatial dimensions. If scale is given,"
"then optionally start the crop offset by the left/top border amounts. "
"If scale is not provided, crop the borders as provided.")
.Input("input", "Input tensor of shape [N,C,H,W]", "T")
.Output("output", "Result, has same type as X, with H and W dimensions reduced.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, "Constrain input and "
"output types to float tensors.")
.Attr("border", "A 1-D tensor of values (leftBorder, topBorder, rightBorder, bottomBorder)",
AttrType::AttributeProto_AttributeType_INT)
.Attr("scale", "A 1-D tensor of values (height, width)", AttrType::AttributeProto_AttributeType_INT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Pad)
.Description("Given data tensor, paddings, mode, and value. "
"Example: Insert 0 paddings to the beginning of the second dimension. "
"data = [ [1.0, 1.2], [2.3, 3.4], [4.5, 5.7], ] paddings = [0, 0, 2, 0] "
"output = [ [ [0.0, 0.0, 1.0, 1.2], [0.0, 0.0, 2.3, 3.4], [0.0, 0.0, 4.5, 5.7] ] ]")
.Input("data", "Input tensor.", "T")
.Output("output", "Tensor after padding.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("paddings",
"List of integers indicate the padding sizes, paddings's length "
"should be the double of input's dimension. "
"The order should be axis_0_begin, axis_0_end, axis_1_begin, ..., "
"axis_n_begin, axis_n_end, n is input's dimension.",
AttrType::AttributeProto_AttributeType_INTS, int64_t(1))
.Attr("mode",
"Three modes: constant(default), reflect, edge",
AttrType::AttributeProto_AttributeType_STRING, std::string("constant"))
.Attr("value",
"One float, indicates the value to be filled, default is 0",
AttrType::AttributeProto_AttributeType_FLOAT, float(0));
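// Shape check for the example above: data has shape [3, 2], and
// paddings = [0, 0, 2, 0] reads as (axis_0_begin = 0, axis_0_end = 0,
// axis_1_begin = 2, axis_1_end = 0), so the output has shape [3, 2 + 2] = [3, 4].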
// Taken from RS4
REGISTER_OPERATOR_SCHEMA(MeanSubtraction)
.Description("Subtracts the provided mean image from the input image.")
.Input("input", "Input tensor of shape [N,C,H,W]", "T")
.Output("output", "Result, has same shape and type as X", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("image", "Image tensor stored as a sequence of floats [C,H,W].", AttrType::AttributeProto_AttributeType_TENSOR);
REGISTER_OPERATOR_SCHEMA(Flatten)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Flattens the input tensor into a 2D matrix, keeping the first dimension
unchanged.
)DOC")
.Input(0, "input", "A tensor of rank >= 2.")
.Output(
0,
"output",
"A tensor of rank 2 with the contents of the input tensor, "
"with first dimension equal first dimension of input, and remaining "
"input dimensions flatenned into the inner dimension of the output.");
}

View file

@ -1,91 +1,66 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
#include <functional>
namespace ONNXIR {
std::function<void(OpSchema&)> ReduceDocGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
Computes the {name} of the input tensor's elements along the provided axes. The resulting
tensor has the same shape as the input if keepdims equals 1. If keepdims equals 0, then
the resulting tensor has the reduced dimensions pruned.
#define REGISTER_REDUCE_OPERATOR_SCHEMA(OpName) \
REGISTER_OPERATOR_SCHEMA(OpName) \
.Description("Computes the "#OpName" of the input tensor's element along the provided axes. " \
"The resulted tensor has the same shape as the input if keepdims equal 1. If keepdims " \
"equal 0, then the resulted tensor have the reduced dimension pruned. " \
\
"The above behavior is similar to numpy, with the exception that numpy default keepdims " \
"to False instead of True.") \
.Input("input", "Input tensor to be reduced.", "T") \
.Output("output", "Output tensor.", "T") \
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" }, \
"Constrain input and output types to float tensors.") \
.Attr("axis", "A list of axes to reduce into.", AttrType::AttributeProto_AttributeType_INTS) \
.Attr("keepdims", "Keep the reduced dimension or not, default 1 mean keep reduced dimension.", \
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
// Taken from ONNX
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceSum)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceMean)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceProd)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceMax)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceMin)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceLogSumExp)
// Taken from RS4
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceLogSum)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceSumSquare)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceL1)
REGISTER_REDUCE_OPERATOR_SCHEMA(ReduceL2)
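// Worked example of the keepdims semantics shared by the macro above:
// ReduceSum on an input of shape [2, 3] with axis = [1] yields shape [2, 1]
// when keepdims == 1 (the default) and shape [2] when keepdims == 0.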
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Argmax)
.Description("Computes the indices of the max elements of the input tensor's element "
"along the provided axes. The resulted tensor has the same shape as the input if "
"keepdims equal 1. If keepdims equal 0, then the resulted tensor have the reduced "
"dimension pruned. The type of the output tensor is integer.")
.Input("input", "Input tensor.", "T1")
.Output("output", "Output tensor.", "T2")
.TypeConstraint("T1", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.TypeConstraint("T2", { "tensor(int64)" }, "Constrain output types to int64 tensors.")
.Attr("axis", "The axis in which to compute the arg indices.", AttrType::AttributeProto_AttributeType_INT)
.Attr("keepdims", "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Argmin)
.Description("Computes the indices of the min elements of the input tensor's element "
"along the provided axes. The resulted tensor has the same shape as the input if "
"keepdims equal 1. If keepdims equal 0, then the resulted tensor have the reduced "
"dimension pruned. The type of the output tensor is integer.")
.Input("input", "Input tensor.", "T1")
.Output("output", "Output tensor.", "T2")
.TypeConstraint("T1", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input types to float tensors.")
.TypeConstraint("T2", { "tensor(int64)" }, "Constrain output types to int64 tensors.")
.Attr("axis", "The axis in which to compute the arg indices.", AttrType::AttributeProto_AttributeType_INT)
.Attr("keepdims", "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
AttrType::AttributeProto_AttributeType_INT, int64_t(1));
The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
False instead of True.)DOC";
ReplaceAll(doc, "{name}", name);
schema.SetDoc(doc);
schema.Attr("axes",
"A list of integers, along which to reduce max.",
AttrType::INTS);
schema.Attr("keepdims",
"Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
AttrType::INT);
schema.Input(0, "data", "An input tensor.");
schema.Output(0, "reduced", "Reduced output tensor.");
};
}
REGISTER_OPERATOR_SCHEMA(ReduceMax)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("max"));
REGISTER_OPERATOR_SCHEMA(ReduceMin)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("min"));
REGISTER_OPERATOR_SCHEMA(ReduceSum)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("sum"));
REGISTER_OPERATOR_SCHEMA(ReduceMean)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("mean"));
REGISTER_OPERATOR_SCHEMA(ReduceProd)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("product"));
REGISTER_OPERATOR_SCHEMA(ReduceLogSumExp)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ReduceDocGenerator("log sum exponent"));
std::function<void(OpSchema&)> ArgReduceDocGenerator(const char* name) {
return [=](OpSchema& schema) {
std::string doc = R"DOC(
Computes the indices of the {name} elements of the input tensor along the
provided axes. The resulting tensor has the same shape as the input if keepdims equals 1.
If keepdims equals 0, then the resulting tensor has the reduced dimension pruned.
The type of the output tensor is integer.)DOC";
ReplaceAll(doc, "{name}", name);
schema.SetDoc(doc);
schema.Attr("axes",
"A list of integers, along which to reduce max.",
AttrType::INTS);
schema.Attr("keepdims",
"Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
AttrType::INT);
schema.Input(0, "data", "An input tensor.");
schema.Output(0, "reduced", "Reduced output tensor with integer data type.");
};
}
REGISTER_OPERATOR_SCHEMA(ArgMax)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ArgReduceDocGenerator("max"));
REGISTER_OPERATOR_SCHEMA(ArgMin)
.NumInputs(1)
.NumOutputs(1)
.FillUsing(ArgReduceDocGenerator("min"));
} // namespace ONNXIR

View file

@ -1,6 +1,240 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
namespace ONNXIR {
std::function<void(OperatorSchemaSetter&)> RNNDocGeneratorInputX() {
return [=](OperatorSchemaSetter& schema) {
schema.Input("X",
"The input sequences packed (and potentially padded) into one 3-D "
"tensor with the shape of `[seq_length, batch_size, input_size]`.", "T");
schema.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
};
}
std::function<void(OperatorSchemaSetter&)> RNNDocGeneratorInputSeqLen() {
return [=](OperatorSchemaSetter& schema) {
schema.Input("sequence_lens",
"Optional tensor specifying lengths of the sequences in a batch. "
"If not specified - assumed all sequences in the batch to have "
"length `seq_length`. It has shape `[batch_size]`.", "T1", true /*optional*/);
schema.TypeConstraint("T1", { "tensor(int32)" }, "Constrain seq_lens to integer tensor.");
};
}
std::function<void(OperatorSchemaSetter&)> RNNDocGeneratorInputInitialH() {
return [=](OperatorSchemaSetter& schema) {
schema.Input("initial_h",
"Optional initial value of the hidden. If not specified - assumed "
"to be 0. It has shape `[num_directions, batch_size, hidden_size]`.", "T", true /*optional*/);
schema.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
};
}
std::function<void(OperatorSchemaSetter&)> RNNDocGeneratorAttrOutput() {
return [=](OperatorSchemaSetter& schema) {
schema.Attr("direction", "Specify if the RNN is forward, reverse, or bidirectional. "
"Must be one of forward (default), reverse, or bidirectional.",
AttrType::AttributeProto_AttributeType_STRING);
schema.Attr("hidden_size", "Number of neurons in the hidden layer",
AttrType::AttributeProto_AttributeType_INT);
schema.Output("Y",
"A tensor that concats all the intermediate output values of the hidden."
"It has shape `[seq_length, num_directions, batch_size, hidden_size]`.", "T");
schema.Output("Y_h",
"The last output value of the hidden. It has shape "
"`[num_directions, batch_size, hidden_size]`.", "T");
schema.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
};
}
std::function<void(OperatorSchemaSetter&)> RNNDocGeneratorActivationArgs() {
return [=](OperatorSchemaSetter& schema) {
schema.Attr("activation_args",
"A list of potential float arguments for an activation function, if this one requires any. "
"If multiple activations are specified, the order of values in this "
"list is the same as the order of activation functions.",
AttrType::AttributeProto_AttributeType_FLOATS);
};
}
REGISTER_OPERATOR_SCHEMA(RNN)
.Description(R"DOC(
Computes a one-layer simple RNN. This operator is usually supported
via some custom implementation such as CuDNN.
Notations:
`X` - input tensor
`i` - input gate
`t` - time step (t-1 means previous time step)
`Wi` - W parameter weight matrix for input gate
`Ri` - R recurrence weight matrix for input gate
`Wbi` - W parameter bias vector for input gate
`Rbi` - R parameter bias vector for input gate
`WBi` - W parameter weight matrix for backward input gate
`RBi` - R recurrence weight matrix for backward input gate
`WBbi` - W bias vector for backward input gate
`RBbi` - R bias vector for backward input gate
`ReLU(X)` - max(X, 0)
`tanh(X)` - hyperbolic tangent of X
`H` - Hidden state
`num_directions` - 2 if direction == bidirectional else 1
Equations:
- Ht = Activation(Wi*Xt + Ri*Ht-1 + Wbi + Rbi)
)DOC")
.FillUsing(RNNDocGeneratorInputX())
.Input("W",
"The weight tensor for input gate. Concatenation of `Wi` and `WBi` "
"(if bidirectional). The tensor has shape "
"`[num_directions, hidden_size, input_size]`.", "T")
.Input("R",
"The recurrence weight tensor. Concatenation of `Ri` and `RBi` "
"(if bidirectional). The tensor has shape "
"`[num_directions, hidden_size, hidden_size]`.", "T")
.Input("B",
"The bias tensor for input gate. Concatenation of `[Wbi, Rbi]` "
"and `[WBbi, RBbi]` (if bidirectional). The tensor has shape "
"`[num_directions, 2*hidden_size]`, Optional: If not specified - assumed "
"to be 0.", "T",
true)
.FillUsing(RNNDocGeneratorInputSeqLen())
.FillUsing(RNNDocGeneratorInputInitialH())
.Attr("activation", "One (or two if bidirectional) activation function for "
"input gate. It must be one of tanh and ReLU. Default `tanh`.",
AttrType::AttributeProto_AttributeType_STRING)
.FillUsing(RNNDocGeneratorActivationArgs())
.FillUsing(RNNDocGeneratorAttrOutput());
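// Shape walk-through (illustrative numbers): with seq_length = 7, batch_size = 3,
// input_size = 5, hidden_size = 4, and direction = forward (num_directions = 1):
//   X: [7, 3, 5], W: [1, 4, 5], R: [1, 4, 4], B: [1, 8],
//   Y: [7, 1, 3, 4], Y_h: [1, 3, 4].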
REGISTER_OPERATOR_SCHEMA(GRU)
.Description(R"DOC(
Computes a one-layer GRU. This operator is usually supported via some custom
implementation such as CuDNN.
Notations:
`X` - input tensor
`z` - update gate
`r` - reset gate
`h` - hidden gate
`t` - time step (t-1 means previous time step)
`W[zrh]` - W parameter weight matrix for update, reset, and hidden gates
`R[zrh]` - R recurrence weight matrix for update, reset, and hidden gates
`Wb[zrh]` - W bias vectors for update, reset, and hidden gates
`Rb[zrh]` - R bias vectors for update, reset, and hidden gates
`WB[zrh]` - W parameter weight matrix for backward update, reset, and hidden gates
`RB[zrh]` - R recurrence weight matrix for backward update, reset, and hidden gates
`WBb[zrh]` - W bias vectors for backward update, reset, and hidden gates
`RBb[zrh]` - R bias vectors for backward update, reset, and hidden gates
`tanh(X)` - hyperbolic tangent of X
`sigmoid(X)` - 1 / (1 + e^-X)
`H` - Hidden state
`num_directions` - 2 if direction == bidirectional else 1
Equations (GRU with default activations):
- zt = sigmoid(Wz*Xt + Rz*Ht-1 + Wbz + Rbz)
- rt = sigmoid(Wr*Xt + Rr*Ht-1 + Wbr + Rbr)
- ht = tanh(Wh*Xt + rt*(Rh*Ht-1 + Rbh) + Wbh)
- Ht = (1 - zt) (.) ht + zt (.) Ht-1
)DOC")
.FillUsing(RNNDocGeneratorInputX())
.Input("W",
"The weight tensor for the gates. Concatenation of `W[zrh]` and `WB[zrh]` "
"(if bidirectional) along dimension 0. This tensor has shape "
"`[num_directions, 3*hidden_size, input_size]`.", "T")
.Input("R",
"The recurrence weight tensor. Concatenation of `R[zrh]` and `RB[zrh]` "
"(if bidirectional) along dimension 0. This tensor has shape "
"`[num_directions, 3*hidden_size, hidden_size]`.", "T")
.Input("B",
"The bias tensor for the gates. Concatenation of `[Wb[zrh], Rb[zrh]]` and "
"`[WBb[zrh], RBb[zrh]]` (if bidirectional) along dimension 0. This tensor "
"has shape `[num_directions, 6*hidden_size]`. Optional: If not specified "
"- assumed to be 0", "T",
true /*optional*/)
.FillUsing(RNNDocGeneratorInputSeqLen())
.FillUsing(RNNDocGeneratorInputInitialH())
.Attr("activations", "A list of 3 (or 6 if bidirectional) activation functions "
"for update, reset, and hidden gates. The activation functions must be "
"one of sigmoid and tanh. See the equations for default.",
AttrType::AttributeProto_AttributeType_STRINGS)
.FillUsing(RNNDocGeneratorActivationArgs())
.FillUsing(RNNDocGeneratorAttrOutput());
REGISTER_OPERATOR_SCHEMA(LSTM)
.Description(R"DOC(
Computes a one-layer LSTM. This operator is usually supported via some
custom implementation such as CuDNN.
Notations:
`X` - input tensor
`i` - input gate
`o` - output gate
`f` - forget gate
`c` - cell gate
`t` - time step (t-1 means previous time step)
`W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates
`R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates
`Wb[iofc]` - W bias vectors for input, output, forget, and cell gates
`Rb[iofc]` - R bias vectors for input, output, forget, and cell gates
`P[iof]` - P peephole weight vector for input, output, and forget gates
`WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates
`RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates
`WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates
`RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates
`PB[iof]` - P peephole weight vector for backward input, output, and forget gates
`tanh(X)` - hyperbolic tangent of X
`sigmoid(X)` - 1 / (1 + e^-X)
`H` - Hidden state
`num_directions` - 2 if direction == bidirectional else 1
Equations (forward LSTM with default activations and peepholes):
- it = sigmoid(Wi*Xt + Ri*Ht-1 + Pi (.) Ct-1 + Wbi + Rbi)
- ft = sigmoid(Wf*Xt + Rf*Ht-1 + Pf (.) Ct-1 + Wbf + Rbf)
- ct = tanh(Wc*Xt + Rc*Ht-1 + Wbc + Rbc)
- Ct = ft (.) Ct-1 + it (.) ct
- ot = sigmoid(Wo*Xt + Ro*Ht-1 + Po (.) Ct + Wbo + Rbo)
- H = ot (.) tanh(Ct)
)DOC")
.FillUsing(RNNDocGeneratorInputX())
.Input("W",
"The weight tensor for the gates. Concatenation of `W[zrh]` and `WB[zrh]` "
"(if bidirectional) along dimension 0. This tensor has shape "
"`[num_directions, 3*hidden_size, input_size]`.", "T")
.Input("R",
"The recurrence weight tensor. Concatenation of `R[zrh]` and `RB[zrh]` "
"(if bidirectional) along dimension 0. This tensor has shape "
"`[num_directions, 3*hidden_size, hidden_size]`.", "T")
.Input("B",
"The bias tensor for the gates. Concatenation of `[Wb[zrh], Rb[zrh]]` and "
"`[WBb[zrh], RBb[zrh]]` (if bidirectional) along dimension 0. This tensor "
"has shape `[num_directions, 6*hidden_size]`. Optional: If not specified "
"- assumed to be 0", "T",
true /*optional*/)
.FillUsing(RNNDocGeneratorInputSeqLen())
.FillUsing(RNNDocGeneratorInputInitialH())
.Attr("activations", "A list of 3 (or 6 if bidirectional) activation functions "
"for update, reset, and hidden gates. The activation functions must be "
"one of sigmoid and tanh. See the equations for default.",
AttrType::AttributeProto_AttributeType_STRINGS)
.FillUsing(RNNDocGeneratorActivationArgs())
.Attr("clip", "Cell clip threshold. Clipping bounds the elements of a tensor "
"in the range of [-threshold, +threshold] and is applied to the input "
"of activations. No clip if not specified.",
AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("input_forget", "Couple the input and forget gates if 1, default 0.",
AttrType::AttributeProto_AttributeType_INT)
.Input("initial_c",
"Optional initial value of the cell. If not specified - assumed "
"to be 0. It has shape `[num_directions, batch_size, hidden_size]`.", "T",
true /*optional*/)
.Input("P",
"The weight tensor for peepholes. Concatenation of `P[iof]` and "
"`PB[iof]` (if bidirectional) along dimension 0. It has shape "
"`[num_directions, 3*hidde_size]`. Optional: If not specified - "
"assumed to be 0.", "T",
true /*optional*/)
.FillUsing(RNNDocGeneratorAttrOutput());
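// Shape walk-through (illustrative numbers, iofc gate layout): with
// batch_size = 3, input_size = 5, hidden_size = 4, num_directions = 1:
//   W: [1, 16, 5], R: [1, 16, 16], B: [1, 32] (four gates, W and R biases),
//   P: [1, 12] (three peephole gates), initial_h and initial_c: [1, 3, 4].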
}

View file

@ -1,162 +1,183 @@
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
#include "proto/onnx/core/op.h"
namespace ONNXIR
{
namespace ONNXIR {
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Cast)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
The operator casts the elements of a given input tensor to a data type
specified by the 'to' argument and returns an output tensor of the same size in
the converted type. The 'to' argument must be one of the data types specified
in the 'DataType' enum field in the TensorProto message. If the 'to' argument
is not provided or is not one of the enumerated types in DataType, Caffe2
throws an Enforce error.
NOTE: Casting to and from strings is not supported yet.
)DOC")
.Description("The operator casts the elements of a given input tensor to a data type "
"specified by the 'to' argument and returns an output tensor of the same size in "
"the converted type. The 'to' argument must be one of the data types specified "
"in the 'DataType' enum field in the TensorProto message. If the 'to' argument "
"is not provided or is not one of the enumerated types in DataType, Caffe2 "
"throws an Enforce error. "
"NOTE: Casting to and from strings is not supported yet.")
.Input("input", "Input tensor to be cast.", "T")
.Output(
"output",
"Output tensor with the same shape as input with type "
"specified by the 'to' argument",
"T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr(
"to",
"The data type to which the elements of the input tensor are cast."
"Strictly must be one of the types from DataType enum in TensorProto",
AttrType::STRING)
.Input(0, "input", "Input tensor to be cast.")
.Output(
0,
"output",
"Output tensor with the same shape as input with type "
"specified by the 'to' argument");
AttrType::AttributeProto_AttributeType_STRING);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Flatten)
.Description("Flattens the input tensor into a 2D matrix, "
"keeping the first dimension unchanged.")
.Input("input", "A tensor of rank >= 2.", "T")
.Output("output", "A tensor of rank 2 with the contents of the input tensor, "
"with first dimension equal first dimension of input, and remaining "
"input dimensions flatenned into the inner dimension of the output.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Reshape)
.NumInputs(1)
.NumOutputs(1)
.AllowConsumed({ {0, 0} })
.SetDoc(R"DOC(
Reshape the input tensor similar to numpy.reshape.
It takes a tensor as input and an argument `shape`. It outputs the reshaped tensor.
At most one dimension of the new shape can be -1. In this case, the value is
inferred from the size of the tensor and the remaining dimensions. A dimension
could also be 0, in which case the actual dimension value is going to be copied
from the shape argument.)DOC")
.Attr("shape", "New shape", AttrType::INTS)
.Input(0, "data", "An input tensor.")
.Output(0, "reshaped", "Reshaped data.");
REGISTER_OPERATOR_SCHEMA(Concat)
.NumInputs(1, INT_MAX)
.NumOutputs(2)
.Attr("axis",
"Which axis to concat on",
AttrType::INT)
.SetDoc("Concatenate a list of tensors into a single tensor")
.Output(0, "concat_result", "Concatenated tensor");
.Description("Reshape the input tensor similar to numpy.reshape. "
" "
"It takes a tensor as input and an argument `shape`. It outputs the reshaped tensor. "
" "
"At most one dimension of the new shape can be -1. In this case, the value is "
"inferred from the size of the tensor and the remaining dimensions. A dimensions "
"could also be 0, in which case the actual dimension value is going to be copied "
"from the shape argument.")
.Input("data", "An input tensor.", "T")
.Output("reshaped", "Reshaped data.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("shape", "Tensor of shape declarations for the output. Must be compatible with "
"the input. At most one dimension of the new shape can be -1. In this case, the "
"value is inferred from the size of the tensor and the remaining dimensions. A "
"dimension could also be 0, in which case the actual dimension value is going to "
"be copied from the input tensor.", AttrType::AttributeProto_AttributeType_INTS);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Split)
.NumInputs(1, 2)
.NumOutputs(1, INT_MAX)
.Input(0, "input", "The tensor to split")
.Input(1, "split", "Optional list of output lengths (see also arg 'split')")
.Attr("axis",
"Which axis to split on",
AttrType::INT)
.Attr("split",
"length of each output",
AttrType::INTS)
.SetDoc(R"DOC(Split a tensor into a list of tensors, along the specified
'axis'. The lengths of the split can be specified using argument 'split' or
optional second input blob to the operator. Otherwise, the tensor is split
to equal sized parts.
)DOC");
REGISTER_OPERATOR_SCHEMA(Slice)
.NumInputs(1, 3)
.NumOutputs(1)
.SetDoc(R"DOC(
Produces a slice of the input tensor along multiple axes. Similar to numpy:
https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
Slices are passed as two keyword argument lists with starting and end indices
for each dimension of the input `data` tensor. If a negative value is passed
for any of the start or end indices, it represents the number of elements before
the end of that dimension.
`strides` is the step sizes when applying slicing, negative value means in
reverse order.
)DOC")
.Input(0, "data", "Tensor of data to extract slices from.")
.Attr("starts",
"List of starting indices",
AttrType::INTS)
.Attr("ends",
"List of ending indices",
AttrType::INTS)
.Output(0, "output", "Sliced data tensor.");
.Description("Split a tensor into a list of tensors, along the specified 'axis'. "
"The lengths of the split can be specified using argument 'axis' or "
"optional second input blob to the operator. Otherwise, the tensor is split "
"to equal sized parts.")
.Input("input", "The tensor to split", "T")
.Input("split", "Optional list of output lengths (see also arg 'split')", "T")
.Output("output", "A list of output tensors", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "Which axis to split on", AttrType::AttributeProto_AttributeType_INT)
.Attr("split", "Number of tensors to output.", AttrType::AttributeProto_AttributeType_INT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Transpose)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Transpose the input tensor similar to numpy.transpose. For example, when
axes=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape
will be (2, 1, 3).
)DOC")
.Attr("perm",
"A list of integers. By default, reverse the dimensions, "
"otherwise permute the axes according to the values given.",
AttrType::INTS)
.Input(0, "data", "An input tensor.")
.Output(0, "transposed", "Transposed output.");
.Description("Transpose the input tensor similar to numpy.transpose. For example, "
"when axes=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape "
"will be (2, 1, 3).")
.Input("data", "An input tensor.", "T")
.Output("transposed", "Transposed output.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("perm", "A list of integers. By default, reverse the dimensions, "
"otherwise permute the axes according to the values given.", AttrType::AttributeProto_AttributeType_INTS);
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(Tile)
.Description("Repeat the elements of a tensor along an axis.")
.Input("input", "An input tensor.", "T")
.Output("output", "Repeated output.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "Axis along which to repeat. Default is 0.",
AttrType::AttributeProto_AttributeType_INT, int64_t(0))
.Attr("tiles", "Number of repeated copies to make of the input tensor.",
AttrType::AttributeProto_AttributeType_INT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Concat)
.Description("Concatenate takes as input a list of tensors, all of the same shape"
"expect for the concatenation axis, and returns a single tensor, the concatenation"
"of all inputs.")
.Input("input", "A list of input tensors.", "T")
.Output("output", "Concatenated tensor", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axis", "Axis along which to concatenate", AttrType::AttributeProto_AttributeType_INT);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Slice)
.Description("Produces a slice of the input tensor along multiple axes. Similar to "
"numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html "
" "
"Slices are passed as two keyword argument lists with starting and end indices "
"for each dimension of the input `data` tensor. If a negative value is passed "
"for any of the start or end indices, it represent number of elements before "
"the end of that dimension. "
" "
"`strides` is the step sizes when applying slicing, negative value means in "
"reverse order.")
.Input("input", "Tensor of data to extract slices from.", "T")
.Output("output", "Sliced data tensor.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("starts", "List of starting indices", AttrType::AttributeProto_AttributeType_INTS)
.Attr("ends", "List of ending indices", AttrType::AttributeProto_AttributeType_INTS);
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Gather)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Given DATA tensor of rank r >= 1, and INDICES tensor of rank q, gather
entries of the outer-most dimension of DATA indexed by INDICES, and concatenate
them in an output tensor of rank q + (r - 1).
Example:
DATA = [
[1.0, 1.2],
[2.3, 3.4],
[4.5, 5.7],
]
INDICES = [
[0, 1],
[1, 2],
]
OUTPUT = [
[
[1.0, 1.2],
[2.3, 3.4],
],
[
[2.3, 3.4],
[4.5, 5.7],
],
]
)DOC")
.Input(0, "DATA", "Tensor of rank r >= 1.")
.Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
.Output(0, "OUTPUT", "Tensor of rank q + (r - 1).");
.Description("Given data tensor of rank r >= 1, and indices tensor of rank q, gather "
"entries of the outer-most dimension of data indexed by indices, and concatenate "
"them in an output tensor of rank q + (r - 1). "
"Example: data = [ [1.0, 1.2], [2.3, 3.4], [4.5, 5.7] ] "
"indices = [ [0, 1], [1, 2] ] "
"ouput = [ [ [1.0, 1.2], [2.3, 3.4], ], [ [2.3, 3.4], [4.5, 5.7] ] ] ")
.Input("data", "Tensor of rank r >= 1.", "T")
.Input("indices", "Tensor of int32/int64 indices, of any rank q.", "T")
.Output("ouput", "Tensor of rank q + (r - 1).", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.");
// Taken from ONNX
REGISTER_OPERATOR_SCHEMA(Squeeze)
.NumInputs(1)
.NumOutputs(1)
.Description("Remove single-dimensional entries from the shape of a tensor. "
"Takes a parameter `axes` with a list of axes to squeeze.")
.Input("data", "Tensors with at least max(dims) dimensions.", "T")
.Output("squeezed", "Reshaped tensor with same data as input.", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("axes",
"List of positive integers, indicate the dimensions to squeeze.",
AttrType::INTS,
true)
.SetDoc(R"DOC(
Remove single-dimensional entries from the shape of a tensor.
Takes a parameter `axes` with a list of axes to squeeze.
)DOC")
.Input(0, "data", "Tensors with at least max(dims) dimensions.")
.Output(0, "squeezed", "Reshaped tensor with same data as input.");
AttrType::AttributeProto_AttributeType_INTS, int64_t(1));
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(BatchToSpace)
.Description("BatchToSpace for 4-D tensors of type T. "
"Rearranges (permutes) data from batch into blocks of spatial data, "
"followed by cropping. This is the reverse transformation of "
"SpaceToBatch. More specifically, this op outputs a copy of the input "
"tensor where values from the batch dimension are moved in spatial "
"blocks to the height and width dimensions, followed by cropping along "
"the height and width dimensions.")
.Input("input", "Input tensor of [N,C,H,W]", "T")
.Output("output", "Output tensor of [N, C/(blocksize * blocksize), H * blocksize, "
"W * blocksize]", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("blocksize", "Blocks of [blocksize,blocksize] are moved.", AttrType::AttributeProto_AttributeType_INT);
// Taken from Caffe2
REGISTER_OPERATOR_SCHEMA(SpaceToBatch)
.Description("SpaceToBatch for 4-D tensors of type T. "
"Zero-pads and then rearranges (permutes) blocks of spatial data into "
"batch. More specifically, this op outputs a copy of the input tensor "
"where values from the height and width dimensions are moved to the "
"batch dimension. After the zero-padding, both height and width of the "
"input must be divisible by the block size.")
.Input("input", "Input tensor of [N,C,H,W]", "T")
.Output("output", "Output tensor of [N, C * blocksize * blocksize, H/blocksize, "
"W/blocksize]", "T")
.TypeConstraint("T", { "tensor(float16)", "tensor(float)", "tensor(double)" },
"Constrain input and output types to float tensors.")
.Attr("blocksize", "Blocks of [blocksize,blocksize] are moved.", AttrType::AttributeProto_AttributeType_INT);
}

View file

@ -0,0 +1,324 @@
#include "proto/onnx/core/op.h"
namespace ONNXIR {
REGISTER_OPERATOR_SCHEMA(ArrayFeatureExtractor)
.Input("X", "Data to be selected from", "T1")
.Input("Y", "Data to be selected from", "T2")
.Output("Z", "Selected data as an array", "T1")
.Description(R"DOC(
Select a subset of the data from input1 based on the indices provided in input2.
)DOC")
.TypeConstraint("T1", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)", "tensor(string)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(int64)" }, " Index value types .");
REGISTER_OPERATOR_SCHEMA(Binarizer)
.Input("X", "Data to be binarized", "T")
.Output("Y", "Binarized output data", "T")
.Description(R"DOC(
Makes values 1 or 0 based on a single threshold.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("threshold", "Values greater than this are set to 1, else set to 0", AttrType::AttributeProto_AttributeType_FLOAT);
REGISTER_OPERATOR_SCHEMA(CastMap)
.Input("X", "The input values", "T1")
.Output("Y", "The output values", "T2")
.Description(R"DOC(
Casts the input into an output tensor.
)DOC")
.TypeConstraint("T1", { "map(int64, string)", "map(int64, float)" }, " allowed input types.")
.TypeConstraint("T2", { "tensor(string)","tensor(float)","tensor(int64)" }, " allowed output types.")
.Attr("cast_to", "what type of tensor to cast the input to, enum 'TO_FLOAT','TO_STRING','TO_INT64'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("map_form", "if casting from a map with int64 keys, should we pad spaces between the keys or pack them, enum 'PACK, 'SPARSE'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("max_map", "if casting from a sparse map, what is the max key in the map", AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(CategoryMapper)
.Input("X", "Input data", "T1")
.Output("Y", "Output data, if strings are input, then output is INTS, and vice versa.", "T2")
.Description(R"DOC(
Convert strings to INTS and vice versa.
Takes in a map to use for the conversion.
The index position in the cats_strings and cats_int64s attributes
is used to do the mapping.
Each instantiated operator converts either ints to strings or strings to ints.
This behavior is triggered based on which default value is set.
If the string default value is set, it will convert ints to strings.
If the int default value is set, it will convert strings to ints.
)DOC")
.TypeConstraint("T1", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("cats_strings", "strings part of the input map, must be same size as the ints", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("cats_int64s", "ints part of the input map, must be same size and the strings", AttrType::AttributeProto_AttributeType_INTS)
.Attr("default_string", "string value to use if the int is not in the map", AttrType::AttributeProto_AttributeType_STRING)
.Attr("default_int64", "int value to use if the string is not in the map", AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(DictVectorizer)
.Input("X", "The input dictionary", "T")
.Output("Y", "The tensor", "tensor(int64)")
.Description(R"DOC(
Uses an index mapping to convert a dictionary to an array.
The output array will be equal in length to the index mapping vector parameter.
All keys in the input dictionary must be present in the index mapping vector.
For each item in the input dictionary, insert its value in the output array.
The position of the insertion is determined by the position of the item's key
in the index mapping. Any keys not present in the input dictionary will be
zero in the output array. Use either string_vocabulary or int64_vocabulary, not both.
For example: if the ``string_vocabulary`` parameter is set to ``["a", "c", "b", "z"]``,
then an input of ``{"a": 4, "c": 8}`` will produce an output of ``[4, 8, 0, 0]``.
)DOC")
.TypeConstraint("T", { "map(string, int64)", "map(int64, string)" }, " allowed types.")
.Attr("string_vocabulary", "The vocabulary vector of strings", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("int64_vocabulary", "The vocabulary vector of int64s", AttrType::AttributeProto_AttributeType_INTS);
REGISTER_OPERATOR_SCHEMA(Imputer)
.Input("X", "Data to be imputed", "T")
.Output("Y", "Imputed output data", "T")
.Description(R"DOC(
Replace inputs that equal the replaced value(s) with the imputed value(s).
All other inputs are copied to the output unchanged.
This op is used to replace missing values where we know what a missing value looks like.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("imputed_value_floats", "value(s) to change to, can be length 1 or length F if using int type", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("replaced_value_float", "value that needs replacing if using int type", AttrType::AttributeProto_AttributeType_FLOAT)
.Attr("imputed_value_int64s", "value(s) to change to, can be length 1 or length F if using int type", AttrType::AttributeProto_AttributeType_INTS)
.Attr("replaced_value_int64", "value that needs replacing if using int type", AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(FeatureVectorizer)
.Input("X", "ordered input tensors", "T")
.Output("Y", "flattened feature vectors.", "T")
.Description(R"DOC(
Concatenates a list of input tensors of floats into one tensor.
Input order must match the order given in inputlist and inputdimensions.
)DOC")
.TypeConstraint("T", { "tensor(float)" }, " allowed types.")
.Attr("inputlist", "list of string names of the input features, output features will appear in this order", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("inputdimensions", "the size of the inputs in the input list", AttrType::AttributeProto_AttributeType_INTS);
REGISTER_OPERATOR_SCHEMA(LabelEncoder)
.Input("X", "Data to be encoded", "T1")
.Output("Y", "Encoded output data", "T2")
.Description(R"DOC(
Convert class labels to their integral values and vice versa.
In both cases the operator is instantiated with the list of class strings.
The integral value of the string is the index position in the list.
)DOC")
.TypeConstraint("T1", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("classes_strings", "List of class label strings to be encoded as INTS", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("default_int64", "Default value if not in class list as int64", AttrType::AttributeProto_AttributeType_INT)
.Attr("default_string", "Default value if not in class list as string", AttrType::AttributeProto_AttributeType_STRING);
REGISTER_OPERATOR_SCHEMA(LinearClassifier)
.Input("X", "Data to be classified", "T1")
.Output("Y", "Classification outputs (one class per example", "T2")
.Output("Z", "Classification outputs (All classes scores per example,N,E", "tensor(float)")
.Description(R"DOC(
Linear classifier prediction (choose class)
)DOC")
.TypeConstraint("T1", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("coefficients", "weights of the model(s)", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("intercepts", "weights of the intercepts (if used)", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("multi_class", "whether to do OvR or multinomial (0=OvR and is default)", AttrType::AttributeProto_AttributeType_INT)
.Attr("classlabels_strings", "class labels if using string labels, size E", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("classlabels_int64s", "class labels if using int labels, size E", AttrType::AttributeProto_AttributeType_INTS);
REGISTER_OPERATOR_SCHEMA(LinearRegressor)
.Input("X", "Data to be regressed", "T")
.Output("Y", "Regression outputs (one per target, per example", "tensor(float)")
.Description(R"DOC(
Generalized linear regression evaluation.
If targets is set to 1 (default) then univariate regression is performed.
If targets is set to M then M sets of coefficients must be passed in as a sequence
and M results will be output for each input n in N.
Coefficients are of the same length as an n, and coefficients for each target are contiguous.
Intercepts are optional but if provided must match the number of targets.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("coefficients", "weights of the model(s)", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("intercepts", "weights of the intercepts (if used)", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("targets", "total number of regression targets (default is 1)", AttrType::AttributeProto_AttributeType_INT)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING);
REGISTER_OPERATOR_SCHEMA(Normalizer)
.Input("X", "Data to be encoded", "T")
.Output("Y", "encoded output data", "tensor(float)")
.Description(R"DOC(
Normalize the input. There are three normalization modes,
which have the corresponding formulas:
Max .. math:: max(x_i)
L1 .. math:: z = ||x||_1 = \sum_{i=1}^{n} |x_i|
L2 .. math:: z = ||x||_2 = \sqrt{\sum_{i=1}^{n} x_i^2}
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("norm", "enum 'MAX', 'L1', 'L2'", AttrType::AttributeProto_AttributeType_STRING);
REGISTER_OPERATOR_SCHEMA(OneHotEncoder)
.Input("X", "Data to be encoded", "T")
.Output("Y", "encoded output data", "tensor(float)")
.Description(R"DOC(
Replace the inputs with an array of ones and zeros, where the single
one is at the zero-based index of the category that was passed in. The total category count
will determine the length of the vector. For example, if we pass a
tensor with a single value of 4, and a category count of 8, the
output will be a tensor with [0,0,0,0,1,0,0,0].
This operator assumes every input in X is of the same category set
(meaning there is only one category count).
)DOC")
.TypeConstraint("T", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("cats_int64s", "list of cateogries, ints", AttrType::AttributeProto_AttributeType_INTS)
.Attr("cats_strings", "list of cateogries, strings", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("zeros", "if true and category is not present, will return all zeros, if false and missing category, operator will return false", AttrType::AttributeProto_AttributeType_INT);
// Input: X, output: Y
REGISTER_OPERATOR_SCHEMA(Scaler)
.Input("X", "Data to be scaled", "T")
.Output("Y", "Scaled output data", "tensor(float)")
.Description(R"DOC(
Rescale input data, for example to standardize features by removing the mean and scaling to unit variance.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("scale", "second, multiply by this", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("offset", "first, offset by thisfirst, offset by this, can be one value or a separate value for each feature", AttrType::AttributeProto_AttributeType_FLOATS);
REGISTER_OPERATOR_SCHEMA(SVMClassifier)
.Input("X", "Data to be classified", "T1")
.Output("Y", "Classification outputs, one class per example", "T2")
.Output("Z", "Classification outputs, All classes scores per example,N,E*(E-1)/2 if dual scores, or E if probabilities are used.", "tensor(float)")
.Description(R"DOC(
SVM classifier prediction (two class or multiclass).
Will output probabilities in Z if prob_a and prob_b are filled in.
)DOC")
.TypeConstraint("T1", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("kernel_type", "enum 'LINEAR', 'POLY', 'RBF', 'SIGMOID', defaults to linear", AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_params", "Tensor of 3 elements containing gamma, coef0, degree in that order. Zero if unused for the kernel.", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("prob_a", "probability vector a, must be either 0 length or E*(E-1)/2 length", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("prob_b", "probability vector b, must be same length as prob_a", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("vectors_per_class", "", AttrType::AttributeProto_AttributeType_INTS)
.Attr("support_vectors", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("coefficients", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("rho", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("classlabels_strings", "class labels if using string labels", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("classlabels_int64s", "class labels if using int labels", AttrType::AttributeProto_AttributeType_INTS);
REGISTER_OPERATOR_SCHEMA(SVMRegressor)
.Input("X", "Input N,F", "T")
.Output("Y", "All target scores, N,E", "tensor(float)")
.Description(R"DOC(
SVM regressor. Also supports oneclass svm.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("kernel_type", "enum 'LINEAR', 'POLY', 'RBF', 'SIGMOID', defaults to linear", AttrType::AttributeProto_AttributeType_STRING)
.Attr("kernel_params", "Tensor of 3 elements containing gamma, coef0, degree in that order. Zero if unused for the kernel.", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("vectors_per_class", "", AttrType::AttributeProto_AttributeType_INTS)
.Attr("support_vectors", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("n_supports", "number of support vectors", AttrType::AttributeProto_AttributeType_INT)
.Attr("coefficients", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("rho", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("one_class", "If this regressor is a oneclass svm set this param to 1, otherwise use 0 (default is zero)", AttrType::AttributeProto_AttributeType_INT);
REGISTER_OPERATOR_SCHEMA(TreeEnsembleClassifier)
.Input("X", "Data to be classified", "T1")
.Output("Y", "Classification outputs (one class per example", "T2")
.Output("Z", "Classification outputs (All classes scores per example,N,E", "tensor(float)")
.Description(R"DOC(
Tree Ensemble classifier. Returns the top class for each of the N inputs.
All attributes prefixed with nodes_ are parallel arrays describing the
tree nodes; they are assumed to have the same length, and a single index i
addresses the same node across all of them. Each node id can appear only
once for each tree id.
All attributes prefixed with class_ are parallel arrays of votes at the leaves.
A leaf may have multiple votes, where each vote is weighted by
the associated class_weights entry.
Either classlabels_strings or classlabels_int64s must be provided, and
class_ids is an index into that list.
Mode enum is BRANCH_LEQ, BRANCH_LT, BRANCH_GTE, BRANCH_GT, BRANCH_EQ, BRANCH_NEQ, LEAF.
)DOC")
.TypeConstraint("T1", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.TypeConstraint("T2", { "tensor(string)", "tensor(int64)" }, " allowed types.")
.Attr("nodes_treeids", "tree id for this node", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_nodeids", "node id for this node, node ids may restart at zero for each tree (but not required).", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_featureids", "feature id for this node", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_values", "thresholds to do the splitting on for this node.", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("nodes_hitrates", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("nodes_modes", "enum of behavior for this node 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("nodes_truenodeids", "child node if expression is true", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_falsenodeids", "child node if expression is false", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_missing_value_tracks_true", "for each node, decide if the value is missing (nan) then use true branch, this field can be left unset and will assume false for all nodes", AttrType::AttributeProto_AttributeType_INTS)
.Attr("base_values", "starting values for each class, can be omitted and will be assumed as 0", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("class_treeids", "tree that this node is in", AttrType::AttributeProto_AttributeType_INTS)
.Attr("class_nodeids", "node id that this weight is for", AttrType::AttributeProto_AttributeType_INTS)
.Attr("class_ids", "index of the class list that this weight is for", AttrType::AttributeProto_AttributeType_INTS)
.Attr("class_weights", "the weight for the class in class_id", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("classlabels_strings", "class labels if using string labels, size E", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("classlabels_int64s", "class labels if using int labels, size E, one of the two class label fields must be used", AttrType::AttributeProto_AttributeType_INTS);
REGISTER_OPERATOR_SCHEMA(TreeEnsembleRegressor)
.Input("X", "Input N,F", "T")
.Output("Y", "NxE floats", "tensor(float)")
.Description(R"DOC(
Tree Ensemble regressor. Returns the regressed values for each of the N inputs.
All attributes prefixed with nodes_ are parallel arrays describing the
tree nodes; they are assumed to have the same length, and a single index i
addresses the same node across all of them. Each node id can appear only
once for each tree id.
All attributes prefixed with target_ are parallel arrays of votes at the leaves.
A leaf may have multiple votes, where each vote is weighted by
the associated target_weights entry.
All trees must have their node ids start at 0 and increment by 1.
Mode enum is BRANCH_LEQ, BRANCH_LT, BRANCH_GTE, BRANCH_GT, BRANCH_EQ, BRANCH_NEQ, LEAF.
)DOC")
.TypeConstraint("T", { "tensor(float)", "tensor(double)", "tensor(int64)", "tensor(int32)" }, " allowed types.")
.Attr("nodes_treeids", "tree id for this node", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_nodeids", "node id for this node, node ids may restart at zero for each tree (but not required).", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_featureids", "feature id for this node", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_values", "thresholds to do the splitting on for this node.", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("nodes_hitrates", "", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("nodes_modes", "enum of behavior for this node. enum 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("nodes_truenodeids", "child node if expression is true", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_falsenodeids", "child node if expression is false", AttrType::AttributeProto_AttributeType_INTS)
.Attr("nodes_missing_value_tracks_true", "for each node, decide if the value is missing (nan) then use true branch, this field can be left unset and will assume false for all nodes", AttrType::AttributeProto_AttributeType_INTS)
.Attr("target_treeids", "tree that this node is in", AttrType::AttributeProto_AttributeType_INTS)
.Attr("target_nodeids", "node id that this weight is for", AttrType::AttributeProto_AttributeType_INTS)
.Attr("target_ids", "index of the class list that this weight is for", AttrType::AttributeProto_AttributeType_INTS)
.Attr("target_weights", "the weight for the class in target_id", AttrType::AttributeProto_AttributeType_FLOATS)
.Attr("n_targets", "number of regression targets", AttrType::AttributeProto_AttributeType_INT)
.Attr("post_transform", "post eval transform for score, enum 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', 'PROBIT'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("aggregate_function", "post eval transform for score, enum 'AVERAGE', 'SUM', 'MIN', 'MAX'", AttrType::AttributeProto_AttributeType_STRING)
.Attr("base_values", "base values for regression, added to final score, size must be the same as n_outputs or can be left unassigned (assumed 0)", AttrType::AttributeProto_AttributeType_FLOATS);
REGISTER_OPERATOR_SCHEMA(VecDictionizer)
.Input("X", "The input values", "tensor(float)")
.Output("Y", "The output map", "T")
.Description(R"DOC(
Makes a map from the input and the attributes.
Assumes input 0 contains the values, with the keys given by the attributes.
Keys must be provided in either classlabels_strings or classlabels_int64s (but not both).
Input 0 may have a batch size larger than 1,
but each input in the batch must have as many elements as there are keys in the attributes.
The order of the input values and of the attribute keys determines the key-value mapping.
)DOC")
.TypeConstraint("T", { "map(string, float)", "map(int64, float)" }, " allowed types.")
.Attr("classlabels_strings", "keys if using string keys", AttrType::AttributeProto_AttributeType_STRINGS)
.Attr("classlabels_int64s", "keys if using int keys", AttrType::AttributeProto_AttributeType_INTS);
}
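The VecDictionizer pairing can be pictured with a short C++ sketch (int64 keys shown; the function is an invention for illustration): the i-th attribute key is matched with the i-th value of a batch row.

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

// Illustration only: build the output map for one batch row, whose length
// must equal the number of keys supplied via classlabels_int64s.
std::map<int64_t, float> MakeRowMap(const std::vector<int64_t>& keys,
                                    const float* row)
{
    std::map<int64_t, float> out;
    for (std::size_t i = 0; i < keys.size(); ++i)
        out[keys[i]] = row[i];
    return out;
}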

View file

@ -17,7 +17,7 @@ package ONNXIR;
// Note [Namespaces]
// ~~~~~~~~~~~~~~~~~
// LotusIR gives explicit names to graphs, intermediate values and
// ONNXIR gives explicit names to graphs, intermediate values and
// serialized tensors. To make it easier to generate names, we organize
// these into separate namespaces (so, e.g., a graph can have the same
// name as a serialized tensor.) The namespaces are as follows:
@ -33,13 +33,17 @@ package ONNXIR;
// the computation of a graph.
// - Shape: These names represent parameters for unknown shape dimensions.
//
// We specify the namespace of a name in LotusIR as comments in the form
// We specify the namespace of a name in ONNXIR as comments in the form
// of "namespace {Node,Graph,OperatorOrFunction,Attribute,Value,Shape}". Framework is
// responsible for supporting the namespaces.
// To be compatible with both proto2 and proto3, we will use a version number
// that is not defined by the default value but an explicit enum number.
enum Version {
// proto3 requires the first enum value to be zero.
// We add this just to appease the compiler.
_START_VERSION = 0;
// The version field is always serialized and we will use it to store the
// version that the graph is generated from. This helps us set up version
// control. We should use version as
@ -53,8 +57,36 @@ enum Version {
// An AttributeProto MUST contain the name field, and *only one* of the
// following content fields, effectively enforcing a C/C++ union equivalent.
message AttributeProto {
// Note: this enum is structurally identical to the OpSchema::AttrType
// enum defined in schema.h. If you rev one, you likely need to rev the other.
enum AttributeType {
UNDEFINED = 0;
FLOAT = 1;
INT = 2;
STRING = 3;
TENSOR = 4;
GRAPH = 5;
FLOATS = 6;
INTS = 7;
STRINGS = 8;
TENSORS = 9;
GRAPHS = 10;
}
// The name field MUST be present for this version of the IR.
optional string name = 1; // namespace Attribute
// The type field MUST be present for this version of the IR.
// For 0.0.1 versions of the IR, this field was not defined, and
// implementations needed to use has_field heuristics to determine
// which value field was in use. For IR_VERSION 0.0.2 or later, this
// field MUST be set and match the f|i|s|t|... field in use. This
// change was made to accommodate proto3 implementations.
optional AttributeType type = 20; // discriminator that indicates which field below is in use
// Exactly ONE of the following fields must be present for this version of the IR
optional float f = 2; // float
optional int64 i = 3; // int
optional bytes s = 4; // UTF-8 string
@ -66,13 +98,6 @@ message AttributeProto {
repeated bytes strings = 9; // list of UTF-8 strings
repeated TensorProto tensors = 10; // list of tensors
repeated GraphProto graphs = 11; // list of graph
optional TypeProto type = 51;
repeated TypeProto types = 52;
//ISSUE:13807134,dbox: Do we ever see shape showing up as an attribute value?
// If so, won't it always be accompanied by a TypeProto?
optional TypeProto.TensorShapeProto shape = 53;
repeated TypeProto.TensorShapeProto shapes = 54;
}
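The discriminator rule above means a writer must keep `type` and the chosen value field in sync. A sketch with the generated C++ protobuf API follows; the header name is an assumption, and the accessors follow standard protoc output for this message.

#include "graph.pb.h" // assumed name of the header protoc generates here

// Sets a FLOATS attribute under the IR_VERSION 0.0.2 rule: the type
// discriminator must match the value field that is populated.
void SetFloatsAttribute(ONNXIR::AttributeProto* attr)
{
    attr->set_name("scale");                        // namespace Attribute
    attr->set_type(ONNXIR::AttributeProto::FLOATS); // discriminator
    attr->add_floats(1.0f);                         // matching value field
    attr->add_floats(0.5f);
}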
// Defines information on value, including the name, the type, and
@ -131,20 +156,7 @@ message NodeProto {
// This text MAY contain Markdown markup that conforms to http://commonmark.org/.
optional string doc_string = 6;
// The number of inputs for each argument of the operator/function.
// A formal parameter of the op may take a variable number of inputs
// that is only known when this node is constructed.
//BUG:13806939,dbox: I'm assuming that this field is like input_arg_info in that
// a zero element/missing array implies that one needs to crawl
// the graph to figure out the input counts, yes? Confirm and I'll
// make clear. Otherwise, we need to require it to be present
// and accurate.
repeated int32 input_arg_count = 50;
// Specify a list of named nodes that must be executed before this node.
// Framework may use this to give users the ability to impose additional
// execution orders for the operations.
repeated string control_input = 51;
reserved 100 to 200; // for future extensions.
}
// ModelProto is a top-level file/container format for bundling a ML model.
@ -182,15 +194,6 @@ message ModelProto {
// The parameterized graph that is evaluated to execute the model.
optional GraphProto graph = 7;
// NOTE: ids between 8 and 49 are reserved for more ONNX fields.
// The optional name of the author who created the graph.
optional string model_author = 50;
// Optional licensing information concerning use or origination of the graph.
// This text MAY contain Markdown markup that conforms to http://commonmark.org/.
optional string model_license = 51;
};
// GraphProto defines a parameterized series of nodes to form a directed acyclic graph.
@ -233,24 +236,6 @@ message GraphProto {
// optional string producer_tag = 8;
// optional string domain = 9;
// The function definitions of the graph. They can only be used
// (i.e., called) in this graph.
// Each FunctionDefProto in function MUST have a unique name.
repeated FunctionDefProto function = 50;
// The externally defined operators declared by this graph.
repeated OperatorDeclProto operator = 51;
// TODO: When the map type is added, provide for the "model_information"
// field which holds name/value pairs of strings with additional devops
// metadata, such as an identifier for which training set this instance
// of a graph was trained with.
// Imported libraries are referenced as a collection of strings in the form of absolute
// URIs or relative paths. Where such relative paths are rooted is defined by tools and
// runtime implementations.
repeated string imported_libraries = 52;
reserved 100 to 200; // for future extensions.
}
@ -374,6 +359,87 @@ message SparseTensorProto {
optional TensorProto values = 3;
}
// A ValueProto represents a value that may be serialized into a model.
message ValueProto {
message KeyValuePairProto {
oneof key {
string s = 1;
int32 i32 = 2;
int64 i64 = 3;
uint64 ui64 = 4;
};
optional ValueProto value = 100;
}
message NameValuePairProto {
optional string key = 1;
optional ValueProto value = 100;
}
// Defines a record in its serialized format.
// A record is a sequence of one or more
// typed, uniquely named values, which may be of different types.
message RecordProto {
repeated NameValuePairProto fields = 1;
}
// Defines a union value in its serialized format.
// The name is the discriminator from the union type.
message UnionProto {
optional NameValuePairProto choice = 1;
}
// Defines a map in its serialized format.
// A map is a sequence of zero or more
// key/value pairs, all of which have
// unique keys.
message MapProto {
repeated KeyValuePairProto key_value_pairs = 1;
}
// Alternate space-efficient encoding of map that MAY be used
// for maps whose value type is a scalar.
message ScalarMapProto {
// keys.DataType must be an integral type or string
repeated TensorProto keys = 1;
// No restriction on data type, keys.length must equal values.length
repeated TensorProto values = 2;
}
// Defines a sequence in its serialized format.
// A sequence is a list of zero or more
// tensors, maps, records, or subsequences.
message SequenceProto {
repeated ValueProto elems = 1;
}
oneof value {
// A dense tensor (or scalar).
TensorProto dense_tensor = 1;
// A sparse tensor.
SparseTensorProto sparse_tensor = 2;
// A record.
RecordProto record = 3;
// A sequence.
SequenceProto seq = 4;
// A union.
UnionProto union = 5;
// A map.
MapProto map = 6;
// A scalar map.
ScalarMapProto scalar_map = 7;
}
reserved 100 to 200; // for future extensions.
}
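For the scalar-map alternative, a writer fills two parallel tensors. A sketch with the generated C++ API follows; the header name is an assumption, and the TensorProto accessors are assumed to follow the usual ONNX fields (data_type, int64_data, float_data).

#include "graph.pb.h" // assumed generated header

// Encodes {7 -> 0.25f} in the space-efficient scalar-map form: position i
// in keys corresponds to position i in values.
void FillScalarMap(ONNXIR::ValueProto* value)
{
    ONNXIR::ValueProto::ScalarMapProto* m = value->mutable_scalar_map();
    ONNXIR::TensorProto* keys = m->add_keys();
    keys->set_data_type(ONNXIR::TensorProto::INT64);
    keys->add_int64_data(7);
    ONNXIR::TensorProto* values = m->add_values();
    values->set_data_type(ONNXIR::TensorProto::FLOAT);
    values->add_float_data(0.25f);
}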
// Define the types.
message TypeProto {
// Defines a tensor shape. A dimension can be either an integer value
@ -384,8 +450,8 @@ message TypeProto {
oneof value {
int64 dim_value = 1;
string dim_param = 2; // namespace Shape
}
}
};
};
repeated Dimension dim = 1;
}
@ -403,26 +469,39 @@ message TypeProto {
optional TensorShapeProto shape = 2;
}
message HandleTypeProto {
}
// message T { ... }
message RecordTypeProto {
// The type and optional shape of each field
// is described by a ValueInfoProto. The field
// names must be unique.
// This field MUST be present for this version of the IR.
repeated ValueInfoProto field = 1;
};
message TupleTypeProto {
repeated TypeProto elem_type = 1;
}
message SeqTypeProto {
// repeated T
message SequenceTypeProto {
// The type and optional shape of each element of the sequence.
// This field MUST be present for this version of the IR.
optional TypeProto elem_type = 1;
}
};
// map<K,V>
message MapTypeProto {
// This field MUST be present for this version of the IR.
// This field MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING
optional TensorProto.DataType key_type = 1;
// This field MUST be present for this version of the IR.
// This field MUST NOT refer to UNDEFINED
optional TensorProto.DataType value_type = 2;
}
optional TypeProto value_type = 2;
};
// oneof { ... }
message UnionTypeProto {
// The type and optional shape of each alternative
// is described by a ValueInfoProto. The alternative
// names must be unique.
// This field MUST be present for this version of the IR.
repeated ValueInfoProto choice = 1;
};
oneof value {
// The type of a tensor.
@ -431,75 +510,22 @@ message TypeProto {
// The type of a sparse tensor.
SparseTensorTypeProto sparse_tensor_type = 2;
// The type of an opaque handle. A handle is used to represent a
// reference to a resource managed by the framework runtime.
HandleTypeProto handle_type = 3;
// NOTE: DNN-only implementations of ONNX MAY elect to not support non-tensor values
// as input and output to graphs and nodes. These types are needed to naturally
// support classical ML operators. DNN operators SHOULD restrict their input
// and output types to tensors.
// The type of a tuple.
TupleTypeProto tuple_type = 4;
// The type of a record.
RecordTypeProto record_type = 3;
// The type of a sequence.
SeqTypeProto seq_type = 5;
SequenceTypeProto sequence_type = 4;
// The type of a map.
MapTypeProto map_type = 6;
}
}
MapTypeProto map_type = 5;
message ValueProto {
// Defines a handle in its serialized format.
message HandleProto {
// This field MUST be present for this version of the IR.
optional int64 uid = 1;
// More information to be added. We need to specify the device
// that the resource managed by the handle is on.
}
// Defines a tuple in its serialized format.
message TupleProto {
repeated ValueProto elems = 1;
}
// Defines a sequence in its serialized format.
message SequenceProto {
repeated ValueProto elems = 1;
}
// Defines a map in its serialized format.
// Maps are serialized as two single-dimensional tensors
// for storage efficiency. The dimensions of each tensor MUST be identical
// and the key at position N corresponds to the value at position N.
// Keys SHOULD be unique. When a given key appears multiple times,
// the value that corresponds to the last occurrence of the key is used.
// This is consistent with protobuf3 encoding rules for map.
message MapProto {
// This field MUST be present for this version of the IR.
// The data type of the tensor MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING
optional TensorProto keys = 1;
// This field MUST be present for this version of the IR.
optional TensorProto values = 2;
}
oneof value {
// A dense tensor.
TensorProto dense_tensor = 1;
// A sparse tensor.
SparseTensorProto sparse_tensor = 2;
// A handle.
HandleProto handle = 3;
// A tuple.
TupleProto tuple = 4;
// A sequence.
SequenceProto seq = 5;
// A map.
MapProto map = 6;
// The type of a union.
UnionTypeProto union_type = 6;
}
}
@ -527,6 +553,12 @@ message FunctionDefProto {
// The named attributes of the function.
repeated AttributeProto attr = 5;
// An optional human-readable documentation for this node in the graph.
// This text MAY contain Markdown markup that conforms to http://commonmark.org/.
optional string doc_string = 6;
reserved 100 to 200; // for future extensions.
}
message SignatureDeclProto {
@ -551,6 +583,8 @@ message OperatorDeclProto {
// An optional human-readable documentation for this operator.
optional string doc_string = 3;
reserved 100 to 200; // for future extensions.
}
// A library is a top-level format that contains the declaration
@ -610,4 +644,6 @@ message LibraryProto {
// URIs or relative paths. Where such relative paths are rooted is defined by tools and
// runtime implementations.
repeated string imported_libraries = 12;
reserved 100 to 200; // for future extensions.
}

View file

@ -262,9 +262,9 @@ namespace CNTK
/// <param name="filepath"></param>
/// <param name="computeDevice"></param>
/// <param name="format"></param>
/// <returns></returns>
public static Function Load(string filepath, DeviceDescriptor computeDevice)
public static Function Load(string filepath, DeviceDescriptor computeDevice, ModelFormat format = ModelFormat.CNTKv2)
{
return _Load(filepath, computeDevice);
return _Load(filepath, computeDevice, format);
}
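For context, the underlying C++ API gains the matching format parameter in this change. A hedged usage sketch follows; the exact C++ signature is assumed to mirror the C# overload above.

#include "CNTKLibrary.h"

// Loads an ONNX model instead of the default CNTKv2 format.
CNTK::FunctionPtr LoadOnnxModel(const std::wstring& path)
{
    return CNTK::Function::Load(path, CNTK::DeviceDescriptor::CPUDevice(),
                                CNTK::ModelFormat::ONNX);
}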
/// <summary>