CNTK splice allows broadcast of its inputs; ONNX Concat does not. This case is handled in this change.
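In CNTK, splice implicitly broadcasts inputs of mismatched shapes before concatenating; ONNX Concat requires all input dimensions to match except along the concat axis, so the exporter must expand such inputs explicitly. A minimal sketch of the CNTK-side behavior, adapted from test_Concat_With_Broadcast below (assumes a working CNTK install):

import numpy as np
import cntk as C

x = C.input_variable((2, 3, 1, 1, 3))
y = C.constant(value=np.ones((1, 3, 4, 1), dtype=np.float32))
model = C.splice(x, y, axis=2)  # y is broadcast against x before the concat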

This commit is contained in:
liqfu 2018-08-26 08:41:20 -07:00
Parent 4a6238d979
Commit 0e208365be
5 changed files: 444 additions and 207 deletions

View file

@ -944,6 +944,7 @@ namespace CNTK
for (size_t i = 0; i < replacementShape.Rank(); ++i)
{
if (replacementShape[i] == NDShape::InferredDimension)
// TODO: shall NDShape::FreeDimension be considered here instead?
replacementShape[i] = 0;
}

View file

@ -63,6 +63,11 @@ private:
std::unordered_map<Variable, LotusIR::Node*>& variableNodes,
const std::unordered_map<Variable, Variable>& compositeOutputsMap);
// Create an ONNX NodeArg of desired shape with constant 0s as initial values.
// The NodeArg is used to expand inputs of a CNTK splice op to a desired shape via broadcast.
static LotusIR::NodeArg &AddZerosConstantNodeArg(Graph *graph, const string &nodeArgName,
const std::vector<int64_t> &shape, CNTK::DataType dataType);
static LotusIR::Node *AddReshapeNodeAccordingToONNXVersion(Graph *graph, const string &nodeName, NodeArg *input, NodeArg *output, const std::vector<int64_t>& newShape);
@ -90,9 +95,13 @@ private:
LotusIR::Graph* graph);
static LotusIR::Node *AddMatMulNode(LotusIR::NodeArg &nodeArg1, LotusIR::NodeArg &nodeArg2, LotusIR::Graph* graph,
const std::string &out_arg_name);
static LotusIR::Node *AddAddNode(LotusIR::NodeArg &nodeArg1, LotusIR::NodeArg &nodeArg2, LotusIR::Graph* graph,
const std::string &out_arg_name);
static LotusIR::Node *AddArgMaxNode(LotusIR::NodeArg &nodeArg, LotusIR::Graph* graph, int axis);
static LotusIR::Node *AddCastNode(LotusIR::NodeArg &nodeArg, LotusIR::Graph* graph, const std::string &toType);
static void BroadcastInputsIfNeeded(std::vector<LotusIR::NodeArg *> &orderedInputs, const FunctionPtr& src, LotusIR::Graph* graph);
//
// Insert a reshape node in front of a given node and its output node arg
//
@ -151,6 +160,8 @@ private:
std::set<FunctionPtr>& visited,
std::unordered_map<Variable, Variable>& compositeOutputsMap);
static void SetTensorType(onnx::TensorProto& dst, CNTK::DataType dataType);
//
// Copy the content of NDArrayView to TensorProto, and do the needed
// conversions.
@ -236,6 +247,12 @@ private:
//
static bool FilterInput(const FunctionPtr& src, const CNTK::Variable& input, size_t inputIndex);
//
// Converts an axis (in the CNTK C++ API sense) to an ONNX axis index, assuming the op may broadcast
// across multiple inputs. In that case, the highest axis index among the inputs is taken.
//
static int64_t ConvertAxisToOnnxBroadcastOfOp(const Axis &axis, const FunctionPtr &src);
//
// Converts axis (in CNTK C++ API sense) to index in ONNX sense
//
@ -256,6 +273,11 @@ private:
//
static LotusIR::Node* AddNode(const FunctionPtr& src, LotusIR::Graph* graph, const std::vector<LotusIR::NodeArg*>& inputs, const std::vector<LotusIR::NodeArg* >& outputs);
//
// Set node attributes for ReduceElements ops
//
static void SetReduceElementsAttributes(const FunctionPtr src, Node *node);
//
// Get ONNX 'pads' attribute value based on CNTK node's autoPadding attribute value.
//
@ -616,7 +638,7 @@ void AppendCNTKWeightToONNXTensor(DType *data, const NDShape &shape, onnx::Tenso
}
}
void SetTensorType(onnx::TensorProto& dst, CNTK::DataType dataType)
void CNTKToONNXHelper::SetTensorType(onnx::TensorProto& dst, CNTK::DataType dataType)
{
switch (dataType)
{
@ -1196,18 +1218,35 @@ bool IsUnSupportedLayerNormalization(const FunctionPtr src)
return cntkOpName == "LayerNormalization" && src->Output().HasSequenceAxis();
}
bool MatchOpSequence(const FunctionPtr src, std::vector<wstring> opSequence, FunctionPtr &op)
{
FunctionPtr currentOp = src;
for (auto opName : opSequence)
{
if (currentOp == nullptr || currentOp->OpName() != opName)
{
return false;
}
currentOp = currentOp->Inputs().size() == 1 ? currentOp->Inputs()[0].Owner() : nullptr;
}
op = currentOp;
return true;
}
// When importing ONNX models, we insert a sequence of ops to pack/unpack the batch/sequence axes.
// Those ops shall be removed here to create an equivalent ONNX model on export.
FunctionPtr SkipBatchAndSequenceAxisOp(const FunctionPtr src)
{
if ((src->OpName() == L"ToSequenceOp" && src->Inputs()[0].Owner() &&
src->Inputs()[0].Owner()->OpName() == L"ToBatchAxis") ||
(src->OpName() == L"UnpackBatchAxis" && src->Inputs()[0].Owner() &&
src->Inputs()[0].Owner()->OpName() == L"UnpackSequenceOp"))
return src->Inputs()[0].Owner()->Inputs()[0].Owner();
else if (src->OpName() == L"UnpackBatchAxis" && src->Inputs()[0].Owner() &&
src->Inputs()[0].Owner()->OpName() == L"Sequence::Slice")
return src->Inputs()[0].Owner();
else
return src;
std::vector<wstring> toSequenceBatchOps({ L"ToSequenceOp", L"ToBatchAxis", L"TransposeAxes" });
std::vector<wstring> unpackSequenceBatchOps({ L"TransposeAxes", L"UnpackBatchAxis", L"UnpackSequenceOp" });
// std::vector<wstring> unpackBatchSequenceSliceOps({ L"UnpackBatchAxis", L"Sequence::Slice" });
FunctionPtr op = src;
while (MatchOpSequence(op, toSequenceBatchOps, op) ||
MatchOpSequence(op, unpackSequenceBatchOps, op))
// || MatchOpSequence(op, unpackBatchSequenceSliceOps, op))
;
return op;
}
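For clarity, the chain-stripping loop above (note the empty-bodied while: MatchOpSequence both tests and advances op) can be sketched in Python; op_name, inputs, and owner are hypothetical stand-ins for the CNTK C++ accessors:

def match_op_sequence(op, op_sequence):
    # Walk down a chain of single-input ops; return (True, op_below_chain)
    # on a full match, else (False, original op). The C++ version writes the
    # result through an out-parameter and may end with a null op.
    current = op
    for name in op_sequence:
        if current is None or current.op_name != name:
            return False, op
        inputs = current.inputs
        current = inputs[0].owner if len(inputs) == 1 else None
    return True, current

def skip_batch_and_sequence_axis_ops(op):
    to_seq = ["ToSequenceOp", "ToBatchAxis", "TransposeAxes"]
    unpack = ["TransposeAxes", "UnpackBatchAxis", "UnpackSequenceOp"]
    matched = True
    while matched:  # keep stripping until neither chain matches
        matched, op = match_op_sequence(op, to_seq)
        if not matched:
            matched, op = match_op_sequence(op, unpack)
    return op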
bool IsBatchAxisOp(const FunctionPtr src)
@ -1236,7 +1275,7 @@ bool IsBatchAxisOp(const FunctionPtr src)
bool OpNeedONNXTypeMap(const std::string &cntkType)
{
const vector<string> ops({"And", "Equal", "Greater", "Less", "Not", "Or", "Xor", "Gather", "ArgMax", "ArgMin", "TopK"});
const vector<string> ops({"And", "Equal", "Greater", "Less", "Not", "Or", "Xor", "Gather", "ArgMax", "ArgMin", "TopK" });
for (auto o : ops)
{
if (cntkType == o)
@ -1365,6 +1404,16 @@ bool CNTKToONNXHelper::FilterInput(const FunctionPtr& src, const CNTK::Variable&
return false;
}
int64_t CNTKToONNXHelper::ConvertAxisToOnnxBroadcastOfOp(const Axis &axis, const FunctionPtr &src)
{
int64_t onnx_axis = 0;
for (int i = 0; i < src->Inputs().size(); i++)
{
onnx_axis = std::max(onnx_axis, ConvertAxisToOnnx(axis, src->Inputs()[i]));
}
return onnx_axis;
}
/*
CNTK Python static axes are zero based. The batch and sequence axes are not static axes.
The CNTK C++ API gets static axes in a sanitized form (e.g. -axis - 1, via sanitize_axis).
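To make the mapping concrete, here is a hedged Python sketch of the conversion as described above: CNTK stores static dims in reverse of ONNX order, and the exported tensor gains the dynamic (sequence/batch) axes in front. ConvertAxisToOnnxBroadcastOfOp then takes the max of this index across inputs, since ONNX broadcasting right-aligns lower-rank inputs. The exact formula is an assumption inferred from the surrounding code:

def convert_axis_to_onnx(static_axis, static_rank, num_dynamic_axes):
    # Assumed mapping: reverse the static axis order, then shift past the
    # dynamic axes that lead the exported ONNX tensor.
    return (static_rank - 1 - static_axis) + num_dynamic_axes

def convert_axis_to_onnx_broadcast_of_op(static_axis, inputs):
    # inputs: list of (static_rank, num_dynamic_axes) pairs, one per input
    return max(convert_axis_to_onnx(static_axis, r, d) for r, d in inputs)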
@ -2321,6 +2370,44 @@ LotusIR::Node *CNTKToONNXHelper::CreateRNNNode(const FunctionPtr &src,
return squeezedRNNNode;
}
// Create an ONNX NodeArg of desired shape with constant 0s as initial values.
LotusIR::NodeArg &CNTKToONNXHelper::AddZerosConstantNodeArg(Graph *graph, const string &nodeArgName,
const std::vector<int64_t> &shape, CNTK::DataType dataType)
{
onnx::TypeProto shapeInputArgType = ToTypeProto(shape, false);
shapeInputArgType.mutable_tensor_type()->set_elem_type(ConvertDataTypeCNTKToTensorProto(dataType));
LotusIR::NodeArg &shapeInputArg = graph->GetOrCreateNodeArg(nodeArgName, &shapeInputArgType);
onnx::TensorProto dstTensor;
dstTensor.set_name(shapeInputArg.Name());
dstTensor.set_data_type(ConvertDataTypeCNTKToTensorProto(dataType));
if (std::any_of(shape.begin(), shape.end(), [](int64_t dim) {return dim <= 0; }))
LogicError("Invalid splice inputs shape");
int64_t totalSize = std::accumulate(shape.begin(), shape.end(), (int64_t)1, std::multiplies<int64_t>());
switch (dataType)
{
case CNTK::DataType::Float16:
dstTensor.mutable_int32_data()->Resize((int)totalSize, 0);
break;
case CNTK::DataType::Float:
dstTensor.mutable_float_data()->Resize((int)totalSize, (float)0);
break;
case CNTK::DataType::Double:
dstTensor.mutable_double_data()->Resize((int)totalSize, 0);
break;
default:
NOT_IMPLEMENTED;
}
for (int index = 0; index < shape.size(); index++)
*(dstTensor.mutable_dims()->Add()) = shape[index];
graph->AddInitializedTensor(dstTensor);
return shapeInputArg;
}
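Why a zeros initializer works as the broadcaster: ONNX Add uses numpy-style broadcasting, so adding a zeros tensor of the target shape expands an input without changing its values (a TODO further down notes switching to the ONNX Expand op once available). A numpy sketch of the identity being relied on:

import numpy as np

x = np.arange(6, dtype=np.float32).reshape(1, 3, 2)
target = (4, 3, 2)
expanded = x + np.zeros(target, dtype=np.float32)  # broadcast, values unchanged
assert expanded.shape == target
assert np.array_equal(expanded[0], x[0])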
LotusIR::Node *CNTKToONNXHelper::AddReshapeNodeAccordingToONNXVersion(Graph *graph, const string &nodeName, NodeArg *input, NodeArg *output, const std::vector<int64_t> &newShape)
{
if (IsONNX1_2Supported())
@ -2384,6 +2471,16 @@ LotusIR::Node *CNTKToONNXHelper::AddMatMulNode(LotusIR::NodeArg &nodeArg1, Lotus
return argMatMulNode;
}
LotusIR::Node *CNTKToONNXHelper::AddAddNode(LotusIR::NodeArg &nodeArg1, LotusIR::NodeArg &nodeArg2, LotusIR::Graph* graph,
const std::string &out_arg_name)
{
LotusIR::NodeArg &outputArg = graph->GetOrCreateNodeArg(out_arg_name, nullptr);
LotusIR::Node* addNode = graph->AddNode(
nodeArg1.Name() + string("_add"), "Add", "", { &nodeArg1, &nodeArg2 }, { &outputArg });
return addNode;
}
LotusIR::Node *CNTKToONNXHelper::AddArgMaxNode(LotusIR::NodeArg &nodeArg, LotusIR::Graph* graph, int axis)
{
// LotusIR::NodeArg inputArg(nodeArg.Name(), nullptr);
@ -2583,13 +2680,16 @@ LotusIR::Node* CNTKToONNXHelper::CreateNode(const FunctionPtr& initialSrc,
std::unordered_map<Variable, LotusIR::Node*>& variableNodes,
const std::unordered_map<Variable, Variable>& compositeOutputsMap)
{
auto iter = functionNodes.find(initialSrc);
// Try to skip batch and sequence pack/unpack ops.
FunctionPtr src = SkipBatchAndSequenceAxisOp(initialSrc);
if (!src)
// TODO: it could be an input NodeArg.
return nullptr;
auto iter = functionNodes.find(src);
if (iter != functionNodes.end())
return iter->second;
// Try to skip batch and sequence pack/unpack ops.
FunctionPtr src = SkipBatchAndSequenceAxisOp(initialSrc);
LotusIR::Node* functionNode = nullptr;
std::string cntkOpName = ToLegacyString(ToUTF8(src->OpName()));
std::string onnxOpName = ToOPName(src);
@ -2652,7 +2752,23 @@ LotusIR::Node* CNTKToONNXHelper::CreateNode(const FunctionPtr& initialSrc,
if (IsBatchAxisOp(src))
return CreateNodeForBatchAxisOp(src, graph, functionNodes, variableNodes, compositeOutputsMap);
else
LogicError("Node '%S': Unsupported outside the context of batch axis ops.", src->AsString().c_str());
{
// This is a normal use of UnpackBatchAxis. ONNX does not treat the batch axis specially, so
// we simply skip the op.
auto blockMapping = src->Inputs()[0].BlockFunctionVariableMapping();
if (blockMapping.IsInitialized())
return CreateNode(blockMapping.Owner(),
graph,
functionNodes,
variableNodes,
compositeOutputsMap);
else if (src->Inputs()[0].Owner())
return CreateNode(src->Inputs()[0].Owner(),
graph,
functionNodes,
variableNodes,
compositeOutputsMap);
}
}
//
@ -2696,6 +2812,30 @@ LotusIR::Node* CNTKToONNXHelper::CreateNode(const FunctionPtr& initialSrc,
return functionNode;
}
Variable SkipBatchPackUnpack(Variable input)
{
if (input.Owner() &&
(input.Owner()->OpName() == L"UnpackBatchAxis" || input.Owner()->OpName() == L"ToBatchAxis"))
{
return input.Owner()->Inputs()[0];
}
else
return input;
}
bool TryMatchNodeArgType(onnx::TypeProto &argType, LotusIR::Graph* graph, const std::string &nodeArgName)
{
const NodeArg* inputNodeArg = graph->FindNodeArg(nodeArgName);
if (inputNodeArg)
{
onnx::TensorProto_DataType inputType = inputNodeArg->TypeAsProto()->tensor_type().elem_type();
argType.mutable_tensor_type()->set_elem_type(inputType);
return true;
}
return false;
}
void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
LotusIR::Graph* graph,
std::unordered_map<FunctionPtr, LotusIR::Node*>& functionNodes,
@ -2717,6 +2857,9 @@ void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
LogicError("Node '%S': Placeholder isn't supported currently.", src->AsString().c_str());
}
// UnpackBatchAxis and ToBatchAxis are no-ops in ONNX
input = SkipBatchPackUnpack(input);
// Special case handling of LayerNormalization layer because it changes
// ops dynamically based on value of inputs. If more such cases ops are seen,
// this should be abstracted out from here.
@ -2743,6 +2886,13 @@ void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
bool isConstant = (input.IsParameter() || input.IsConstant()) &&
!Operators::IgnoreConstantAndParameter(src->OpName(), inputIndex);
//
// If this input is an output, then it is the output of an upstream node. Recursively add all upstream nodes;
// in effect, we are doing a DFS.
//
if (input.IsOutput())
CreateNode(input.Owner(), graph, functionNodes, variableNodes, compositeOutputsMap);
onnx::TypeProto inputArgType;
if (cntkOpName == "Splice")
@ -2770,7 +2920,16 @@ void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
(*inputArgType.mutable_tensor_type()->mutable_shape()->mutable_dim())[0].set_dim_param(FreeSequenceDimParam);
}
if (OpNeedONNXTypeMap(cntkOpName))
// TODO: if it is an identity op, we shall peek at its input node to find the correct tensor element type.
if (onnxOpName == "Identity")
{
// Shall match the type of the NodeArg of the same name from upstream.
string inputNodeArgName = ToLegacyString(ToUTF8(input.Uid()));
if (!TryMatchNodeArgType(inputArgType, graph, inputNodeArgName))
UpdateONNXType(src->Inputs()[0].GetDataType(), inputArgType);
}
else if (OpNeedONNXTypeMap(cntkOpName))
{
MapAndUpdateONNXType(onnxOpName, true, inputIndex, input.GetDataType(), inputArgType);
}
@ -2779,6 +2938,26 @@ void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
UpdateONNXType(input.GetDataType(), inputArgType);
}
//
// Leaf nodes are data entry to the graph and need their own node with only output arg.
//
if (isConstant)
{
if (variableNodes.find(input) == variableNodes.end())
{
if (input.IsParameter() || input.IsConstant())
{
auto srcTensor = input.IsParameter() ? Parameter(input).Value() : Constant(input).Value();
onnx::TensorProto dstTensor;
dstTensor.set_name(inputName);
CopyTensor(srcTensor, dstTensor, &inputArgType);
graph->AddInitializedTensor(dstTensor);
}
}
}
LotusIR::NodeArg &inputArg = graph->GetOrCreateNodeArg(inputName, &inputArgType);
inputs.push_back(&inputArg);
@ -2824,32 +3003,6 @@ void CNTKToONNXHelper::ProcessInputs(const FunctionPtr& src,
*(dstTensor.mutable_dims()->Add()) = newShapeVec.size();
graph->AddInitializedTensor(dstTensor);
}
//
// Leaf nodes are data entry to the graph and need their own node with only output arg.
//
if (isConstant)
{
if (variableNodes.find(input) == variableNodes.end())
{
if (input.IsParameter() || input.IsConstant())
{
auto srcTensor = input.IsParameter() ? Parameter(input).Value() : Constant(input).Value();
onnx::TensorProto dstTensor;
dstTensor.set_name(inputName);
CopyTensor(srcTensor, dstTensor, &inputArgType);
graph->AddInitializedTensor(dstTensor);
}
}
}
//
// If this input is an output, then it is the output of an upstream node. Recursively add all upstream nodes;
// in effect, we are doing a DFS.
//
else if (input.IsOutput())
CreateNode(input.Owner(), graph, functionNodes, variableNodes, compositeOutputsMap);
}
}
@ -2861,7 +3014,14 @@ void CNTKToONNXHelper::ProcessOutputs(const FunctionPtr& src,
for (const auto& output : src->Outputs())
{
auto outputArgType = ToTypeProto(output.Shape(), output.HasBatchAxis(), output.HasSequenceAxis());
if (OpNeedONNXTypeMap(onnxOpName))
if (onnxOpName == "Identity")
{
// Shall match the type of this Identity node's input NodeArg.
string inputNodeArgName = ToLegacyString(ToUTF8(src->Inputs()[0].Uid()));
if (!TryMatchNodeArgType(outputArgType, graph, inputNodeArgName))
UpdateONNXType(src->Inputs()[0].GetDataType(), outputArgType);
}
else if (OpNeedONNXTypeMap(onnxOpName))
{
MapAndUpdateONNXType(onnxOpName, false, outputIndex, output.GetDataType(), outputArgType);
}
@ -3036,28 +3196,9 @@ void CNTKToONNXHelper::CopyAttributes(const FunctionPtr& src, LotusIR::Node* nod
node->AddAttribute(attributesMap[L"newShape"], ToINTS(shape));
}
}
else if ((src->OpName() == L"ReduceL1") || (src->OpName() == L"ReduceL2") || (src->OpName() == L"ReduceSumSquare"))
if (src->OpName() == L"ReduceL1" || src->OpName() == L"ReduceL2" || src->OpName() == L"ReduceSumSquare")
{
auto keepReducedDimensions = (int64_t)((bool) src->Attributes()[L"reductionKeepDimensions"].Value<bool>() ? 1 : 0);
std::vector<Axis> reductionAxes;
if (src->Attributes().Contains(L"axisVec"))
reductionAxes = AsVector<Axis>(src->Attributes()[L"axisVec"].Value<std::vector<DictionaryValue>>());
else if (src->Attributes().Contains(L"axis"))
reductionAxes.push_back((Axis)(src->Attributes()[L"axis"].Value<Axis>()));
// Reduction on batch axis in CNTK removes the batch axis, even if keepdims is true.
// For ONNX export we need to make sure we export keepdims as 0 (false).
// The same applies for AllStaticAxes.
if (reductionAxes.size() == 1
&& (reductionAxes[0] == Axis::DefaultBatchAxis()
|| reductionAxes[0] == Axis::AllStaticAxes()
|| reductionAxes[0] == Axis::AllAxes()))
keepReducedDimensions = 0;
node->AddAttribute(attributesMap[L"keepdims"], keepReducedDimensions);
std::vector<int64_t> axes = ConvertAxesToOnnx(reductionAxes, src->Inputs()[0]);
node->AddAttribute("axes", axes);
SetReduceElementsAttributes(src, node);
}
else if (src->OpName() == L"TransposeAxes")
{
@ -3117,7 +3258,7 @@ void CNTKToONNXHelper::CopyAttributes(const FunctionPtr& src, LotusIR::Node* nod
else if (src->OpName() == L"Splice")
{
Axis axis = (Axis)(src->Attributes()[L"axis"].Value<Axis>());
int64_t axisIndex = ConvertAxisToOnnx(axis, src->Inputs()[0]);
int64_t axisIndex = ConvertAxisToOnnxBroadcastOfOp(axis, src);
node->AddAttribute(attributesMap[L"axis"], axisIndex);
}
else if (src->OpName() == L"Slice")
@ -3432,58 +3573,71 @@ void CNTKToONNXHelper::CopyAttributes(const FunctionPtr& src, LotusIR::Node* nod
}
else if (src->OpName() == L"ReduceElements")
{
wstring cntkAttributeOpName = (wstring)src->Attributes()[PrimitiveFunctionAttribute::AttributeNameReductionOpName].Value<wstring>();
const AttributesMapping& attributeMap = Operators::FindAttributeMap(src->OpName(), cntkAttributeOpName);
auto keepReducedDimensions = (int64_t)((bool)src->Attributes()[L"reductionKeepDimensions"].Value<bool>() ? 1 : 0);
// hack to make reduction with sequence axis pass bi-directional broadcast
if (node->OpType() == "ReduceMean" && src->Inputs()[0].HasSequenceAxis())
{
keepReducedDimensions = 1;
}
if (src->Attributes().Contains(L"axisVec"))
{
std::vector<Axis> reductionAxes;
reductionAxes = AsVector<Axis>(src->Attributes()[L"axisVec"].Value<std::vector<DictionaryValue>>());
// Reduction on batch axis in CNTK removes the batch axis, even if keepdims is true.
// For ONNX export we need to make sure we export keepdims as 0 (false).
// The same applies for AllStaticAxes.
if (reductionAxes.size() == 1
&& (reductionAxes[0] == Axis::DefaultBatchAxis()
|| reductionAxes[0] == Axis::AllStaticAxes()
|| reductionAxes[0] == Axis::AllAxes()))
keepReducedDimensions = 0;
std::vector<int64_t> axes = ConvertAxesToOnnx(reductionAxes, src->Inputs()[0]);
node->AddAttribute("axes", axes);
}
else if (src->Attributes().Contains(L"axis"))
{
// py axis -> cpp (-axis -1) -> normalize (rank + axis)
Axis axis = (Axis)(src->Attributes()[L"axis"].Value<Axis>());
// Reduction on batch axis in CNTK removes the batch axis, even if keepdims is true.
// For ONNX export we need to make sure we export keepdims as 0 (false).
// The same applies for All axes
if (axis == Axis::DefaultBatchAxis() || axis == Axis::AllAxes() || axis == Axis::AllStaticAxes())
keepReducedDimensions = 0;
if (node->OpType() != "ArgMax" && node->OpType() != "ArgMin")
{
std::vector<int64_t> axes = ConvertAxesToOnnx(std::vector<Axis>({ axis }), src->Inputs()[0]);
node->AddAttribute("axes", axes);
}
else
{
int64_t ax = ConvertAxisToOnnx(axis, src->Inputs()[0]);
node->AddAttribute("axis", ax);
}
}
node->AddAttribute("keepdims", keepReducedDimensions);
SetReduceElementsAttributes(src, node);
}
}
}
void CNTKToONNXHelper::SetReduceElementsAttributes(const FunctionPtr src, Node *node)
{
std::wstring reductionOpName = src->OpName();
if (reductionOpName == L"ReduceElements")
{
reductionOpName = src->Attributes()[L"reductionOpName"].Value<wstring>();
}
auto keepReducedDimensions = (int64_t)((bool)src->Attributes()[L"reductionKeepDimensions"].Value<bool>() ? 1 : 0);
bool forceKeepReducedDimensions = false;
if (src->Inputs()[0].HasSequenceAxis())
{
// TODO: IMPORTANT. This is a workaround related to how batch/sequence axes are unpacked and broadcast.
// In general, batch/sequence axes are moved to static axis positions during unpacking and broadcasting.
// As a result, a tensor may end up with duplicated batch/sequence axes.
// This happens most often when there is a sequence axis. Setting keepdims to 1 avoids
// some of the cases, but it is not a complete solution.
// Roughly, here is test code that would fail without this workaround:
// shape = (2, )
// batch_size = 1
// seq_len = 1
// data = generate_sequential_data((batch_size, seq_len, *shape))
// x1 = C.sequence.input_variable(shape)
// x1_reduced = C.reduce_mean(x1, 0, keepdims = False)
// model = x1 + x1_reduced
// model = C.reduce_mean(model, 0, keepdims = False)
// model.save(tmpdir + "/broadcast_sequence.onnx", format = C.ModelFormat.ONNX)
// loaded_model = C.Function.load(tmpdir + "/broadcast_sequence.onnx", format = C.ModelFormat.ONNX)
// o1 = loaded_model.eval({ loaded_model.arguments[0]: data })
keepReducedDimensions = 1;
forceKeepReducedDimensions = true;
}
std::vector<Axis> reductionAxes;
if (src->Attributes().Contains(L"axisVec"))
reductionAxes = AsVector<Axis>(src->Attributes()[L"axisVec"].Value<std::vector<DictionaryValue>>());
else if (src->Attributes().Contains(L"axis"))
reductionAxes.push_back((Axis)(src->Attributes()[L"axis"].Value<Axis>()));
// Reduction on batch axis in CNTK removes the batch axis, even if keepdims is true.
// For ONNX export we need to make sure we export keepdims as 0 (false).
// The same applies for AllStaticAxes.
if (!forceKeepReducedDimensions &&
(reductionAxes.size() == 1
&& (reductionAxes[0] == Axis::DefaultBatchAxis()
|| reductionAxes[0] == Axis::AllStaticAxes()
|| reductionAxes[0] == Axis::AllAxes())))
keepReducedDimensions = 0;
std::vector<int64_t> axes = ConvertAxesToOnnx(reductionAxes, src->Inputs()[0]);
if (reductionOpName == L"Argmax" || reductionOpName == L"Argmin")
node->AddAttribute("axis", axes[0]);
else if (reductionAxes[0] != Axis::AllAxes())
node->AddAttribute("axes", axes);
node->AddAttribute("keepdims", keepReducedDimensions);
}
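The keepdims fix-up above exists because CNTK drops the batch axis when reducing over it even with keepdims enabled, while an ONNX reducer with keepdims=1 would retain a size-1 dimension. numpy follows the ONNX semantics, which makes the required export value easy to see:

import numpy as np

a = np.ones((4, 3, 2), dtype=np.float32)  # think [batch, d1, d2]
assert np.mean(a, axis=0, keepdims=True).shape == (1, 3, 2)  # ONNX keepdims=1
assert np.mean(a, axis=0, keepdims=False).shape == (3, 2)    # matches CNTK; export keepdims=0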
void CNTKToONNXHelper::PutAutopadOrPadAttrInNode(LotusIR::Node* node,
const std::vector<bool>& autoPadding, const NDShape& kernelShape, bool ceilOutDim)
{
@ -3550,6 +3704,86 @@ LotusIR::Node* FindByName(LotusIR::Graph* graph, const std::string &name)
return nullptr;
}
std::vector<int64_t> GetShapeFromNodeArg(LotusIR::NodeArg *nodeArg)
{
std::vector<int64_t> shape;
const TypeProto *typeProto = nodeArg->TypeAsProto();
for (int dim = 0; dim < typeProto->tensor_type().shape().dim_size(); dim++)
{
shape.push_back(typeProto->tensor_type().shape().dim()[dim].dim_value());
}
return shape;
}
// CNTK splice allows broadcast of inputs before applying concatenation.
// ONNX Concat is limited to the matching-shape case,
// i.e. input dimensions must be equal except along the concatenation axis.
// For an example, see test_Concat_With_Broadcast in onnx_op_test.py.
void CNTKToONNXHelper::BroadcastInputsIfNeeded(std::vector<LotusIR::NodeArg *> &orderedInputs, const FunctionPtr& src, LotusIR::Graph* graph)
{
if (src->OpName() != L"Splice")
return;
Axis axis = (Axis)(src->Attributes()[L"axis"].Value<Axis>());
int64_t concatAxis = ConvertAxisToOnnxBroadcastOfOp(axis, src);
std::vector<std::vector<int64_t>> shapes;
int max_rank = 0;
for (auto nodeArg : orderedInputs)
{
shapes.push_back(GetShapeFromNodeArg(nodeArg));
max_rank = std::max(max_rank, (int)shapes.rbegin()->size());
}
std::vector<int64_t> broadcast_shape(max_rank, 1);
for (int i = 0; i < shapes.size(); i++)
{
std::vector<int64_t> &shape_i = shapes[i];
for (int index_to_shape_i = 0; index_to_shape_i < shape_i.size(); index_to_shape_i++)
{
int onnx_axis = index_to_shape_i + (max_rank - shape_i.size());
if (onnx_axis == concatAxis)
// only check and update non-concat-axis dimensions
continue;
else if (broadcast_shape[onnx_axis] == 1)
broadcast_shape[onnx_axis] = shape_i[index_to_shape_i];
else if (broadcast_shape[onnx_axis] != shape_i[index_to_shape_i] && shape_i[index_to_shape_i] != 1)
LogicError("Invalid splice inputs shape");
}
}
// TODO: use the ONNX Expand op once ONNX version 7 is supported.
// Without the Expand op, we create a zeros constant of the expected shape and apply a broadcast Add
// to get the input to the right shape for concatenation.
for (int i = 0; i < orderedInputs.size(); i++)
{
std::vector<int64_t> &shape_i = shapes[i];
bool need_broadcast = shape_i.size() < max_rank;
while (shape_i.size() < max_rank)
shape_i.insert(shape_i.begin(), 1);
for (int onnx_axis = 0; onnx_axis < shape_i.size(); onnx_axis++)
{
if (onnx_axis != concatAxis && shape_i[onnx_axis] != broadcast_shape[onnx_axis])
{
shape_i[onnx_axis] = broadcast_shape[onnx_axis];
need_broadcast = true;
}
}
if (!need_broadcast)
continue;
LotusIR::NodeArg *nodeArg = orderedInputs[i];
// We insert an "Add" with broadcast to get desired shape that can be accepted by ONNX Concat.
LotusIR::NodeArg &nodeArg2 = AddZerosConstantNodeArg(graph, nodeArg->Name() + "_braodcast_for_desired_shape",
shape_i, src->Inputs()[i].GetDataType());
const std::string out_arg_name = nodeArg->Name() + "_post_braodcasted_with_desired_shape";
LotusIR::Node *node = AddAddNode(*nodeArg, nodeArg2, graph, out_arg_name);
orderedInputs[i] = const_cast<NodeArg*>(node->OutputDefs()[0]);
}
}
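The shape arithmetic in BroadcastInputsIfNeeded can be summarized in a small Python sketch: right-align all ranks, take the non-1 dimension everywhere except the concat axis, and reject true mismatches. This mirrors the loops above as an illustration, not the exporter code:

def broadcast_shape_for_concat(shapes, concat_axis):
    max_rank = max(len(s) for s in shapes)
    out = [1] * max_rank
    for s in shapes:
        offset = max_rank - len(s)
        for i, d in enumerate(s):
            axis = i + offset
            if axis == concat_axis:
                continue  # inputs may differ along the concat axis
            if out[axis] == 1:
                out[axis] = d
            elif d != 1 and d != out[axis]:
                raise ValueError("Invalid splice inputs shape")
    return out

# e.g. two inputs concatenated along the last axis; the concat-axis entry
# stays 1 because each input keeps its own dimension there:
print(broadcast_shape_for_concat([[2, 1, 3], [4, 3]], concat_axis=2))  # [2, 4, 1]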
LotusIR::Node* CNTKToONNXHelper::AddNode(const FunctionPtr& src, LotusIR::Graph* graph, const std::vector<LotusIR::NodeArg *>& inputs, const std::vector<LotusIR::NodeArg *>& outputs)
{
LotusIR::Node* node = nullptr;
@ -3672,6 +3906,11 @@ LotusIR::Node* CNTKToONNXHelper::AddNode(const FunctionPtr& src, LotusIR::Graph*
node = graph->AddNode(nodeName + string("_add"), "Add",
"", { &mulTensorOutputArg, input2 }, { &addTensorOutputArg });
}
else if (src->OpName() == L"Splice")
{
BroadcastInputsIfNeeded(orderedInputs, src, graph);
node = graph->AddNode(nodeName, ToOPName(src), "", orderedInputs, outputs);
}
else
node = graph->AddNode(nodeName, ToOPName(src), "", orderedInputs, outputs);
}

View file

@ -59,7 +59,7 @@ private:
static bool FixConstantShapeForConstantVariableInputPair(const std::vector<Variable> &inputs,
std::vector<Variable> &fixedInputs);
static const Node *GetChildNode(const Node *parentNode, const NodeArg *nodeArg);
static const Node *GetChildNode(const Node *parentNode, const NodeArg *nodeArg, int &nodeArgIndex);
static std::vector<Axis> AttributeProtoToAxes(const AttributeProto &attributeProto);
static Axis AttributeProtoToAxis(const AttributeProto &attributeProto);
@ -132,6 +132,8 @@ private:
static std::pair<std::vector<size_t>, std::vector<size_t>> AdjustONNXPadsVecForCNTKPadOp(const Variable &operand, std::vector<int64_t> &pads);
static NDShape ReverseShape(const NDShape &shape);
static std::pair<std::vector<Axis>, bool> GetReduceElementsAttributes(const Node *node, const Variable &input);
static std::pair<Variable, Variable> BroadcastElementWiseInput(const Node *node,
const Variable &input0, const Variable &input1);
@ -601,14 +603,16 @@ const CNTK::Constant CNTK::ONNXToCNTKHelper::CreateConstantWithTensorData(CNTK::
}
}
const Node *ONNXToCNTKHelper::GetChildNode(const Node *parentNode, const NodeArg *nodeArg)
const Node *ONNXToCNTKHelper::GetChildNode(const Node *parentNode, const NodeArg *nodeArg, int &nodeArgIndex)
{
Node::NodeConstIterator itChildNode = parentNode->InputNodesBegin();
for (; itChildNode != parentNode->InputNodesEnd(); ++itChildNode)
{
const Node *childNode = *itChildNode;
const ConstPointerContainer<std::vector<NodeArg *>> &childOutputDefs = childNode->OutputDefs();
for (ConstPointerContainer<std::vector<NodeArg *>>::ConstIterator itChildOutput = childOutputDefs.begin(); itChildOutput != childOutputDefs.end(); ++itChildOutput)
nodeArgIndex = 0;
for (ConstPointerContainer<std::vector<NodeArg *>>::ConstIterator itChildOutput = childOutputDefs.begin();
itChildOutput != childOutputDefs.end(); ++itChildOutput, nodeArgIndex++)
{
const NodeArg *childOutput = *itChildOutput;
if (childOutput == nodeArg)
@ -1838,6 +1842,29 @@ std::pair<Variable, Variable> ONNXToCNTKHelper::BroadcastElementWiseInput(
}
}
std::pair<std::vector<Axis>, bool> ONNXToCNTKHelper::GetReduceElementsAttributes(const Node *node, const Variable &input)
{
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), input);
// use default of all axes according to ONNX
if (axes.empty())
{
if (keepdims)
axes = vector<Axis>({ Axis::AllAxes() });
else
{
// In the case of keepdims being false, CNTK does not allow reduce on Axis::AllAxes().
// We have to list out all axes instead.
if (input.DynamicAxes().size() != 0)
LogicError("ReduceElements with default on all axes is not supported with input of dynamic axis.");
axes.resize(input.Shape().Rank());
std::generate(axes.begin(), axes.end(), [static_axis = 0]() mutable { return Axis(static_axis++); });
}
}
return std::make_pair(axes, keepdims);
}
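Per ONNX, a reducer with no axes attribute reduces over all axes; since CNTK cannot combine Axis.AllAxes() with keepdims=False, the import path enumerates every static axis instead. A numpy check of the equivalence the code relies on:

import numpy as np

a = np.random.rand(2, 3, 4).astype(np.float32)
all_axes = tuple(range(a.ndim))  # the explicit list the importer generates
assert np.sum(a, axis=all_axes, keepdims=False).shape == ()
assert np.isclose(np.sum(a, axis=all_axes), np.sum(a))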
Axis ONNXToCNTKHelper::ConvertONNXAxisToCNTKCppApi(int64_t axis, const Variable &operand)
{
// reverse CNTKToONNXHelper::ConvertAxisToOnnx
@ -2487,127 +2514,91 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
}
else if (onnxOpName == "ReduceMax")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceMax(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceMin")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceMin(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceSum")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceSum(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceMean")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceMean(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceProd")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceProd(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceLogSumExp" || onnxOpName == "ReduceLogSum")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceLogSum(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceL1")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceL1(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceL2")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceL2(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
else if (onnxOpName == "ReduceSumSquare")
{
std::vector<Axis> axes = ConvertONNXAxesToCNTKCppApi(GetNamedAttributeAsInt64Vec(node, "axes", vector<int64_t>({})), inputs[0]);
bool keepdims;
std::vector<Axis> axes;
// use default of all axes according to ONNX
if (axes.empty())
{
axes = vector<Axis>({ Axis::AllAxes() });
}
std::tie<std::vector<Axis>, bool>(axes, keepdims) = GetReduceElementsAttributes(node, inputs[0]);
bool keepdims = GetNamedAttributeAsInt64(node, "keepdims", 1) == 1;
FunctionPtr cntkFunction = ReduceSumSquare(inputs[0], axes, keepdims, ToFixedWStringFromMultiByte(node->Name()));
return cntkFunction;
}
@ -2636,7 +2627,8 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
newShape = GetShapeFromInput(node->InputDefs()[1], graph);
}
const Node *childNode = GetChildNode(node, node->InputDefs()[0]);
int nodeArgIndexDummy = 0;
const Node *childNode = GetChildNode(node, node->InputDefs()[0], nodeArgIndexDummy);
if (childNode != nullptr && Operators::IsRNNOp(childNode->OpType()))
{
// Adjust for batch and sequence axes swap between CNTK and ONNX.
@ -2883,26 +2875,6 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
}
}
std::pair<const Node *, int> FindParent(const Node *node)
{
Node::NodeConstIterator it = node->OutputNodesBegin();
if (it != node->OutputNodesEnd())
{
const Node *parent = *it;
int index = 0;
for (auto nodeArg : parent->InputDefs())
{
// TODO: Check whether we should use node output arg name for the check below.
if (nodeArg->Name() == node->Name())
{
return std::make_pair(parent, index);
}
index++;
}
}
return std::make_pair(nullptr, -1);
}
std::pair<const Node *, int> FindParentAndChildIndex(const Node *node)
{
Node::NodeConstIterator it = node->OutputNodesBegin();
@ -3329,13 +3301,20 @@ std::vector<Variable> ONNXToCNTKHelper::CreateCNTKInputsStartingFromIndex(const
for (int i = startIndex; i < inputDefs.size(); i++)
{
const NodeArg *nodeArg = inputDefs[i];
const Node *inputNode = GetChildNode(node, nodeArg);
// nodeArg may be one of the outputDefs of another node, inputNode.
// In case there are multiple outputDefs, we need to know the index of nodeArg.
int nodeArgIndex = 0;
const Node *inputNode = GetChildNode(node, nodeArg, nodeArgIndex);
if (inputNode != nullptr)
{
ONNXToCNTKMap::iterator itNodeMap = constructedNodeMap.find(const_cast<Node *>(inputNode));
if (itNodeMap != constructedNodeMap.end())
{
inputs.insert(inputs.end(), itNodeMap->second.begin(), itNodeMap->second.end());
std::vector<FunctionPtr> inputCNTKFunctionPtrs = itNodeMap->second;
for (auto f : inputCNTKFunctionPtrs)
{
inputs.insert(inputs.end(), f->Outputs()[nodeArgIndex]);
}
}
else
{

View file

@ -508,6 +508,7 @@ namespace ONNX
{ L"Softsign",{ 0 } },
{ L"ImageScaler",{ 0, 1, 2, 3 } },
{ L"MeanVarianceNormalization",{ 0 } },
{ L"Sequence::Slice",{ 0, 1 } },
};
std::unordered_map<std::wstring, std::vector<int>> Operators::_cntkToONNXInputIndices = {

View file

@ -83,7 +83,8 @@ def verify_one_input(model, data, tmpdir, name, device=None, loaded_model=None,
# TODO: it is better to compare data.shape with model.arguments[0] and
# to pad batch dimension as needed.
if model.arguments[0].has_batch_axis():
# Some tests (e.g. the reduction tests) have already expanded the batch axis in the data.
if model.arguments[0].has_batch_axis() and type(data)!=list:
data.shape = (1, ) + data.shape
assert len(model.outputs) == len(loaded_model.outputs)
@ -444,6 +445,22 @@ def test_Concat(tmpdir, dtype):
verify_one_input(model, data1, tmpdir, 'Concat_1')
@pytest.mark.parametrize("dtype", DType_Config)
def test_Concat_With_Broadcast(tmpdir, dtype):
with C.default_options(dtype = dtype):
shape1 = [2,3,1,1,3]
shape2 = [1,3,4,1]
shape3 = [3,4,1]
axis = 2
data1 = np.random.uniform(-10, 10, shape1).astype(dtype)
data2 = np.random.uniform(-10, 10, shape2).astype(dtype)
data3 = np.random.uniform(-10, 10, shape3).astype(dtype)
x = C.input_variable(shape1)
y = C.constant(value=data2)
z = C.constant(value=data3)
model = C.splice(x, y, z, axis=axis)
verify_one_input(model, data1, tmpdir, 'Concat_Braodcast')
@pytest.mark.parametrize("dtype", DType_Config)
def test_Conv(tmpdir, dtype, device_id):
if device_id == -1 and dtype == np.float16: