CNTK V2 library: Enabled non-scalar initial values for PastValue/FutureValue nodes and other bug fixes

Author: Amit Agarwal 2016-09-17 14:27:27 -07:00
Parent: dfcade2d8c
Commit: 867ace750c
10 changed files: 249 additions and 82 deletions
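The headline change allows the initial state of a PastValue/FutureValue operation to be a non-scalar operand. Below is a minimal sketch of the newly permitted usage, not code from this commit; it assumes a factory along the lines of PastValue(operand, initialState, offset, name), as suggested by the shape-inference code further down, and uses illustrative names and shapes.

// Hedged sketch: the initial state may now be a non-scalar Constant rather than a scalar.
const size_t dim = 5;
auto input = CNTK::InputVariable({ dim }, CNTK::DataType::Float, L"input");
auto initialState = CNTK::Constant({ dim }, CNTK::DataType::Float, 0.0);
auto delayed = CNTK::PastValue(input, initialState, /*offset =*/ 1, L"pastValue");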

View file

@@ -286,7 +286,8 @@ namespace CNTK
///
std::wstring AsString() const
{
std::wstringstream wStrStream(L"{");
std::wstringstream wStrStream;
wStrStream << L"{";
for (size_t i = 0; i < Rank(); i++)
{
if (i != 0)
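Note on the change above: constructing a std::wstringstream with an initial L"{" payload leaves the stream's write position at the beginning, so subsequent << insertions overwrite the brace; streaming the opening brace explicitly after default-constructing the stream appends it as intended.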
@@ -2491,7 +2492,7 @@ namespace CNTK
///
/// Create an instance of the CNTK built-in splice operation to splice together all the specified tensor operands into a single output tensor
///
CNTK_API FunctionPtr Splice(const std::vector<Variable>& operands, size_t axis, const std::wstring& name = L"");
CNTK_API FunctionPtr Splice(const std::vector<Variable>& operands, const Axis& axis, const std::wstring& name = L"");
///
/// Create a new Function instance which just combines the outputs of the specified list of 'operands' Functions such that the 'Outputs' of the

View file

@@ -195,8 +195,6 @@ namespace CNTK
auto initialStateVar = Constant::Scalar(node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.push_back(initialStateVar);
}
else
LogicError("LoadLegacyModel: Currently loading models with non-scalar initial value for PastValueNode/FutureValueNode is unsupported");
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameOffset] = (size_t)node->As<PastValueNode<ElementType>>()->TimeStep();
opType = PrimitiveOpType::PastValue;
@@ -208,8 +206,6 @@ namespace CNTK
auto initialStateVar = Constant::Scalar(node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.push_back(initialStateVar);
}
else
LogicError("LoadLegacyModel: Currently loading models with non-scalar initial value for PastValueNode/FutureValueNode is unsupported");
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameOffset] = (size_t)node->As<FutureValueNode<ElementType>>()->TimeStep();
opType = PrimitiveOpType::FutureValue;

View file

@@ -134,6 +134,7 @@
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="Value.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BackCompat.cpp" />

View file

@@ -32,6 +32,7 @@
<ClInclude Include="API\CNTKLibraryExperimental.h">
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Value.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

View file

@@ -474,9 +474,8 @@ namespace CNTK
if (!axis1.IsStaticAxis() || !axis2.IsStaticAxis())
LogicError("TransposeAxes operation currently does not support transposing dynamic axes");
auto transposedTensorShape = AsTensorShape(inputs[0].Shape());
transposedTensorShape.SwapDimsInPlace(axis1.StaticAxisIndex(), axis2.StaticAxisIndex());
outputShape = AsNDShape(transposedTensorShape);
outputShape = inputs[0].Shape();
std::swap(outputShape[axis1.StaticAxisIndex()], outputShape[axis2.StaticAxisIndex()]);
break;
}
case PrimitiveOpType::Slice:
@@ -507,7 +506,7 @@ namespace CNTK
if ((axis.StaticAxisIndex() < outputTensorShape.GetRank()) && (0 <= realBeginIndex) && (realBeginIndex <= realEndIndex) && (realEndIndex <= sliceAxisDim))
outputTensorShape.NarrowTo(axis.StaticAxisIndex(), realBeginIndex, realEndIndex);
outputShape = AsNDShape(outputTensorShape);
outputShape = AsNDShape(outputTensorShape, /*allowNonFlattenableTensorShapes = */ true);
break;
}
case PrimitiveOpType::Reshape:
@@ -611,15 +610,11 @@ namespace CNTK
Variable inputOperandVar = inputs[0];
Variable initialStateVar = inputs[1];
// TODO: Currently we only support a scalar initial state
if (!initialStateVar.IsConstant() || (initialStateVar.Shape().Rank() > 0))
LogicError("Currently PastValue/FutureValue Function only supports scalar initial state");
// TODO: We currently only support input operand with 1 dynamic axis for PastValue/FutureValue
if (inputOperandVar.DynamicAxes().size() != 2)
LogicError("Currently PastValue/FutureValue Function only supports input operand with with 2 dynamic axis (1 sequence-axis and 1 batch-axis)");
outputShape = UnaryElementwiseOpOutputShape(inputs[0].Shape());
outputShape = BinaryElementwiseOpOutputShape(op, inputs[0].Shape(), inputs[1].Shape());
break;
}
case PrimitiveOpType::ReduceElements:
@@ -975,16 +970,11 @@ namespace CNTK
Variable inputOperandVar = functionInputs[0];
Variable initialStateVar = functionInputs[1];
// Get the initial state of the PastValue/FutureValue operation
ElementType initStateValue;
NDArrayView tempView({}, &initStateValue, 1, DeviceDescriptor::CPUDevice());
tempView.CopyFrom(*(Constant(initialStateVar).Value()));
size_t offset = primitiveFunction->Attributes()[PrimitiveFunction::AttributeNameOffset].Value<size_t>();
if (op == PrimitiveOpType::PastValue)
computationNodePtr = New<PastValueNode<ElementType>>(network->GetDeviceId(), functionName, (float)initStateValue, AsTensorShape(inputOperandVar.Shape()), offset);
computationNodePtr = New<PastValueNode<ElementType>>(network->GetDeviceId(), functionName, AsTensorShape(inputOperandVar.Shape()), offset);
else
computationNodePtr = New<FutureValueNode<ElementType>>(network->GetDeviceId(), functionName, (float)initStateValue, AsTensorShape(inputOperandVar.Shape()), offset);
computationNodePtr = New<FutureValueNode<ElementType>>(network->GetDeviceId(), functionName, AsTensorShape(inputOperandVar.Shape()), offset);
break;
}
@@ -1043,9 +1033,14 @@ namespace CNTK
// Let's reorder inputNodesBasePtrs properly since the ordering of inputs of CNTK internal ComputationNode may be different from the PrimitiveFunction inputs ordering
ReorderAsCNTKComputationNodeInputs(op, inputNodesBasePtrs);
if (computationNodePtr->Is<INumInputs>())
inputNodesBasePtrs.resize(computationNodePtr->As<INumInputs>()->GetExpectedNumInputs());
else if ((op == PrimitiveOpType::PastValue) || (op == PrimitiveOpType::FutureValue)) // TODO: Temporary hack to be replaced with support for non-scalar initial state value operands
inputNodesBasePtrs.resize(1);
{
auto computationNodeExpectedInputCount = computationNodePtr->As<INumInputs>()->GetExpectedNumInputs();
if (computationNodeExpectedInputCount != inputNodesBasePtrs.size())
LogicError("Input count mismatch: The Primitive function for op %s has %d inputs while the corresponding ComputationNode has %d inputs",
PrimitiveOpTypeName(op),
inputNodesBasePtrs.size(),
computationNodeExpectedInputCount);
}
network->AddNodeToNetAndAttachInputs(computationNodePtr, inputNodesBasePtrs);
@@ -1185,6 +1180,9 @@ namespace CNTK
}
}
#ifdef _DEBUG
m_computationNetwork->SetTraceLevel(1);
#endif
m_computationNetwork->CompileNetwork();
// Verify that the shapes of the output Variables that we computed match the corresponding nodes in the ComputationNetwork
@@ -1237,6 +1235,14 @@ namespace CNTK
if (var.DynamicAxes().size() > 2)
LogicError("More than 2 dynamic axis for a variable is currently unsupported");
//if (value->Data()->Shape().SubShape(0, var.Shape().Rank()) != var.Shape())
//{
// InvalidArgument("The %s dimensions of the Value shape (%s) do not match the shape of the variable (%s) that it corresponds to!",
// Internal::IsReversingTensorShapesInErrorMessagesEnabled() ? "trailing" : "leading",
// AsStringForErrorReporting(value->Data()->Shape()).c_str()),
// AsStringForErrorReporting(var.Shape()).c_str()));
//}
size_t maxNumTimeSteps = value->Data()->Shape()[var.Shape().Rank()];
size_t numSequences = value->Data()->Shape()[var.Shape().Rank() + 1];
@@ -1280,9 +1286,7 @@ namespace CNTK
currentSequenceLength++;
}
else
{
currentSequenceEndAlreadyFound = true;
}
}
sequenceLengths[i] = currentSequenceLength;
@@ -1595,13 +1599,36 @@ namespace CNTK
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
{
// TODO: How about zero argument functions?
// Validate arguments and outputs
if (outputs.empty())
InvalidArgument("CompositeFunction::Forward: At least one output has to be specified!");
// Make sure that the DataType of the variables and corresponding values match
// TODO: We need a better way to determine the ElementType for the network
auto dataType = arguments.begin()->second->Data()->GetDataType();
auto dataType = DataType::Unknown;
for (auto variableValuePair : arguments)
{
if (dataType == DataType::Unknown)
dataType = variableValuePair.first.GetDataType();
else if (dataType != variableValuePair.first.GetDataType())
LogicError("CompositeFunction::Forward: The DataType of all arguments of the Function must be same");
}
if (dataType == DataType::Unknown)
{
for (auto variableValuePair : outputs)
{
if (dataType == DataType::Unknown)
dataType = variableValuePair.first.GetDataType();
}
}
if (dataType == DataType::Float)
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor, true);
else
else if (dataType == DataType::Double)
GetComputationNetwork<double>(computeDevice, outputsToRetainBackwardStateFor, true);
else
InvalidArgument("Unsupported DataType %s", DataTypeName(dataType));
// TODO: Avoid copying the data when possible
@@ -2075,10 +2102,13 @@ namespace CNTK
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Select, std::vector<Variable>({ condition, leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Splice(const std::vector<Variable>& operands, size_t axis, const std::wstring& name /*= L""*/)
FunctionPtr Splice(const std::vector<Variable>& operands, const Axis& axis, const std::wstring& name /*= L""*/)
{
if (!axis.IsStaticAxis())
LogicError("Splice: Currently only splicing along a static axis is supported");
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameAxis] = Axis(axis);
additionalProperties[PrimitiveFunction::AttributeNameAxis] = axis;
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Splice, operands, std::move(additionalProperties), name), name);
}
@@ -2209,15 +2239,23 @@ namespace CNTK
FunctionPtr ZeroesLike(const Variable& operand)
{
if (operand.Shape().Rank() > 1)
LogicError("Internal::ZeroesLike: Currently only 1D inputs are supported!");
if (operand.IsSparse())
{
if (operand.Shape().Rank() > 1)
LogicError("Internal::ZeroesLike: Currently only 1D sparse inputs are supported!");
return Times(Constant({ 1, operand.Shape()[0] }, operand.GetDataType(), 0.0), operand);
}
else
{
auto rowSliceFunc = Internal::Slice(operand, Axis(0), 0, 1);
return Minus(rowSliceFunc, rowSliceFunc);
auto output = Minus(rowSliceFunc, rowSliceFunc);
// Reduce away all but the static axis 0
for (size_t i = 1; i < output->Output().Shape().Rank(); ++i)
output = ReduceSum(output, Axis(i));
return output;
}
}

View file

@@ -71,13 +71,16 @@ namespace CNTK
return DeviceDescriptor::GPUDevice(deviceId);
}
inline NDShape AsNDShape(const Microsoft::MSR::CNTK::TensorShape& tensorShape)
inline NDShape AsNDShape(const Microsoft::MSR::CNTK::TensorShape& tensorShape, bool allowNonFlattenableTensorShapes = false)
{
// The TensorShape should be flattenable to 1D
for (size_t i = 1; i < tensorShape.GetRank(); ++i)
if (!allowNonFlattenableTensorShapes)
{
if (!tensorShape.CanFlatten(i))
InvalidArgument("AsNDShape() can only be called for TensorShapes that can be flattened to 1D");
// The TensorShape should be flattenable to 1D
for (size_t i = 1; i < tensorShape.GetRank(); ++i)
{
if (!tensorShape.CanFlatten(i))
InvalidArgument("AsNDShape() can only be called for TensorShapes that can be flattened to 1D");
}
}
return std::vector<size_t>(tensorShape.GetDims().begin(), tensorShape.GetDims().end());
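The new allowNonFlattenableTensorShapes flag exists for call sites such as the Slice shape inference above, where NarrowTo can leave a TensorShape that is no longer flattenable to 1D. A hedged sketch of such a call, with someShape standing in for an arbitrary NDShape:

auto tensorShape = AsTensorShape(someShape);
tensorShape.NarrowTo(0, /*beginIndex =*/ 1, /*endIndex =*/ 3); // may leave a non-flattenable strided shape
auto ndShape = AsNDShape(tensorShape, /*allowNonFlattenableTensorShapes =*/ true);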

View file

@@ -193,6 +193,23 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPComponentWithSelfStabilizat
return { LSTMCell.first, LSTMCell.second };
}
// This is currently unused
inline CNTK::FunctionPtr SimpleRecurrentLayer(const CNTK::Variable& input, const CNTK::NDShape& outputDim, const std::function<CNTK::FunctionPtr(const CNTK::Variable&)>& recurrenceHook, const CNTK::DeviceDescriptor& device)
{
auto dh = CNTK::PlaceholderVariable(outputDim, input.DynamicAxes());
unsigned long seed = 1;
auto createProjectionParam = [device, &seed](size_t outputDim, size_t inputDim) {
return CNTK::Parameter(CNTK::NDArrayView::RandomUniform<float>({ outputDim, inputDim }, -0.5, 0.5, seed++, device));
};
auto hProjWeights = createProjectionParam(outputDim[0], outputDim[0]);
auto inputProjWeights = createProjectionParam(outputDim[0], input.Shape()[0]);
auto output = Times(hProjWeights, recurrenceHook(dh)) + Times(inputProjWeights, input);
return output->ReplacePlaceholders({ { dh, output } });
}
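Although noted as unused, a hypothetical call to SimpleRecurrentLayer would pass a delay-based recurrence hook; the exact PastValue and Constant::Scalar overloads below are assumed rather than taken from this commit:

auto device = CNTK::DeviceDescriptor::CPUDevice();
auto features = CNTK::InputVariable({ 10 }, CNTK::DataType::Float, L"features");
// Hypothetical hook: delay the recurrent input by one step with a scalar zero initial state.
auto recurrenceHook = [device](const CNTK::Variable& x) { return CNTK::PastValue(x, CNTK::Constant::Scalar(0.0f, device), 1); };
auto rnnOutput = SimpleRecurrentLayer(features, { 20 }, recurrenceHook, device);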
inline std::vector<size_t> GenerateSequenceLengths(size_t numSequences, size_t maxAllowedSequenceLength)
{
std::vector<size_t> sequenceLengths(numSequences);
@@ -208,13 +225,13 @@ inline std::vector<size_t> GenerateSequenceLengths(size_t numSequences, size_t m
}
template <typename ElementType>
inline std::vector<std::vector<ElementType>> GenerateSequences(const std::vector<size_t>& sequenceLengths, size_t dim)
inline std::vector<std::vector<ElementType>> GenerateSequences(const std::vector<size_t>& sequenceLengths, const CNTK::NDShape& sampleShape)
{
size_t numSequences = sequenceLengths.size();
std::vector<std::vector<ElementType>> sequences;
for (size_t i = 0; i < numSequences; ++i)
{
std::vector<ElementType> currentSequence(dim * sequenceLengths[i]);
std::vector<ElementType> currentSequence(sampleShape.TotalSize() * sequenceLengths[i]);
for (size_t j = 0; j < currentSequence.size(); ++j)
currentSequence[j] = ((ElementType)rand()) / RAND_MAX;
@@ -244,17 +261,21 @@ inline std::vector<std::vector<size_t>> GenerateOneHotSequences(const std::vecto
}
template <typename ElementType>
inline CNTK::ValuePtr GenerateSequences(const std::vector<size_t>& sequenceLengths, size_t dim, const CNTK::DeviceDescriptor& device, bool oneHot)
inline CNTK::ValuePtr GenerateSequences(const std::vector<size_t>& sequenceLengths, const CNTK::NDShape& sampleShape, const CNTK::DeviceDescriptor& device, bool oneHot)
{
if (!oneHot)
{
std::vector<std::vector<ElementType>> sequences = GenerateSequences<ElementType>(sequenceLengths, dim);
return CNTK::Value::Create({ dim }, sequences, device, true);
std::vector<std::vector<ElementType>> sequences = GenerateSequences<ElementType>(sequenceLengths, sampleShape);
return CNTK::Value::Create(sampleShape, sequences, device, true);
}
else
{
std::vector<std::vector<size_t>> oneHotSequences = GenerateOneHotSequences(sequenceLengths, dim);
return CNTK::Value::Create<ElementType>({ dim }, oneHotSequences, device, true);
if (sampleShape.Rank() != 1)
throw std::runtime_error("GenerateSequences can generate one hot sequences only for 1D sample shapes");
size_t vocabularySize = sampleShape[0];
std::vector<std::vector<size_t>> oneHotSequences = GenerateOneHotSequences(sequenceLengths, vocabularySize);
return CNTK::Value::Create<ElementType>(vocabularySize, oneHotSequences, device, true);
}
}
@@ -296,3 +317,39 @@ inline void PrintTrainingProgress(const CNTK::Trainer& trainer, size_t minibatch
printf("Minibatch %d: CrossEntropy loss = %.8g, Evaluation criterion = %.8g\n", (int)minibatchIdx, trainLossValue, evaluationValue);
}
}
inline std::vector<size_t> GetStrides(const CNTK::NDShape& shape)
{
std::vector<size_t> strides(shape.Rank() - 1);
size_t totalSize = 1;
for (size_t i = 0; i < shape.Rank() - 1; ++i)
{
totalSize *= shape[i];
strides[i] = totalSize;
}
return strides;
}
inline CNTK::NDShape UnflattenedShape(size_t flatennedIdx, const std::vector<size_t>& strides)
{
CNTK::NDShape unflattenedShape(strides.size() + 1);
size_t remainder = flatennedIdx;
for (int i = (int)strides.size() - 1; i >= 0; --i)
{
unflattenedShape[i + 1] = remainder / strides[i];
remainder = remainder % strides[i];
}
unflattenedShape[0] = remainder;
return unflattenedShape;
}
inline size_t FlattenedIndex(const CNTK::NDShape& shape, const std::vector<size_t>& strides)
{
size_t flattenedIdx = shape[0];
for (int i = 0; i < strides.size(); ++i)
flattenedIdx += shape[i + 1] * strides[i];
return flattenedIdx;
};
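A worked illustration of these helpers (not code from the commit): for a shape of { 2, 3, 4 }, GetStrides returns { 2, 6 }; UnflattenedShape(17, { 2, 6 }) yields the index { 1, 2, 2 }, since 17 = 1 + 2*2 + 2*6; and FlattenedIndex({ 1, 2, 2 }, { 2, 6 }) maps that index back to 17. Note that FlattenedIndex interprets its first argument as a multi-dimensional index rather than a shape, despite the parameter name.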

View file

@@ -10,7 +10,7 @@ void TestReduceSum(const DeviceDescriptor& device)
size_t dim = 23;
auto sequenceLengths = GenerateSequenceLengths(numSequences, maxAllowedSequenceLength);
auto sequences = GenerateSequences<float>(sequenceLengths, dim);
auto sequences = GenerateSequences<float>(sequenceLengths, { dim });
ValuePtr sequencesValue = Value::Create({ dim }, sequences, device, true);
// Test ReduceSum along a static axis
@@ -113,67 +113,88 @@ void TestReduceSum(const DeviceDescriptor& device)
}
}
void TestSlice(const DeviceDescriptor& device)
void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
{
size_t numSequences = 7;
size_t maxAllowedSequenceLength = 11;
size_t dim = 23;
size_t maxDimSize = 23;
NDShape inputShape(sampleRank);
for (size_t i = 0; i < sampleRank; ++i)
inputShape[i] = (rand() % maxDimSize) + 1;
auto sequenceLengths = GenerateSequenceLengths(numSequences, maxAllowedSequenceLength);
auto sequences = GenerateSequences<float>(sequenceLengths, dim);
ValuePtr sequencesValue = Value::Create({ dim }, sequences, device, true);
auto sequences = GenerateSequences<float>(sequenceLengths, inputShape);
ValuePtr sequencesValue = Value::Create(inputShape, sequences, device, true);
// Test slice along a static axis
{
auto testStaticAxisSlice = [&sequences, &sequenceLengths, dim, sequencesValue, device](int beginOffset, int endOffset)
auto testStaticAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](size_t sliceAxis, int beginOffset, int endOffset)
{
size_t maxActualSequenceLength = sequencesValue->Data()->Shape()[1];
size_t numSequences = sequencesValue->Data()->Shape()[2];
size_t maxActualSequenceLength = sequencesValue->Data()->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Data()->Shape()[inputShape.Rank() + 1];
auto inputVar = InputVariable({ dim }, DataType::Float, L"input");
auto rowSliceFunc = Slice(inputVar, Axis(0), beginOffset, endOffset);
auto inputVar = InputVariable(inputShape, DataType::Float, L"input");
auto sliceFunc = Slice(inputVar, Axis(sliceAxis), beginOffset, endOffset);
NDShape outputShape = rowSliceFunc->Output().Shape().AppendShape({ maxActualSequenceLength, numSequences });
std::vector<float> outputData(outputShape.TotalSize());
ValuePtr outputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(outputShape, outputData, false), sequencesValue->Mask()->DeepClone());
NDShape outputShape = sliceFunc->Output().Shape();
auto outputDataShape = outputShape.AppendShape({ maxActualSequenceLength, numSequences });
std::vector<float> outputData(outputDataShape.TotalSize());
ValuePtr outputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(outputDataShape, outputData, false), sequencesValue->Mask()->DeepClone());
std::unordered_map<Variable, ValuePtr> outputs = { { rowSliceFunc->Output(), outputValue } };
rowSliceFunc->Forward({ { inputVar, sequencesValue } }, outputs, device);
std::unordered_map<Variable, ValuePtr> outputs = { { sliceFunc->Output(), outputValue } };
sliceFunc->Forward({ { inputVar, sequencesValue } }, outputs, device);
size_t rowSliceStartOffset = (beginOffset >= 0) ? beginOffset : (dim + beginOffset);
size_t sliceLength = endOffset - beginOffset;
std::vector<float> expectedOutputValues(sliceLength * maxActualSequenceLength * numSequences);
std::vector<size_t> inputShapeStrides = GetStrides(inputShape);
std::vector<size_t> outputShapeStrides = GetStrides(outputShape);
size_t sliceStartOffset = (beginOffset >= 0) ? beginOffset : (inputShape[sliceAxis] + beginOffset);
std::vector<float> expectedOutputValues(outputShape.TotalSize() * maxActualSequenceLength * numSequences);
for (size_t i = 0; i < numSequences; ++i)
{
size_t currentSequenceLength = sequenceLengths[i];
for (size_t j = 0; j < currentSequenceLength; ++j)
{
for (size_t k = 0; k < sliceLength; ++k)
expectedOutputValues[(((i * maxActualSequenceLength) + j) * sliceLength) + k] = sequences[i][(j * dim) + k + rowSliceStartOffset];
for (size_t k = 0; k < outputShape.TotalSize(); ++k)
{
auto outputIdx = UnflattenedShape(k, outputShapeStrides);
auto inputIdx = outputIdx;
inputIdx[sliceAxis] += sliceStartOffset;
auto flatInputIdx = FlattenedIndex(inputIdx, inputShapeStrides);
expectedOutputValues[(((i * maxActualSequenceLength) + j) * outputShape.TotalSize()) + k] = sequences[i][(j * inputShape.TotalSize()) + flatInputIdx];
}
}
}
FloatingPointVectorCompare(outputData, expectedOutputValues, "testStaticAxisSlice: Forward prop results do not match expected results");
};
testStaticAxisSlice(3, 5);
testStaticAxisSlice(-1, 0);
testStaticAxisSlice(-3, -1);
size_t sliceAxis = 0;
testStaticAxisSlice(sliceAxis, 3, 5);
if (sliceAxis < (inputShape.Rank() - 1))
sliceAxis++;
testStaticAxisSlice(sliceAxis, -1, 0);
if (sliceAxis < (inputShape.Rank() - 1))
sliceAxis++;
testStaticAxisSlice(sliceAxis, -3, -1);
}
// Test slice along a dynamic axis
{
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, dim, sequencesValue, device](const Axis& axis, int beginOffset, int endOffset)
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](const Axis& axis, int beginOffset, int endOffset)
{
if (axis.IsStaticAxis())
RuntimeError("Called the dynamic axis slice test with a static axis");
size_t maxActualSequenceLength = sequencesValue->Data()->Shape()[1];
size_t numSequences = sequencesValue->Data()->Shape()[2];
size_t maxActualSequenceLength = sequencesValue->Data()->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Data()->Shape()[inputShape.Rank() + 1];
size_t sliceLength = endOffset - beginOffset;
auto inputVar = InputVariable({ dim }, DataType::Float, L"input");
auto inputVar = InputVariable(inputShape, DataType::Float, L"input");
auto sliceFunc = Slice(inputVar, axis, beginOffset, endOffset);
size_t outputSequenceAxisLength = (axis == Axis::DefaultDynamicAxis()) ? sliceLength : maxActualSequenceLength;
@@ -188,7 +209,7 @@ void TestSlice(const DeviceDescriptor& device)
size_t startSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((beginOffset >= 0) ? beginOffset : (numSequences + beginOffset)) : 0;
size_t endSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((endOffset > 0) ? endOffset : (numSequences + endOffset)) : numSequences;
std::vector<float> expectedOutputValues(dim * outputSequenceAxisLength * outputBatchAxisLength);
std::vector<float> expectedOutputValues(inputShape.TotalSize() * outputSequenceAxisLength * outputBatchAxisLength);
for (size_t i = startSequenceIdx; i < endSequenceIdx; ++i)
{
size_t currentSequenceLength = sequenceLengths[i];
@@ -196,8 +217,8 @@ void TestSlice(const DeviceDescriptor& device)
size_t endFrameIdx = (axis == Axis::DefaultDynamicAxis()) ? ((endOffset > 0) ? endOffset : (currentSequenceLength + endOffset)) : currentSequenceLength;
for (size_t j = startFrameIdx; j < endFrameIdx; ++j)
{
for (size_t k = 0; k < dim; ++k)
expectedOutputValues[((((i - startSequenceIdx) * outputSequenceAxisLength) + (j - startFrameIdx)) * dim) + k] = sequences[i][(j * dim) + k];
for (size_t k = 0; k < inputShape.TotalSize(); ++k)
expectedOutputValues[((((i - startSequenceIdx) * outputSequenceAxisLength) + (j - startFrameIdx)) * inputShape.TotalSize()) + k] = sequences[i][(j * inputShape.TotalSize()) + k];
}
}
@@ -344,13 +365,59 @@ void TestRecurrentFunctionCloning()
CompareFunctions(clonedFunctionWithParametersShared, clonedFunctionWithParametersFrozen, ParameterCloningMethod::Freeze, cloningReplacements, visitedFunctions);
}
void TestTranspose(size_t numAxes, size_t axis1, size_t axis2, const DeviceDescriptor& device)
{
srand(1);
size_t maxDimSize = 15;
NDShape inputShape(numAxes);
for (size_t i = 0; i < numAxes; ++i)
inputShape[i] = (rand() % maxDimSize) + 1;
auto inputVar = InputVariable(inputShape, DataType::Float, false, L"leftInput");
auto transposeFunc = TransposeAxes(inputVar, Axis(axis1), Axis(axis2));
std::vector<float> inputData(inputShape.TotalSize());
for (size_t i = 0; i < inputData.size(); ++i)
inputData[i] = ((float)rand()) / RAND_MAX;
auto inputValueShape = inputShape.AppendShape({ 1, 1 });
ValuePtr inputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(inputValueShape, inputData, true));
NDShape outputShape = transposeFunc->Output().Shape();
NDShape outputValueShape = outputShape.AppendShape({ 1, 1 });
std::vector<float> outputData(outputValueShape.TotalSize());
ValuePtr outputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(outputValueShape, outputData, false));
std::unordered_map<Variable, ValuePtr> outputs = { { transposeFunc->Output(), outputValue } };
transposeFunc->Forward({ { inputVar, inputValue } }, outputs, device);
std::vector<size_t> inputShapeStrides = GetStrides(inputShape);
std::vector<size_t> outputShapeStrides = GetStrides(outputShape);
// Verify forward prop results
std::vector<float> expectedOutputValues(outputShape.TotalSize());
for (size_t i = 0; i < expectedOutputValues.size(); ++i)
{
auto unflattenedShape = UnflattenedShape(i, outputShapeStrides);
std::swap(unflattenedShape[axis1], unflattenedShape[axis2]);
size_t flattenedIndex = FlattenedIndex(unflattenedShape, inputShapeStrides);
expectedOutputValues[i] = inputData[flattenedIndex];
}
FloatingPointVectorCompare(outputData, expectedOutputValues, "TestTranspose: Forward prop results do not match expected results");
}
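As an illustrative trace of the verification loop (not part of the test code): with inputShape { 2, 3 } and axes 0 and 1 swapped, the output shape is { 3, 2 }; output flat index 4 unflattens with output strides { 3 } to { 1, 1 }, swapping the two axis indices leaves { 1, 1 }, which flattens with input strides { 2 } to 3, so expectedOutputValues[4] is read from inputData[3].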
void FunctionTests()
{
TestSlice(DeviceDescriptor::CPUDevice());
TestSlice(DeviceDescriptor::GPUDevice(0));
TestSlice(2, DeviceDescriptor::CPUDevice());
TestSlice(1, DeviceDescriptor::GPUDevice(0));
TestReduceSum(DeviceDescriptor::CPUDevice());
TestReduceSum(DeviceDescriptor::GPUDevice(0));
TestRecurrentFunctionCloning();
TestTranspose(2, 0, 1, DeviceDescriptor::CPUDevice());
TestTranspose(3, 1, 2, DeviceDescriptor::GPUDevice(0));
}

View file

@@ -80,7 +80,7 @@ void TestRecurrentNetworkCreation(const DeviceDescriptor& device, bool testSaveA
{
std::vector<size_t> sequenceLengths = GenerateSequenceLengths(numSequences, maxAllowedSequenceLength);
ValuePtr inputValue = GenerateSequences<ElementType>(sequenceLengths, inputDim, device, false);
ValuePtr inputValue = GenerateSequences<ElementType>(sequenceLengths, { inputDim }, device, false);
std::vector<std::vector<ElementType>> labelsData;
for (size_t i = 0; i < numSequences; ++i)

View file

@@ -59,6 +59,9 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
{
thoughtVectorH = Reshape(thoughtVectorH, thoughtVectorH->Output().Shape().AppendShape({ 1 }));
thoughtVectorC = Reshape(thoughtVectorC, thoughtVectorC->Output().Shape().AppendShape({ 1 }));
labelEmbedding = Reshape(labelEmbedding, labelEmbedding->Output().Shape().AppendShape({ 1 }));
labelSentenceStartEmbeddedScattered = Reshape(labelSentenceStartEmbeddedScattered, labelSentenceStartEmbeddedScattered->Output().Shape().AppendShape({ 1 }));
}
auto thoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding);
@@ -179,6 +182,6 @@ void TrainSequenceToSequenceTranslator()
void TrainSequenceToSequenceTranslator()
{
// TODO: Also test with sparse input variables in the graph
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, false);
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, true);
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, false);
}