Merge fp16 brainscript work (#3606)
* FP16 BrainScript - address code review comments
* Remove Tab and fix debug build breaks
* Fix Linux Build breaks
* fp16 brain script - add _CRT_SECURE_NO_WARNINGS
* fp16 brain script - fix NetworkTests
* Update tests for model version change
* Remove changes for InputAndParamNodes
* Fix typo
* Remove redundant code
* Fix optional parameters
This commit is contained in:
Parent: 45ae386bc8
Commit: 4003c087a1
Makefile
@@ -707,6 +707,7 @@ SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
$(SOURCEDIR)/SGDLib/SimpleDistGradAggregatorHelper.cpp \

SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC)
@@ -94,15 +94,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
);
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync);
Base::OnEpochEnd(LearnableNodes, smoothedGradients, samplesSinceLastSync);
}
/*virtual*/ void ModelAggregationProcessing(
size_t samplesSinceLastSync,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& totalSamplesProcessed,
float& secondsOnCommunication
) override

@@ -181,9 +181,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//----------------------------------------
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
for (auto sg : smoothedGradients)
{
x.SetValue((ElemType)0);
auto x = dynamic_pointer_cast<Matrix<ElemType>>(sg);
if (x != nullptr)
x->SetValue((ElemType)0);
}
}
}
@@ -108,7 +108,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

bool OnArrivingAtSyncPoint(
const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
) override
{

@@ -130,12 +130,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Otherwise let update the weights.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
return true;
}

/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
if (!m_someWorkerHasFinished)

@@ -152,13 +152,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Let's update our weights no matter what.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
}

/*virtual*/ void ModelAggregationProcessing(
size_t /*samplesSinceLastSync*/,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& /*totalSamplesProcessed*/, /* out */
float& secondsOnCommunication /* out */
) override

@@ -196,8 +196,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());

// 2. Let's update the model
for (auto& pBaseNode : learnableNodes)
auto smoothedGradientIter = smoothedGradients.begin();
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
{
ComputationNodeBasePtr pBaseNode = *nodeIter;
if (!pBaseNode->IsParameterUpdateRequired())
continue;

@@ -235,15 +237,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// 2.2.4 update bookkeeping
prevWeight.SetValue(currentWeight);
}
}

//----------------------------------------
// 3. reset SGD momentum if necessary
//----------------------------------------
{
// For half, we keep a copy of float weights, update that too
if (std::is_same<ElemType, half>())
{
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>> (*smoothedGradientIter);
size_t numCols = currentWeight.GetNumCols();

auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(currentWeight);

if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
// Only reset smoothed gradients
auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
smoothedGradientMatrix.SetValue(0.0f);
}
}
else
{
x.SetValue((ElemType)0);
if (m_resetSGDMomentumAfterAggregation)
{
auto x = dynamic_pointer_cast<Matrix<ElemType>> (*smoothedGradientIter);
x->SetValue((ElemType)0);
}
}
}
}
}
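Illustrative sketch (not part of this change): since smoothedGradients is now a std::list<MatrixBasePtr>, consumers such as the aggregation code above must recover the concrete matrix type per parameter, and for half parameters the entry is a compound Matrix<float> rather than a Matrix<half>, so a dynamic_pointer_cast to Matrix<ElemType> can legitimately return nullptr. A minimal C++ sketch of that consumption pattern follows; the helper name and free-function form are illustrative assumptions, and CNTK's Matrix<T>, MatrixBasePtr and half types are assumed to be in scope.

// Sketch only: reset one smoothedGradients entry, mirroring the logic above.
template <class ElemType>
void ResetSmoothedGradientEntry(const MatrixBasePtr& sg, size_t numCols)
{
    if (auto typed = std::dynamic_pointer_cast<Matrix<ElemType>>(sg)) // float/double case
    {
        typed->SetValue((ElemType)0);
        return;
    }
    // half case: the entry is a compound Matrix<float>; only the first numCols
    // columns hold the smoothed gradient, so reset just that slice.
    if (auto compound = std::dynamic_pointer_cast<Matrix<float>>(sg))
    {
        auto smoothed = compound->ColumnSlice(0, numCols);
        smoothed.SetValue(0.0f);
    }
}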
@@ -22,6 +22,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {

using namespace std;

template <class ElemType, class TargetType>
static inline bool isprecision(std::wstring& str)
{
if ((str == L"") && std::is_same<ElemType, TargetType>())
return true;
if (std::is_same<TargetType, half>())
return EqualCI(str, L"float16");
else if (std::is_same<TargetType, float>())
return EqualCI(str, L"float");
else if (std::is_same<TargetType, double>())
return EqualCI(str, L"double");
return false;
}

template <class ElemType>
void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{

@@ -48,7 +62,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst

std::wstring cnNodeType = Microsoft::MSR::CNTK::ToFixedWStringFromMultiByte(node->GetValue());

ComputationNodePtr nodePtr;
ComputationNodeBasePtr nodePtr;

// get the node pointer for the node, should be stored in the EvalValue;
if (pass > ndlPassInitial)

@@ -56,7 +70,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
if (!nodePtr)
{
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
nodePtr = m_net->GetNodeFromName(name);
node->SetEvalValue(nodePtr.get());
}
}

@@ -75,16 +89,49 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);

wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
wstring precision = node->GetOptionalParameter("precision", "");

// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = m_net->GetNodeFromName(name);
else
{
if (precision == L"")
{
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis);
else
nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<float>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<float>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"double"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<double>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<double>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float16"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<half>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<half>(name, tensorShape, dynamicAxis);
}
else
{
RuntimeError("NDLNetworkBuilder: Input: the 'precision' parameter if specified, must be 'float', 'double' or 'float16'.");
}
}
}
}
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
@@ -193,7 +240,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.CreateLearnableParameter(name, rows, cols);
nodePtr->SetLearningRateMultiplier(0);
}
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
else if (pass == ndlPassFinal || (dynamic_pointer_cast<ComputationNode<ElemType>> (nodePtr))->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);

@@ -607,6 +654,56 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodeParamCount = nodePtr->GetNumInputs();
}
}
else if (cnNodeType == OperationName2Of(CastNode))
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (node and cast precision).", cnNodeType.c_str());

// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
auto sourceNode = (NDLNode<ElemType>*) params[0];
wstring sourcePrecision = sourceNode->GetOptionalParameter("precision", "");
wstring targetPrecision = node->GetOptionalParameter("precision", "");
if (EqualCI(targetPrecision, L"float16"))
{
ComputationNetworkBuilder<half> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'");
}
else if (EqualCI(targetPrecision, L"float"))
{
ComputationNetworkBuilder<float> builder2(*m_net);
if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to float, input must be 'float16' or 'double'");
}
else if (EqualCI(targetPrecision, L"double"))
{
ComputationNetworkBuilder<double> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to double, input must be 'float' or 'float16'");
}
else
RuntimeError("NDLNetworkBuilder: CastNode - need to specify 'precision' parameter: 'float', 'double' or 'float16'.");
}
}
else
{
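Illustrative sketch (not part of this change): the Cast handling above reduces to a small dispatch in which the Cast node's own 'precision' optional parameter selects the target element type (and hence the ComputationNetworkBuilder instantiation), while the source node's 'precision' selects the template argument of CreateCastNode, i.e. the input type of the resulting CastNode<To, From>. At the NDL level this presumably corresponds to something like Cast(x, precision="float16"), assuming the usual NDL optional-parameter syntax. A condensed C++ sketch for the float16 target only; the helper name and free-function form are illustrative assumptions.

// Sketch only: one branch of the dispatch above, pulled out as a helper.
template <class ElemType>
ComputationNodeBasePtr CreateCastToHalfSketch(ComputationNetwork& net, const std::wstring& name, std::wstring sourcePrecision)
{
    ComputationNetworkBuilder<half> builder2(net);         // target precision "float16" -> ElemType half
    if (isprecision<ElemType, float>(sourcePrecision))     // source is 'float' (or unspecified when ElemType is float)
        return builder2.CreateCastNode<float>(name);       // creates CastNode<half, float>
    if (isprecision<ElemType, double>(sourcePrecision))
        return builder2.CreateCastNode<double>(name);      // creates CastNode<half, double>
    RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'");
    return nullptr; // not reached; RuntimeError throws
}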
@@ -645,7 +742,10 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
#if 1
vector<ComputationNodeBasePtr> inputNodes;
for (let& in : inputs)
inputNodes.push_back(ComputationNode<ElemType>::FromVoidPtr(in));
{
ComputationNodeBase *p = (ComputationNodeBase *)in;
inputNodes.push_back(p ? p->shared_from_this() : nullptr);
}

nodePtr->AttachInputs(inputNodes);
#else // TODO: delete this

@@ -714,6 +814,7 @@ TensorShape NDLNodeEvaluatorImpl<ElemType>::ProcessTensorShapeParameters(const N
return TensorShape(dims);
}

template class NDLBuilderImpl<half>;
template class NDLBuilderImpl<float>;
template class NDLBuilderImpl<double>;
@@ -269,10 +269,11 @@ public:
}

// ProcessOptionalParameters - Process the optional parameters of a node
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node)
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node) override
{
vector<NDLNode<ElemType>*> params = node->GetParameters(true); // get all the optional parameters only
auto compNode = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
ComputationNodeBase* compNodePtr = (ComputationNodeBase *) (node->GetEvalValue());
ComputationNodeBasePtr compNode = compNodePtr ? compNodePtr->shared_from_this() : nullptr;
std::string empty;

// loop through all the optional parameters processing them as necessary

@@ -582,6 +583,7 @@ private:
DEVICEID_TYPE m_deviceId;
};

template class NDLBuilder<half>;
template class NDLBuilder<float>;
template class NDLBuilder<double>;
@@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(AtanhNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(BatchNormalizationNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationName2Of(CastNode))) ret = true;
#ifdef COMING_SOON
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif

@@ -267,18 +268,24 @@ NDLScript<ElemType> NDLScript<ElemType>::s_global("global");

// declare the static variables from the classes
template <>
NDLScript<half> NDLScript<half>::s_global{};
template <>
NDLScript<float> NDLScript<float>::s_global{};
template <>
NDLScript<double> NDLScript<double>::s_global{};

template <>
int NDLNode<half>::s_nameCounter = 0;
template <>
int NDLNode<float>::s_nameCounter = 0;
template <>
int NDLNode<double>::s_nameCounter = 0;

template class NDLNode<half>;
template class NDLNode<float>;
template class NDLNode<double>;

template class NDLScript<half>;
template class NDLScript<float>;
template class NDLScript<double>;
@@ -98,6 +98,7 @@ public:
}
};

template class NDLNodeEvaluator<half>;
template class NDLNodeEvaluator<float>;
template class NDLNodeEvaluator<double>;

@@ -188,9 +188,12 @@ ComputationNetworkPtr GetModelFromConfig(const ConfigRecordType& config, const w
return net;
}

template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, half>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, float>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, double>(const ConfigParameters& config);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, half> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, float> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, double>(const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
@@ -1775,6 +1775,7 @@ shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::AddTrainAn
return output;
}

template class SimpleNetworkBuilder<half>;
template class SimpleNetworkBuilder<float>;
template class SimpleNetworkBuilder<double>;

@@ -159,9 +159,13 @@ public:
m_constInputGateValue = config("constInputGateValue", "false");
m_constOutputGateValue = config("constOutputGateValue", "false");

m_forgetGateInitVal = config("forgetGateInitVal", "-1");
m_inputGateInitVal = config("inputGateInitVal", "-1");
m_outputGateInitVal = config("outputGateInitVal", "-1");
ElemType forgetGateInitVal = config("forgetGateInitVal", "-1");
ElemType inputGateInitVal = config("inputGateInitVal", "-1");
ElemType outputGateInitVal = config("outputGateInitVal", "-1");

m_forgetGateInitVal = forgetGateInitVal;
m_inputGateInitVal = inputGateInitVal;
m_outputGateInitVal = outputGateInitVal;

m_sparse_input = config("sparseinput", "false");
@@ -142,12 +142,14 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
{
const IConfigRecord& config = *configp;
wstring precision = config[L"precision"]; // dispatch on ElemType
if (precision == L"float")
if (precision == L"float16")
DoTrain<IConfigRecord, half>(config);
else if (precision == L"float")
DoTrain<IConfigRecord, float>(config);
else if (precision == L"double")
DoTrain<IConfigRecord, double>(config);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());

return make_shared<Object>(); // return a dummy object
}

@@ -156,8 +158,10 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<TrainAction> registerTrainAction(L"TrainAction");
}}}

template void DoTrain<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ConfigParameters, half>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, float>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, double>(const ConfigParameters& config);
@ -171,6 +171,91 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con
|
|||
// be run in parallel across multiple ranks. Others should only run on rank 0
|
||||
const std::set<std::string> commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" };
|
||||
|
||||
|
||||
template <typename ElemType>
|
||||
bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& config)
|
||||
{
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
DoTrain<ConfigParameters, ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "bnstat")
|
||||
{
|
||||
DoBatchNormalizationStat<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "adapt")
|
||||
{
|
||||
DoAdapt<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "test" || thisAction == "eval")
|
||||
{
|
||||
DoEval<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "edit")
|
||||
{
|
||||
DoEdit<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "cv")
|
||||
{
|
||||
DoCrossValidate<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "write")
|
||||
{
|
||||
DoWriteOutput<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "devtest")
|
||||
{
|
||||
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
|
||||
}
|
||||
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
|
||||
{
|
||||
DoDumpNodes<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "convertdbn")
|
||||
{
|
||||
DoConvertFromDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "exportdbn")
|
||||
{
|
||||
DoExportToDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "createLabelMap")
|
||||
{
|
||||
DoCreateLabelMap<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "writeWordAndClass")
|
||||
{
|
||||
DoWriteWordAndClassInfo<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "plot")
|
||||
{
|
||||
DoTopologyPlot<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "SVD")
|
||||
{
|
||||
DoParameterSVD<ElemType>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool DispatchThisAction<half>(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& )
|
||||
{
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
DoTrain<ConfigParameters, half>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
RuntimeError("half only supported for action train or trainRNN!");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// process the command
|
||||
template <typename ElemType>
|
||||
void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mpi)
|
||||
|
@ -270,73 +355,21 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
{
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
|
||||
}
|
||||
DoTrain<ConfigParameters, ElemType>(commandParams);
|
||||
}
|
||||
|
||||
if (!DispatchThisAction<ElemType>(thisAction, commandParams, config))
|
||||
{
|
||||
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
|
||||
}
|
||||
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
if (progressTracing)
|
||||
{
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
|
||||
}
|
||||
fullEpochsOffset += GetMaxEpochs(commandParams);
|
||||
}
|
||||
else if (thisAction == "bnstat")
|
||||
{
|
||||
DoBatchNormalizationStat<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "adapt")
|
||||
{
|
||||
DoAdapt<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "test" || thisAction == "eval")
|
||||
{
|
||||
DoEval<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "edit")
|
||||
{
|
||||
DoEdit<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "cv")
|
||||
{
|
||||
DoCrossValidate<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "write")
|
||||
{
|
||||
DoWriteOutput<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "devtest")
|
||||
{
|
||||
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
|
||||
}
|
||||
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
|
||||
{
|
||||
DoDumpNodes<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "convertdbn")
|
||||
{
|
||||
DoConvertFromDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "exportdbn")
|
||||
{
|
||||
DoExportToDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "createLabelMap")
|
||||
{
|
||||
DoCreateLabelMap<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "writeWordAndClass")
|
||||
{
|
||||
DoWriteWordAndClassInfo<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "plot")
|
||||
{
|
||||
DoTopologyPlot<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "SVD")
|
||||
{
|
||||
DoParameterSVD<ElemType>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
@@ -740,12 +773,14 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
LOGPRINTF(stderr, "precision = \"%s\"\n", type.c_str());
}

if (type == "float")
if (type == "float16")
DoCommands<half>(config, mpi);
else if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
DoCommands<double>(config, mpi);
else
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float\" or \"double\"", type.c_str());
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float16\" or \"float\" or \"double\"", type.c_str());

// if completed then write a doneFile if requested
if (!doneFile.empty())
@@ -8,6 +8,8 @@
#include <map>
#include <stdexcept>
#include <stdint.h>
#include "File.h"
#include "half.hpp"

using namespace std;

@@ -150,6 +152,11 @@ public:
return (float) (double) *this;
}

operator half() const
{
return (half)(double)*this;
}

private:
long tolong() const
{
@@ -150,6 +150,20 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
else if (nodePtr->Is<ComputationNode<half>>())
precision = ElemTypeName<half>();
else LogicError("Unexpected node type.");
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
if (nodePtr->Is<CastNode<half,float>>())
precision = ElemTypeName2<half,float>();
else if (nodePtr->Is<CastNode<half, double>>())
precision = ElemTypeName2<half, double>();
else if (nodePtr->Is<CastNode<float, half>>())
precision = ElemTypeName2<float, half>();
else if (nodePtr->Is<CastNode<float, double>>())
precision = ElemTypeName2<float, double>();
else if (nodePtr->Is<CastNode<double, half>>())
precision = ElemTypeName2<double, half>();
else if (nodePtr->Is<CastNode<double, float>>())
precision = ElemTypeName2<double, float>();
#endif
fstream << precision;
#endif
fstream << nodePtr->OperationName();

@@ -265,6 +279,20 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"") // old file format: default to <ElemType>
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
else if (precision == L"half,float")
node = ComputationNetworkBuilder<half>::NewNode2<float>(opName, m_deviceId, nodeName);
else if (precision == L"half,double")
node = ComputationNetworkBuilder<half>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"float,half")
node = ComputationNetworkBuilder<float>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"float,double")
node = ComputationNetworkBuilder<float>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"double,half")
node = ComputationNetworkBuilder<double>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"double,float")
node = ComputationNetworkBuilder<double>::NewNode2<float>(opName, m_deviceId, nodeName);
#endif
else
RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str());

@@ -1313,6 +1313,14 @@ template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"f
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }

template <typename ElemType, typename ElemType2> static inline const wchar_t* ElemTypeName2();
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,half>() { return L"float,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,double>() { return L"float,double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,half>() { return L"double,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,float>() { return L"double,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,float>() { return L"half,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,double>() { return L"half,double"; }

// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
template class Matrix<double>;
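Illustrative sketch (not part of this change): the save/load changes above round-trip a cast node's two element types through a comma-separated precision tag. Given the builder declaration elsewhere in this commit (CreateCastNode<InputNodeType> on ComputationNetworkBuilder<ElemType> creates CastNode<ElemType, InputNodeType>), the first name in the tag is the node's output type and the second its input type, so "half,float" denotes a cast from float to half. A minimal C++ sketch of the reload side; the free-function form is an illustrative assumption.

// Sketch only: mapping a precision tag back to a typed cast node on load.
ComputationNodeBasePtr LoadCastNodeSketch(const std::wstring& precision, const std::wstring& opName,
                                          DEVICEID_TYPE deviceId, const std::wstring& nodeName)
{
    if (precision == L"half,float")   // output half, input float
        return ComputationNetworkBuilder<half>::NewNode2<float>(opName, deviceId, nodeName);
    if (precision == L"float,half")   // output float, input half
        return ComputationNetworkBuilder<float>::NewNode2<half>(opName, deviceId, nodeName);
    // ...the remaining four tag/type combinations follow the same pattern...
    RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str());
    return nullptr; // not reached; RuntimeError throws
}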
@ -175,6 +175,13 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
|
|||
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
|
||||
}
|
||||
|
||||
template <class ElemType, class ElemType2, class... _Types>
|
||||
static shared_ptr<ComputationNode<ElemType>> CreateNode2(const std::wstring& nodeType, _Types&&... _Args)
|
||||
{
|
||||
// check more types
|
||||
if (nodeType == OperationName2Of(CastNode)) return New<CastNode<ElemType, ElemType2>>(forward<_Types>(_Args)...);
|
||||
else RuntimeError("CreateNode2: unsupported nodeType - %S", nodeType.c_str());
|
||||
}
|
||||
// this function is called from SimpleNetworkBuilder and old NDL
|
||||
template <class ElemType>
|
||||
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
|
||||
|
@ -189,6 +196,13 @@ template <class ElemType>
|
|||
return CreateNode<ElemType>(nodeType, deviceId, name);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
template <class ElemType2>
|
||||
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
|
||||
{
|
||||
return CreateNode2<ElemType, ElemType2>(nodeType, deviceId, name);
|
||||
}
|
||||
|
||||
shared_ptr<ComputationNodeBase> NewComputationNodeFromConfig(const Microsoft::MSR::ScriptableObjects::IConfigRecordPtr configp)
|
||||
{
|
||||
wstring precision = configp->Get(L"precision"); // dispatch on ElemType
|
||||
|
@ -247,15 +261,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
|
||||
template <class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
|
||||
return net.AddNodeToNetWithElemType(New<InputValue<ValueType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
|
||||
template <class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
|
||||
return net.AddNodeToNetWithElemType(New<SparseInputValue<ValueType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -318,6 +334,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
|
|||
{
|
||||
return net.AddNodeToNetWithElemType(New<ReconcileDynamicAxisNode<ElemType>>(net.GetDeviceId(), nodeName));
|
||||
}
|
||||
template <class ElemType>
|
||||
template <class InputNodeType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateCastNode(const std::wstring& nodeName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<CastNode<ElemType, InputNodeType>>(net.GetDeviceId(), nodeName));
|
||||
}
|
||||
|
||||
// this is the catch-all for all cases not covered as special cases above
|
||||
// Unlike the specialized ones above, this one creates nodes by type given as a string.
|
||||
|
@ -997,4 +1019,37 @@ template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::Typ
|
|||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<half>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<double>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<half>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<float>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<float>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<double>(const std::wstring& nodeName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
}}}
|
||||
|
|
|
@ -38,6 +38,8 @@ public:
|
|||
// TODO: move into a separate header/class, to decouple from this class which would then be only used by old NDL and SimpleNetworkBuilder.
|
||||
static ComputationNodePtr NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
static ComputationNodePtr NewNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template <class ElemType2>
|
||||
static ComputationNodePtr NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
|
||||
// The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs).
|
||||
// There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others.
|
||||
|
@ -53,12 +55,25 @@ public:
|
|||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
|
||||
|
||||
template <class InputNodeType>
|
||||
shared_ptr<ComputationNode<ElemType>> CreateCastNode(const std::wstring& nodeName);
|
||||
|
||||
// sparse matrix size is optionally specified
|
||||
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
|
||||
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
|
||||
shared_ptr<ComputationNode<ElemType>> CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
|
||||
{
|
||||
return this->template TypedCreateInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
|
||||
}
|
||||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
shared_ptr<ComputationNode<ElemType>> CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
|
||||
{
|
||||
return this->template TypedCreateSparseInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
|
||||
}
|
||||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
|
||||
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
|
||||
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
|
||||
|
|
|
@@ -61,7 +61,8 @@
#define CNTK_MODEL_VERSION_28 28 // Padding op
#define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS
#define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30
#define CNTK_MODEL_VERSION_31 31 // Cast node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_31

// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.

@@ -95,6 +96,7 @@ struct /*interface*/ IComputationNode
// TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
virtual const std::wstring OperationName() const = 0;
#define OperationNameOf(T) (T<float>::TypeName()) // convenience macro
#define OperationName2Of(T) (T<double,float>::TypeName()) // convenience macro

virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps.
@ -4,6 +4,9 @@
|
|||
//
|
||||
// CNTKEval.cpp : Defines the exported functions for the CNTK DLL.
|
||||
//
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
#endif
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
|
|
|
@ -646,6 +646,20 @@ ASGDHelper<ElemType>* NewASGDHelper(
|
|||
#endif
|
||||
}
|
||||
|
||||
template<> ASGDHelper<half>* NewASGDHelper<half>(
|
||||
const std::list<ComputationNodeBasePtr> & learnableNodes,
|
||||
size_t nodeNumRanks,
|
||||
bool useAsyncBuffer,
|
||||
bool isSimulatedModelAveragingSGD,
|
||||
AdjustLearningRateAtBeginning adjusttype,
|
||||
double adjustCoef,
|
||||
size_t adjustPerMinibatches,
|
||||
int traceLevel,
|
||||
int syncPerfStats)
|
||||
{
|
||||
RuntimeError("NewASGDHelper - half not supported!");
|
||||
}
|
||||
|
||||
template ASGDHelper<float>* NewASGDHelper<float>(
|
||||
const std::list<ComputationNodeBasePtr> & learnableNodes,
|
||||
size_t nodeNumRanks,
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "Matrix.h"
|
||||
#include "SimpleDistGradAggregator.h"
|
||||
#include "V2SimpleDistGradAggregator.h"
|
||||
#include "SimpleDistGradAggregatorHelper.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -46,16 +47,7 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
|
|||
}
|
||||
|
||||
// Prepare aggregator.
|
||||
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg;
|
||||
if (Globals::UseV2Aggregator())
|
||||
distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(
|
||||
mpi,
|
||||
false /*useAsyncAggregation*/,
|
||||
net->GetDeviceId(),
|
||||
0 /*syncStatsTrace*/,
|
||||
::CNTK::MPICommunicator(packThresholdSizeInBytes));
|
||||
else
|
||||
distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(
|
||||
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg = GetSimpleDistGradAggregator<ElemType>(
|
||||
mpi,
|
||||
false /*useAsyncAggregation*/,
|
||||
net->GetDeviceId(),
|
||||
|
|
|
@ -24,10 +24,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
for (const auto& iter : inputMatrices)
|
||||
matrices.insert(iter.second.matrix);
|
||||
for (auto& node : net->FeatureNodes())
|
||||
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
|
||||
if (matrices.find(node->ValuePtr()) != matrices.end())
|
||||
node->NotifyFunctionValuesMBSizeModified();
|
||||
for (auto& node : net->LabelNodes())
|
||||
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
|
||||
if (matrices.find(node->ValuePtr()) != matrices.end())
|
||||
node->NotifyFunctionValuesMBSizeModified();
|
||||
}
|
||||
|
||||
|
|
|
@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
virtual void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
|
||||
std::list<Matrix<ElemType>>& smoothedGradient,
|
||||
std::list<MatrixBasePtr>& smoothedGradients,
|
||||
size_t samplesSinceLastSync
|
||||
)
|
||||
{
|
||||
|
@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (read2sync)
|
||||
{
|
||||
m_numSyncPerformed++;
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
|
||||
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
|
||||
}
|
||||
|
||||
|
@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
virtual bool OnArrivingAtSyncPoint(
|
||||
const std::list<ComputationNodeBasePtr>& LearnableNodes, /* input/output: */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
|
||||
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
|
||||
)
|
||||
{
|
||||
|
@ -190,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (read2Sync)
|
||||
{
|
||||
m_numSyncPerformed++;
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
|
||||
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
|
||||
}
|
||||
return read2Sync;
|
||||
|
@ -199,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
virtual void ModelAggregationProcessing(
|
||||
size_t samplesSinceLastSync, /* in: */
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
|
||||
size_t& totalSamplesProcessed, /* out */
|
||||
float& secondsOnCommunication /* out */) = 0;
|
||||
|
||||
|
@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void ModelAggregationProcessing(
|
||||
size_t samplesSinceLastSync, /* in */
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
|
||||
size_t& totalSamplesProcessed, /* out */
|
||||
float& secondsOnCommunication /* out */) override
|
||||
// NOTE: the variable type is determined by the interface in SGD::TrainOneEpoch
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include "ASGDHelper.h"
|
||||
|
||||
#include "CNTKLibraryInternals.h"
|
||||
#include "SimpleDistGradAggregatorHelper.h"
|
||||
#include "SimpleDistGradAggregator.h"
|
||||
#include "V2SimpleDistGradAggregator.h"
|
||||
#include "ProgressTracing.h"
|
||||
|
@ -47,8 +48,10 @@ using namespace std;
|
|||
// class SGD
|
||||
// =======================================================================
|
||||
|
||||
template SGD<half>::SGD(const ConfigParameters&);
|
||||
template SGD<float>::SGD(const ConfigParameters&);
|
||||
template SGD<double>::SGD(const ConfigParameters&);
|
||||
template SGD<half>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
template SGD<float>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
|
||||
|
@ -223,6 +226,11 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
}
|
||||
}
|
||||
|
||||
if (criterionNodes.front()->template Is<ComputationNode<half>>())
|
||||
{
|
||||
InvalidArgument("TrainOrAdaptModel: using Float16 for loss function may cause overflow, please cast to float.");
|
||||
}
|
||||
|
||||
// This code is only relevant for the new (V2) readers. It exists because of
|
||||
// a shortcoming in DecimateMinibatchInPlace, which does not yet work when inputs
|
||||
// in the same minibatch have different layouts, which is something only V2 readers can
|
||||
|
@ -333,7 +341,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
// initializing weights and gradient holder
|
||||
// only one criterion so far TODO: support multiple ones?
|
||||
auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]);
|
||||
list<Matrix<ElemType>> smoothedGradients;
|
||||
list<MatrixBasePtr> smoothedGradients;
|
||||
vector<double> smoothedCounts; // currently used by FSAdaGradUpdate()
|
||||
size_t numParameters = 0;
|
||||
|
||||
|
@@ -344,9 +352,30 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Note: We don't actually need the smoothedGradients if !IsParameterUpdateRequired().
// However, this is hard to fix since lots of code assumes smoothedGradients to be in the same order as learnableNodes.
// V2 API fixes this.
smoothedGradients.push_back(Matrix<ElemType>(node->Value().GetNumRows(),
node->Value().GetNumCols(),
net->GetDeviceId()));
MatrixBasePtr smoothedGradientPtr;
size_t numRows = node->Value().GetNumRows();
size_t numCols = node->Value().GetNumCols();
if (std::is_same<ElemType, half>())
{
// For half parameters, we use float smoothed gradients
// Allocate 3 times the size for casting parameter and gradients to float
const size_t c_smoothed_gradients_factor = 3;
shared_ptr<Matrix<float>> compoundMatrixPtr = std::make_shared<Matrix<float>>(numRows,
numCols * c_smoothed_gradients_factor,
net->GetDeviceId());
// Initialize float parameters
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(node->Value());

smoothedGradientPtr = compoundMatrixPtr;
}
else
{
smoothedGradientPtr = std::make_shared<Matrix<ElemType>>(numRows,
numCols,
net->GetDeviceId());
}
smoothedGradients.push_back(smoothedGradientPtr);
smoothedCounts.push_back(0);
if (node->IsParameterUpdateRequired())
{
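Illustrative sketch (not part of this change): for half parameters the smoothedGradients entry allocated above is a single fp32 "compound" matrix of 3 * numCols columns. This diff only shows column block 0 (the smoothed gradient, reset after model aggregation) and block 2 (an fp32 master copy of the weights, kept in sync via CastAssignValuesOf) being accessed; the purpose of block 1 is not visible here and is assumed to be fp32 scratch for the gradient during the update. A minimal C++ sketch of that layout, reusing the variable names from the loop above.

// Sketch only: column-block layout of the per-parameter compound Matrix<float> for half training.
shared_ptr<Matrix<float>> compound = std::make_shared<Matrix<float>>(numRows, 3 * numCols, net->GetDeviceId());
auto smoothedGradient = compound->ColumnSlice(0 * numCols, numCols); // block 0: smoothed (momentum) gradient
auto gradientScratch  = compound->ColumnSlice(1 * numCols, numCols); // block 1: assumed fp32 gradient scratch (not shown in this diff)
auto masterWeights    = compound->ColumnSlice(2 * numCols, numCols); // block 2: fp32 master copy of the half weights
masterWeights.CastAssignValuesOf(node->Value());                     // cast the half parameter into the fp32 master copy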
@ -987,7 +1016,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
|
||||
StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer?
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes,
|
||||
std::list<Matrix<ElemType>>& smoothedGradients, vector<double>& smoothedCounts,
|
||||
std::list<MatrixBasePtr>& smoothedGradients, vector<double>& smoothedCounts,
|
||||
/*out*/ EpochCriterion& epochCriterion,
|
||||
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
|
||||
const std::string& prefixMsg,
|
||||
|
@@ -1389,7 +1418,25 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (node->IsParameterUpdateRequired())
{
#ifdef _DEBUG
if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
bool hasNan = false;
if (std::is_same<ElemType, half>())
{
// Get matrix from compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(*smoothedGradientIter);
if (compoundMatrixPtr)
{
size_t numCols = dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().GetNumCols();

auto smoothedGradient = compoundMatrixPtr->ColumnSlice(0, numCols);
hasNan = smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): ");
}
}
else
{
auto smoothedGradient = dynamic_pointer_cast<Matrix<ElemType>>(*smoothedGradientIter);
hasNan = smoothedGradient && smoothedGradient->HasNan("TrainOneEpoch/UpdateWeights(): ");
}
if (hasNan)
LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
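As an illustration of the debug-only NaN scan above (a standalone sketch, not the CNTK Matrix API), only the first cols-wide block of the compound buffer needs to be checked:

    #include <cmath>
    #include <cstddef>

    // Return true if any value in the first `cols` columns of a column-major
    // rows x totalCols buffer is NaN; the scratch-gradient and master-weight blocks are skipped.
    bool HasNanInFirstBlock(const float* data, size_t rows, size_t totalCols, size_t cols)
    {
        (void)totalCols;
        for (size_t i = 0; i < rows * cols; ++i)
            if (std::isnan(data[i]))
                return true;
        return false;
    }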
@@ -1811,7 +1858,7 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample)
{
@@ -1985,7 +2032,7 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const double learningRateAdjustmentFactor)
{
size_t minMinibatchSize = initialMinibatchSize;
@@ -2086,7 +2133,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize)
{
// may happen for automatically reduced learning rates
@@ -2190,7 +2237,7 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@@ -2264,6 +2311,24 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
}
}

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, nBits);
return std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, useAsyncAggregation, traceLevel, syncStatsTrace);
}
else
return std::make_shared<AllReduceDistGradAggregator<ElemType>>(mpi, nBits, zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, useAsyncAggregation, traceLevel, syncStatsTrace);
}

template <>
std::shared_ptr<IDistGradAggregator<half>> _GetAllReduceDistGradAggregator<half>(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
RuntimeError("SGD - half not supported for quantization!");
}

template <class ElemType>
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel)
{
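The pattern above — a generic factory plus a full specialization for half that only raises an error — can be sketched in isolation like this (illustrative only; Aggregator, QuantizedAggregator and the exception are local stand-ins, not the CNTK types):

    #include <memory>
    #include <stdexcept>

    struct half { unsigned short bits; };   // stand-in for the real half type

    struct Aggregator { virtual ~Aggregator() = default; };
    template <class ElemType> struct QuantizedAggregator : Aggregator {};

    // Generic path: quantized aggregation is available.
    template <class ElemType>
    std::shared_ptr<Aggregator> MakeQuantizedAggregator(int nBits)
    {
        (void)nBits;
        return std::make_shared<QuantizedAggregator<ElemType>>();
    }

    // Full specialization: half is rejected at run time with a clear message.
    template <>
    std::shared_ptr<Aggregator> MakeQuantizedAggregator<half>(int /*nBits*/)
    {
        throw std::runtime_error("half not supported for quantization");
    }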
@@ -2274,13 +2339,7 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits);
m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
}
else
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
m_distGradAgg = _GetAllReduceDistGradAggregator<ElemType>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
@@ -2289,15 +2348,38 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
{
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits);
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator(m_packThresholdSizeInBytes, m_useFP16AllReduce));
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes, m_useFP16AllReduce);
}

m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
}

template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
return make_shared<BlockMomentumSGD<ElemType>>(mpi, traceLevel, devID, useNesterovBlockMomentum, resetSGDMomentum, blockLearningRate, blockMomentumAsTimeConstant, modelAggregationBlockSize);
}

template <>
shared_ptr<IMASGD<half>> _GetBlockMomentumSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}

template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
return make_shared<BasicModelAveragingSGD<ElemType>>(mpi, traceLevel, devID);
}

template <>
shared_ptr<IMASGD<half>> _GetBasicModelAveragingSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
RuntimeError("SGD - half not supported for modelAveragingSGD");
}

template <class ElemType>
void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID)
{
@@ -2307,7 +2389,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
}
if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD)
{
m_pMASGDHelper = make_shared<BasicModelAveragingSGD<ElemType>>(m_mpi, traceLevel, devID);
m_pMASGDHelper = _GetBasicModelAveragingSGD<ElemType>(m_mpi, traceLevel, devID);
}
else if (GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
{
@@ -2329,7 +2411,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
m_modelAggregationBlockSize);
}
else
m_pMASGDHelper = make_shared<BlockMomentumSGD<ElemType>>(m_mpi, traceLevel, devID,
m_pMASGDHelper = _GetBlockMomentumSGD<ElemType>(m_mpi, traceLevel, devID,
m_useNesterovBlockMomentum, m_resetSGDMomentum,
m_blockLearningRate, m_blockMomentumAsTimeConstant,
m_modelAggregationBlockSize);
@@ -2341,6 +2423,47 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
// UpdateWeights() - actual weight update, implementing various update rules
template <class ElemType>
void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
MatrixBasePtr& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const
{
if (std::is_same<ElemType, half>())
{
// Get matrix from compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(smoothedGradientValues);
size_t numCols = functionValues.GetNumCols();

auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
auto tempGradientMatrix = compoundMatrixPtr->ColumnSlice(numCols, numCols);
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);

// Cast gradients to float
tempGradientMatrix.CastAssignValuesOf(gradientValues);

// Update
TypedUpdateWeights<float>(parameterMatrix, tempGradientMatrix, smoothedGradientMatrix, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);

// Cast parameter back to half
functionValues.CastAssignValuesOf(parameterMatrix);
}
else
{
auto sgv = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradientValues);
TypedUpdateWeights<>(functionValues, gradientValues, *sgv, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);
}
}

template <class ElemType1>
template <class ElemType>
void SGD<ElemType1>::TypedUpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
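A self-contained sketch of the mixed-precision update path shown above, with plain float arrays standing in for the Matrix column slices (the momentum rule here is simplified SGD-with-momentum, not the full CNTK update):

    #include <cstddef>
    #include <vector>

    // One step of momentum SGD on a float master copy of half parameters.
    // grad:      this minibatch's gradient already cast to float ("tempGradientMatrix")
    // smoothed:  running momentum buffer ("smoothedGradientMatrix")
    // master:    float master weights ("parameterMatrix")
    // halfParams: the half copy (modelled as floats here) refreshed from the master at the end
    void MixedPrecisionUpdate(std::vector<float>& halfParams,
                              const std::vector<float>& grad,
                              std::vector<float>& smoothed,
                              std::vector<float>& master,
                              float learnRate, float momentum)
    {
        for (size_t i = 0; i < master.size(); ++i)
        {
            smoothed[i] = momentum * smoothed[i] + (1.0f - momentum) * grad[i];
            master[i] -= learnRate * smoothed[i];
            halfParams[i] = master[i]; // plays the role of CastAssignValuesOf back to half
        }
    }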
@@ -2363,7 +2486,7 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
assert(actualMBSize > 0);

// clipping gradients to prevent outliers
ClipGradient(gradientValues, actualMBSize);
ClipGradient<ElemType>(gradientValues, actualMBSize);

GradientsUpdateType adpType = GradUpdateType();
double noiseStd = GradientUpdateNoiseStd();
@@ -2453,8 +2576,9 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
}

// protected:
template <class ElemType1>
template <class ElemType>
void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
void SGD<ElemType1>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
{
if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
{
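For reference, a minimal sketch of per-sample norm clipping in the spirit of ClipGradient (standalone, operating on a flat float gradient rather than the CNTK Matrix type; the exact clipping rule in CNTK may differ):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scale the gradient so that its L2 norm does not exceed thresholdPerSample * actualMBSize.
    void ClipGradientByNorm(std::vector<float>& gradient, size_t actualMBSize, double thresholdPerSample)
    {
        double maxNorm = thresholdPerSample * actualMBSize; // the threshold is expressed per sample
        double sumSq = 0.0;
        for (float g : gradient)
            sumSq += static_cast<double>(g) * g;
        double norm = std::sqrt(sumSq);
        if (norm > maxNorm)
        {
            float scale = static_cast<float>(maxNorm / norm);
            for (float& g : gradient)
                g *= scale;
        }
    }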
@@ -2474,10 +2598,30 @@ void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actual
}
}

template <class ElemType>
static void SaveSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream << smoothedGradientValues;
}

template <class ElemType>
static void LoadSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream >> smoothedGradientValues;
}

template <class ElemType>
void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize)
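A small sketch of the idea behind SaveSmoothedGradient/LoadSmoothedGradient — verify the runtime element type behind the base pointer before serializing (illustrative; a plain struct hierarchy and iostreams are used instead of the CNTK Matrix and File classes). In the checkpoint loop, a half model dispatches with ElemType = float because its smoothed state is stored in float.

    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <vector>

    struct MatrixBase { virtual ~MatrixBase() = default; };
    template <class T> struct DenseMatrix : MatrixBase { std::vector<T> values; };
    using MatrixBasePtr = std::shared_ptr<MatrixBase>;

    // Serialize the buffer, failing loudly if the stored type does not match the expected one.
    template <class ElemType>
    void SaveSmoothedGradient(std::ostream& out, const MatrixBasePtr& smoothedGradient)
    {
        auto typed = std::dynamic_pointer_cast<DenseMatrix<ElemType>>(smoothedGradient);
        if (!typed)
            throw std::runtime_error("Failed to cast, type mismatch");
        out.write(reinterpret_cast<const char*>(typed->values.data()),
                  typed->values.size() * sizeof(ElemType));
    }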
@@ -2510,10 +2654,12 @@ void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSam

fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream << smoothedGradientValues;
if (std::is_same<ElemType, half>())
SaveSmoothedGradient<float>(fstream, smoothedGradient);
else
SaveSmoothedGradient<ElemType>(fstream, smoothedGradient);
}

fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2553,7 +2699,7 @@ template <class ElemType>
bool SGD<ElemType>::TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@@ -2582,7 +2728,7 @@ template <class ElemType>
void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@@ -2600,6 +2746,9 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
}

if (ckpVersion > CURRENT_CNTK_CHECKPOINT_VERSION)
RuntimeError("The checkpoint file has a newer format version (%d) than this CNTK version can handle (%d).", (int)ckpVersion, (int)CURRENT_CNTK_CHECKPOINT_VERSION);

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
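A tiny sketch of this forward-compatibility gate (standalone; the real code reads the version from the checkpoint's version section and uses the CNTK error macros):

    #include <stdexcept>

    const int kCurrentCheckpointVersion = 3; // version 3 adds float smoothed gradients for half parameters

    void CheckCheckpointVersion(int fileVersion)
    {
        if (fileVersion > kCurrentCheckpointVersion)
            throw std::runtime_error("The checkpoint file has a newer format version than this build can handle.");
        // Older versions fall through; the loader continues with backward-compatible defaults.
    }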
@@ -2618,10 +2767,12 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream >> smoothedGradientValues;
if (std::is_same<ElemType, half>())
LoadSmoothedGradient<float>(fstream, smoothedGradient);
else
LoadSmoothedGradient<ElemType>(fstream, smoothedGradient);
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2824,6 +2975,7 @@ void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNet
nodeIter->SetEvalTimeStampOutdatedWrtAll();
}

template class SGD<half>;
template class SGD<float>;
template class SGD<double>;
@@ -3306,12 +3458,14 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
{
wstring precision = configp->Get(L"precision");
if (precision == L"float")
if (precision == L"float16")
return sizeof(half);
else if (precision == L"float")
return sizeof(float);
else if (precision == L"double")
return sizeof(double);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
}

SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
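A standalone sketch of this precision-to-size mapping, including the new float16 branch (plain std::string instead of the BrainScript config record, and a hard-coded 2 for sizeof(half)):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Map the BrainScript 'precision' value to the element size used by SGD.
    size_t GetSizeOfPrecision(const std::string& precision)
    {
        if (precision == "float16")
            return 2; // sizeof(half)
        else if (precision == "float")
            return sizeof(float);
        else if (precision == "double")
            return sizeof(double);
        throw std::invalid_argument("invalid value '" + precision + "' for 'precision', must be 'float16' or 'float' or 'double'");
    }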
@@ -25,7 +25,8 @@ using namespace std; // ugh! TODO: get rid of this from .h files!!!

#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
#define CNTK_CHECKPOINT_VERSION_2 2
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_2
#define CNTK_CHECKPOINT_VERSION_3 3 // float smoothed gradients for float16/half parameters
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_3

namespace CNTK { namespace Internal {
// Forward declarations.
@@ -442,7 +443,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample);
@@ -458,7 +459,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@@ -478,7 +479,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const double learningRateAdjustmentFactor);

// uses a small percentage of training data of minibatch to
@@ -496,7 +497,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize);

// Attempts to compute the error signal for the whole utterance, which will
@@ -523,7 +524,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double>& smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double>& smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "",
@@ -534,26 +535,37 @@ protected:

void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
public:
private:
// UpdateWeights() - actual weight update, implementing various update rules
void UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradient, double& smoothedCount,
MatrixBasePtr& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;

template<class ElemType2 = ElemType>
void TypedUpdateWeights(Matrix<ElemType2>& functionValues, Matrix<ElemType2>& gradientValues,
Matrix<ElemType2>& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
public:
// return -1 if nothing exists
int DetermineStartEpoch(const bool makeMode);

wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false) const;

protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
template<class ElemType2 = ElemType>
void ClipGradient(Matrix<ElemType2>& gradient, const size_t actualMBSize) const;

void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize);
@@ -561,14 +573,14 @@ protected:
bool TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
void LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
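A compact sketch of one plausible way such a Try/Load pairing can relate (illustrative only; not necessarily CNTK's exact semantics, and plain bool plus an exception replace the CNTK error macros):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Illustrative stub: report that no checkpoint file exists for this epoch.
    bool TryLoadCheckPointInfo(size_t epochNumber, double& learnRatePerSample)
    {
        (void)epochNumber; (void)learnRatePerSample;
        return false;
    }

    // Hard-load variant: a missing checkpoint becomes an error instead of a silent no-op.
    void LoadCheckPointInfo(size_t epochNumber, double& learnRatePerSample)
    {
        if (!TryLoadCheckPointInfo(epochNumber, learnRatePerSample))
            throw std::runtime_error("checkpoint for epoch " + std::to_string(epochNumber) + " not found");
    }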
@@ -137,6 +137,7 @@
<ClInclude Include="MASGD.h" />
<ClInclude Include="PostComputingActions.h" />
<ClInclude Include="SimpleDistGradAggregator.h" />
<ClInclude Include="SimpleDistGradAggregatorHelper.h" />
<ClInclude Include="SimpleEvaluator.h" />
<ClInclude Include="SimpleOutputWriter.h" />
<ClInclude Include="SGD.h" />
@@ -149,6 +150,7 @@
<ClCompile Include="PostComputingActions.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp" />
<ClCompile Include="stdafx.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
@@ -16,6 +16,9 @@
<ClCompile Include="ASGDHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@@ -144,6 +147,9 @@
<ClInclude Include="AccumulatorAggregation.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="SimpleDistGradAggregatorHelper.h">
<Filter>Parallelization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
@@ -0,0 +1,82 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma warning(disable : 4267) // conversion from size_t to int or other types

#include "Basics.h"
#include "MPIWrapper.h"
#include "Matrix.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
return std::make_shared<SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
packThresholdSizeInBytes);
}

template <>
std::shared_ptr<IDistGradAggregator<half>> GetSimpleDistGradAggregator<half>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<half>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}

template std::shared_ptr<IDistGradAggregator<float>> GetSimpleDistGradAggregator<float>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);

template std::shared_ptr<IDistGradAggregator<double>> GetSimpleDistGradAggregator<double>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);

}}}
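The trailing `template ...` lines above are explicit instantiations, needed because the helper is defined in a .cpp rather than a header. A minimal standalone illustration of the same pattern (generic names, not CNTK's):

    // helper.h (declaration visible to all callers)
    template <class T>
    T Twice(T value);

    // helper.cpp (definition hidden in one translation unit)
    template <class T>
    T Twice(T value) { return value + value; }

    // Explicit instantiations: without these, callers in other translation units
    // would get unresolved symbols for Twice<float> and Twice<double>.
    template float Twice<float>(float);
    template double Twice<double>(double);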
@@ -0,0 +1,24 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "Constants.h"
#include "IDistGradAggregator.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES,
bool useFP16AllReduce = false);

}}}
@@ -5,8 +5,6 @@

#pragma once

#include "V2SimpleDistGradAggregator.h"

#include "AccumulatorAggregation.h"
#include "Basics.h"
#include "DataReader.h"
@@ -18,7 +16,7 @@
#include "ProgressTracing.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "Criterion.h"
#include "Globals.h"
@@ -167,10 +165,7 @@ public:
DistGradHeader::Destroy(ptr);
});

if (Globals::UseV2Aggregator())
m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
else
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
}

m_gradHeader->numEvalNode = evalNodes.size();
@@ -109,7 +109,7 @@ public:

// Synchronize the Quantization compute stream with the completion of
// compute of the gradient matrices on the main compute stream
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
delete mainStreamSyncEvent;

AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
@ -185,7 +185,7 @@ private:
|
|||
if (m_useAsyncAggregation)
|
||||
{
|
||||
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
|
||||
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
|
||||
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
// This is a watch guard to make sure that any change in the model version will be detected.
// If you change the CNTK model version, please do not silently adapt this test.
// Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 30, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 31, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
}

BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)
@@ -61,7 +61,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;Cntk.SGD-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(BOOST_LIB_PATH);$(NvmlLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>