From 4003c087a19669d1c75806aaf958991222fb1a71 Mon Sep 17 00:00:00 2001 From: rpengms <40006668+rpengms@users.noreply.github.com> Date: Wed, 20 Mar 2019 11:36:16 -0700 Subject: [PATCH] Merge fp16 brainscript work (#3606) * FP16 BrainScript - address code review comments * Remove Tab and fix debug build breaks * Fix Linux Build breaks * fp16 brain script - add _CRT_SECURE_NO_WARNINGS * fp16 brain script - fix NetworkTests * Update tests for model version change * Remove changes for InputAndParamNodes * Fix typo * Remove redundant code * Fix optional parameters --- Makefile | 1 + Source/1BitSGD/BlockMomentumSGD.h | 12 +- Source/1BitSGD/V2BlockMomentumSGD.h | 50 ++-- Source/ActionsLib/NDLNetworkBuilder.cpp | 117 ++++++++- Source/ActionsLib/NDLNetworkBuilder.h | 6 +- .../ActionsLib/NetworkDescriptionLanguage.cpp | 7 + .../ActionsLib/NetworkDescriptionLanguage.h | 1 + Source/ActionsLib/NetworkFactory.cpp | 3 + Source/ActionsLib/SimpleNetworkBuilder.cpp | 1 + Source/ActionsLib/SimpleNetworkBuilder.h | 10 +- Source/ActionsLib/TrainActions.cpp | 8 +- Source/CNTK/CNTK.cpp | 161 ++++++++----- Source/Common/Include/Config.h | 7 + .../ComputationNetwork.cpp | 28 +++ .../ComputationNetwork.h | 8 + .../ComputationNetworkBuilder.cpp | 63 ++++- .../ComputationNetworkBuilder.h | 19 +- .../ComputationNetworkLib/ComputationNode.h | 4 +- Source/EvalDll/CNTKEval.cpp | 3 + Source/SGDLib/ASGDHelper.cpp | 14 ++ Source/SGDLib/AccumulatorAggregation.h | 22 +- Source/SGDLib/DataReaderHelpers.h | 4 +- Source/SGDLib/MASGD.h | 12 +- Source/SGDLib/SGD.cpp | 228 +++++++++++++++--- Source/SGDLib/SGD.h | 36 ++- Source/SGDLib/SGDLib.vcxproj | 2 + Source/SGDLib/SGDLib.vcxproj.filters | 6 + .../SGDLib/SimpleDistGradAggregatorHelper.cpp | 82 +++++++ .../SGDLib/SimpleDistGradAggregatorHelper.h | 24 ++ Source/SGDLib/SimpleEvaluator.h | 9 +- Source/SGDLib/V2SimpleDistGradAggregator.h | 4 +- .../UnitTests/EvalTests/EvalExtendedTests.cpp | 2 +- .../NetworkTests/NetworkTests.vcxproj | 2 +- 33 files changed, 769 insertions(+), 187 deletions(-) create mode 100644 Source/SGDLib/SimpleDistGradAggregatorHelper.cpp create mode 100644 Source/SGDLib/SimpleDistGradAggregatorHelper.h diff --git a/Makefile b/Makefile index bbdee61b2..cb651d080 100644 --- a/Makefile +++ b/Makefile @@ -707,6 +707,7 @@ SGDLIB_SRC=\ $(SOURCEDIR)/SGDLib/Profiler.cpp \ $(SOURCEDIR)/SGDLib/SGD.cpp \ $(SOURCEDIR)/SGDLib/PostComputingActions.cpp \ + $(SOURCEDIR)/SGDLib/SimpleDistGradAggregatorHelper.cpp \ SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC) diff --git a/Source/1BitSGD/BlockMomentumSGD.h b/Source/1BitSGD/BlockMomentumSGD.h index b96476973..ca2d1c388 100644 --- a/Source/1BitSGD/BlockMomentumSGD.h +++ b/Source/1BitSGD/BlockMomentumSGD.h @@ -94,15 +94,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } /*virtual*/ void OnEpochEnd(const std::list& LearnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t samplesSinceLastSync) override { - Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync); + Base::OnEpochEnd(LearnableNodes, smoothedGradients, samplesSinceLastSync); } /*virtual*/ void ModelAggregationProcessing( size_t samplesSinceLastSync, const std::list& learnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t& totalSamplesProcessed, float& secondsOnCommunication ) override @@ -181,9 +181,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //---------------------------------------- if (m_resetSGDMomentumAfterAggregation) { - for (Matrix& x : smoothedGradient) + for 
(auto sg : smoothedGradients)
             {
-                x.SetValue((ElemType)0);
+                auto x = dynamic_pointer_cast<Matrix<ElemType>>(sg);
+                if (x != nullptr)
+                    x->SetValue((ElemType)0);
             }
         }
     }
diff --git a/Source/1BitSGD/V2BlockMomentumSGD.h b/Source/1BitSGD/V2BlockMomentumSGD.h
index e9ededc92..805f238d3 100644
--- a/Source/1BitSGD/V2BlockMomentumSGD.h
+++ b/Source/1BitSGD/V2BlockMomentumSGD.h
@@ -108,7 +108,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     bool OnArrivingAtSyncPoint(
         const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
-        std::list<Matrix<ElemType>>& smoothedGradient,  /* input/output: under some setup, it will reset to zero*/
+        std::list<MatrixBasePtr>& smoothedGradients,    /* input/output: under some setup, it will reset to zero*/
         size_t samplesSinceLastSync                     /* input: samples processed since last sync on this worker only */
         ) override
     {
@@ -130,12 +130,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Otherwise, let's update the weights.
         float secondsOnCommunication = 0.0f;
         size_t totalSamples = 0;
-        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
+        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
         return true;
     }

     /*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                std::list<Matrix<ElemType>>& smoothedGradient,
+                                std::list<MatrixBasePtr>& smoothedGradients,
                                 size_t samplesSinceLastSync) override
     {
         if (!m_someWorkerHasFinished)
@@ -152,13 +152,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Let's update our weights no matter what.
         float secondsOnCommunication = 0.0f;
         size_t totalSamples = 0;
-        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
+        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
     }

     /*virtual*/ void ModelAggregationProcessing(
         size_t /*samplesSinceLastSync*/,
         const std::list<ComputationNodeBasePtr>& learnableNodes,
-        std::list<Matrix<ElemType>>& smoothedGradient,
+        std::list<MatrixBasePtr>& smoothedGradients,
         size_t& /*totalSamplesProcessed*/, /* out */
         float& secondsOnCommunication      /* out */
         ) override
@@ -196,8 +196,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());

         // 2. Let's update the model
-        for (auto& pBaseNode : learnableNodes)
+        auto smoothedGradientIter = smoothedGradients.begin();
+        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
         {
+            ComputationNodeBasePtr pBaseNode = *nodeIter;
             if (!pBaseNode->IsParameterUpdateRequired())
                 continue;
@@ -235,15 +237,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // 2.2.4 update bookkeeping
                 prevWeight.SetValue(currentWeight);
             }
-        }
-        //----------------------------------------
-        // 3. reset SGD momentum if necessary
-        //----------------------------------------
-        if (m_resetSGDMomentumAfterAggregation)
-        {
-            for (Matrix<ElemType>& x : smoothedGradient)
-            {
-                x.SetValue((ElemType)0);
-            }
-        }
+
+            //----------------------------------------
+            // 3. reset SGD momentum if necessary
+            //----------------------------------------
+            {
+                // For half, we keep a copy of float weights; update that too
+                if (std::is_same<ElemType, half>())
+                {
+                    auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(*smoothedGradientIter);
+                    size_t numCols = currentWeight.GetNumCols();
+
+                    auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
+                    parameterMatrix.CastAssignValuesOf(currentWeight);
+
+                    if (m_resetSGDMomentumAfterAggregation)
+                    {
+                        // Only reset smoothed gradients
+                        auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
+                        smoothedGradientMatrix.SetValue(0.0f);
+                    }
+                }
+                else
+                {
+                    if (m_resetSGDMomentumAfterAggregation)
+                    {
+                        auto x = dynamic_pointer_cast<Matrix<ElemType>>(*smoothedGradientIter);
+                        x->SetValue((ElemType)0);
+                    }
+                }
+            }
         }
     }
diff --git a/Source/ActionsLib/NDLNetworkBuilder.cpp b/Source/ActionsLib/NDLNetworkBuilder.cpp
index dbcf036a2..734ece489 100644
--- a/Source/ActionsLib/NDLNetworkBuilder.cpp
+++ b/Source/ActionsLib/NDLNetworkBuilder.cpp
@@ -22,6 +22,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 using namespace std;

+template <typename ElemType>
+static inline bool isprecision(std::wstring& str)
+{
+    if ((str == L"") && std::is_same<ElemType, float>())
+        return true;
+    if (std::is_same<ElemType, half>())
+        return EqualCI(str, L"float16");
+    else if (std::is_same<ElemType, float>())
+        return EqualCI(str, L"float");
+    else if (std::is_same<ElemType, double>())
+        return EqualCI(str, L"double");
+    return false;
+}
+
 template <class ElemType>
 void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
 {
@@ -48,7 +62,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
     std::wstring cnNodeType = Microsoft::MSR::CNTK::ToFixedWStringFromMultiByte(node->GetValue());

-    ComputationNodePtr nodePtr;
+    ComputationNodeBasePtr nodePtr;

     // get the node pointer for the node, should be stored in the EvalValue;
     if (pass > ndlPassInitial)
     {
         nodePtr = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
         if (!nodePtr)
         {
-            nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
+            nodePtr = m_net->GetNodeFromName(name);
             node->SetEvalValue(nodePtr.get());
         }
     }
@@ -75,15 +89,48 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
     auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
     wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
+    wstring precision = node->GetOptionalParameter("precision", "");
+
     // TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.

     // first look for this node already existing in the network
     // BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name)) - nodePtr = dynamic_pointer_cast>(m_net->GetNodeFromName(name)); - else if (isSparse) - nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis); + nodePtr = m_net->GetNodeFromName(name); else - nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis); + { + if (precision == L"") + { + if (isSparse) + nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"float")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"double")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"float16")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else + { + RuntimeError("NDLNetworkBuilder: Input: the 'precision' parameter if specified, must be 'float', 'double' or 'float16'."); + } + } } } else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput") @@ -193,7 +240,7 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst nodePtr = builder.CreateLearnableParameter(name, rows, cols); nodePtr->SetLearningRateMultiplier(0); } - else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0) + else if (pass == ndlPassFinal || (dynamic_pointer_cast> (nodePtr))->Value().GetNumElements() != 0) { ElemType val = parameter[0]->GetScalar(); m_net->InitLearnableParameters(nodePtr, L"fixedValue", val); @@ -607,6 +654,56 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst nodeParamCount = nodePtr->GetNumInputs(); } } + else if (cnNodeType == OperationName2Of(CastNode)) + { + if (parameter.size() < 1) + RuntimeError("%ls should have 1 or more parameters (node and cast precision).", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + auto sourceNode = (NDLNode*) params[0]; + wstring sourcePrecision = sourceNode->GetOptionalParameter("precision", ""); + wstring targetPrecision = node->GetOptionalParameter("precision", ""); + if (EqualCI(targetPrecision, L"float16")) + { + ComputationNetworkBuilder builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'"); + } + else if (EqualCI(targetPrecision, L"float")) + { + ComputationNetworkBuilder builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to float, input must be 'float16' or 'double'"); + } + else if (EqualCI(targetPrecision, L"double")) + { + ComputationNetworkBuilder 
builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to double, input must be 'float' or 'float16'"); + } + else + RuntimeError("NDLNetworkBuilder: CastNode - need to specify 'precision' parameter: 'float', 'double' or 'float16'."); + } + } else { @@ -645,7 +742,10 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst #if 1 vector inputNodes; for (let& in : inputs) - inputNodes.push_back(ComputationNode::FromVoidPtr(in)); + { + ComputationNodeBase *p = (ComputationNodeBase *)in; + inputNodes.push_back(p ? p->shared_from_this() : nullptr); + } nodePtr->AttachInputs(inputNodes); #else // TODO: delete this @@ -714,6 +814,7 @@ TensorShape NDLNodeEvaluatorImpl::ProcessTensorShapeParameters(const N return TensorShape(dims); } +template class NDLBuilderImpl; template class NDLBuilderImpl; template class NDLBuilderImpl; diff --git a/Source/ActionsLib/NDLNetworkBuilder.h b/Source/ActionsLib/NDLNetworkBuilder.h index 1f40ef1fd..53b2727af 100644 --- a/Source/ActionsLib/NDLNetworkBuilder.h +++ b/Source/ActionsLib/NDLNetworkBuilder.h @@ -269,10 +269,11 @@ public: } // ProcessOptionalParameters - Process the optional parameters of a node - virtual void ProcessOptionalParameters(NDLNode* node) + virtual void ProcessOptionalParameters(NDLNode* node) override { vector*> params = node->GetParameters(true); // get all the optional parameters only - auto compNode = ComputationNode::FromVoidPtr(node->GetEvalValue()); + ComputationNodeBase* compNodePtr = (ComputationNodeBase *) (node->GetEvalValue()); + ComputationNodeBasePtr compNode = compNodePtr ? compNodePtr->shared_from_this() : nullptr; std::string empty; // loop through all the optional parameters processing them as necessary @@ -582,6 +583,7 @@ private: DEVICEID_TYPE m_deviceId; }; +template class NDLBuilder; template class NDLBuilder; template class NDLBuilder; diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.cpp b/Source/ActionsLib/NetworkDescriptionLanguage.cpp index 9f9976a72..401a76985 100644 --- a/Source/ActionsLib/NetworkDescriptionLanguage.cpp +++ b/Source/ActionsLib/NetworkDescriptionLanguage.cpp @@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) else if (EqualInsensitive(nodeType, OperationNameOf(AtanhNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(BatchNormalizationNode))) ret = true; + else if (EqualInsensitive(nodeType, OperationName2Of(CastNode))) ret = true; #ifdef COMING_SOON else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true; #endif @@ -267,18 +268,24 @@ NDLScript NDLScript::s_global("global"); // declare the static variables from the classes template <> +NDLScript NDLScript::s_global{}; +template <> NDLScript NDLScript::s_global{}; template <> NDLScript NDLScript::s_global{}; +template <> +int NDLNode::s_nameCounter = 0; template <> int NDLNode::s_nameCounter = 0; template <> int NDLNode::s_nameCounter = 0; +template class NDLNode; template class NDLNode; template class NDLNode; +template class NDLScript; template class NDLScript; template class NDLScript; diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.h b/Source/ActionsLib/NetworkDescriptionLanguage.h index ce44c96db..322f45082 100644 --- 
a/Source/ActionsLib/NetworkDescriptionLanguage.h +++ b/Source/ActionsLib/NetworkDescriptionLanguage.h @@ -98,6 +98,7 @@ public: } }; +template class NDLNodeEvaluator; template class NDLNodeEvaluator; template class NDLNodeEvaluator; diff --git a/Source/ActionsLib/NetworkFactory.cpp b/Source/ActionsLib/NetworkFactory.cpp index 21c9b9346..281e311bd 100644 --- a/Source/ActionsLib/NetworkFactory.cpp +++ b/Source/ActionsLib/NetworkFactory.cpp @@ -188,9 +188,12 @@ ComputationNetworkPtr GetModelFromConfig(const ConfigRecordType& config, const w return net; } +template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); +template function GetNetworkFactory(const ConfigParameters& config); template function GetNetworkFactory(const ConfigParameters& config); template function GetNetworkFactory(const ConfigParameters& config); +template ComputationNetworkPtr GetModelFromConfig (const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); template ComputationNetworkPtr GetModelFromConfig (const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); template ComputationNetworkPtr GetModelFromConfig(const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); diff --git a/Source/ActionsLib/SimpleNetworkBuilder.cpp b/Source/ActionsLib/SimpleNetworkBuilder.cpp index ecc793744..dc19b0056 100644 --- a/Source/ActionsLib/SimpleNetworkBuilder.cpp +++ b/Source/ActionsLib/SimpleNetworkBuilder.cpp @@ -1775,6 +1775,7 @@ shared_ptr> SimpleNetworkBuilder::AddTrainAn return output; } +template class SimpleNetworkBuilder; template class SimpleNetworkBuilder; template class SimpleNetworkBuilder; diff --git a/Source/ActionsLib/SimpleNetworkBuilder.h b/Source/ActionsLib/SimpleNetworkBuilder.h index ff823597f..b3fc39f72 100644 --- a/Source/ActionsLib/SimpleNetworkBuilder.h +++ b/Source/ActionsLib/SimpleNetworkBuilder.h @@ -159,9 +159,13 @@ public: m_constInputGateValue = config("constInputGateValue", "false"); m_constOutputGateValue = config("constOutputGateValue", "false"); - m_forgetGateInitVal = config("forgetGateInitVal", "-1"); - m_inputGateInitVal = config("inputGateInitVal", "-1"); - m_outputGateInitVal = config("outputGateInitVal", "-1"); + ElemType forgetGateInitVal = config("forgetGateInitVal", "-1"); + ElemType inputGateInitVal = config("inputGateInitVal", "-1"); + ElemType outputGateInitVal = config("outputGateInitVal", "-1"); + + m_forgetGateInitVal = forgetGateInitVal; + m_inputGateInitVal = inputGateInitVal; + m_outputGateInitVal = outputGateInitVal; m_sparse_input = config("sparseinput", "false"); diff --git a/Source/ActionsLib/TrainActions.cpp b/Source/ActionsLib/TrainActions.cpp index e3b75654e..a3b178d3f 100644 --- a/Source/ActionsLib/TrainActions.cpp +++ b/Source/ActionsLib/TrainActions.cpp @@ -142,12 +142,14 @@ shared_ptr MakeRuntimeObject(const IConfigRecordPtr configp { const IConfigRecord& config = *configp; wstring precision = config[L"precision"]; // dispatch on ElemType - if (precision == L"float") + if (precision == L"float16") + DoTrain(config); + else if (precision == L"float") DoTrain(config); else if (precision == L"double") DoTrain(config); else - RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str()); + RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", 
precision.c_str()); return make_shared(); // return a dummy object } @@ -156,8 +158,10 @@ shared_ptr MakeRuntimeObject(const IConfigRecordPtr configp ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerTrainAction(L"TrainAction"); }}} +template void DoTrain(const ScriptableObjects::IConfigRecord& config); template void DoTrain(const ScriptableObjects::IConfigRecord& config); template void DoTrain(const ScriptableObjects::IConfigRecord& config); +template void DoTrain(const ConfigParameters& config); template void DoTrain(const ConfigParameters& config); template void DoTrain(const ConfigParameters& config); diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 892f667a5..d58f36be2 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -171,6 +171,91 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con // be run in parallel across multiple ranks. Others should only run on rank 0 const std::set commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" }; + +template +bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& config) +{ + if (thisAction == "train" || thisAction == "trainRNN") + { + DoTrain(commandParams); + } + else if (thisAction == "bnstat") + { + DoBatchNormalizationStat(commandParams); + } + else if (thisAction == "adapt") + { + DoAdapt(commandParams); + } + else if (thisAction == "test" || thisAction == "eval") + { + DoEval(commandParams); + } + else if (thisAction == "edit") + { + DoEdit(commandParams); + } + else if (thisAction == "cv") + { + DoCrossValidate(commandParams); + } + else if (thisAction == "write") + { + DoWriteOutput(commandParams); + } + else if (thisAction == "devtest") + { + TestCn(config); // for "devtest" action pass the root config instead + } + else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode") + { + DoDumpNodes(commandParams); + } + else if (thisAction == "convertdbn") + { + DoConvertFromDbn(commandParams); + } + else if (thisAction == "exportdbn") + { + DoExportToDbn(commandParams); + } + else if (thisAction == "createLabelMap") + { + DoCreateLabelMap(commandParams); + } + else if (thisAction == "writeWordAndClass") + { + DoWriteWordAndClassInfo(commandParams); + } + else if (thisAction == "plot") + { + DoTopologyPlot(commandParams); + } + else if (thisAction == "SVD") + { + DoParameterSVD(commandParams); + } + else + { + return false; + } + return true; +} + +template <> +bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& ) +{ + if (thisAction == "train" || thisAction == "trainRNN") + { + DoTrain(commandParams); + } + else + { + RuntimeError("half only supported for action train or trainRNN!"); + } + return true; +} + // process the command template void DoCommands(const ConfigParameters& config, const shared_ptr& mpi) @@ -270,73 +355,21 @@ void DoCommands(const ConfigParameters& config, const shared_ptr& mp { LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str()); } - DoTrain(commandParams); + } + + if (!DispatchThisAction(thisAction, commandParams, config)) + { + RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str()); + } + + if (thisAction == "train" || thisAction == "trainRNN") + { if (progressTracing) { LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str()); } fullEpochsOffset += 
GetMaxEpochs(commandParams); } - else if (thisAction == "bnstat") - { - DoBatchNormalizationStat(commandParams); - } - else if (thisAction == "adapt") - { - DoAdapt(commandParams); - } - else if (thisAction == "test" || thisAction == "eval") - { - DoEval(commandParams); - } - else if (thisAction == "edit") - { - DoEdit(commandParams); - } - else if (thisAction == "cv") - { - DoCrossValidate(commandParams); - } - else if (thisAction == "write") - { - DoWriteOutput(commandParams); - } - else if (thisAction == "devtest") - { - TestCn(config); // for "devtest" action pass the root config instead - } - else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode") - { - DoDumpNodes(commandParams); - } - else if (thisAction == "convertdbn") - { - DoConvertFromDbn(commandParams); - } - else if (thisAction == "exportdbn") - { - DoExportToDbn(commandParams); - } - else if (thisAction == "createLabelMap") - { - DoCreateLabelMap(commandParams); - } - else if (thisAction == "writeWordAndClass") - { - DoWriteWordAndClassInfo(commandParams); - } - else if (thisAction == "plot") - { - DoTopologyPlot(commandParams); - } - else if (thisAction == "SVD") - { - DoParameterSVD(commandParams); - } - else - { - RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str()); - } } fprintf(stderr, "\n"); @@ -740,12 +773,14 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) LOGPRINTF(stderr, "precision = \"%s\"\n", type.c_str()); } - if (type == "float") + if (type == "float16") + DoCommands(config, mpi); + else if (type == "float") DoCommands(config, mpi); else if (type == "double") DoCommands(config, mpi); else - RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float\" or \"double\"", type.c_str()); + RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float16\" or \"float\" or \"double\"", type.c_str()); // if completed then write a doneFile if requested if (!doneFile.empty()) diff --git a/Source/Common/Include/Config.h b/Source/Common/Include/Config.h index 0adf2e357..e9f1c09c1 100644 --- a/Source/Common/Include/Config.h +++ b/Source/Common/Include/Config.h @@ -8,6 +8,8 @@ #include #include #include +#include "File.h" +#include "half.hpp" using namespace std; @@ -150,6 +152,11 @@ public: return (float) (double) *this; } + operator half() const + { + return (half)(double)*this; + } + private: long tolong() const { diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 6801f5b45..ecdaeb243 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -150,6 +150,20 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio else if (nodePtr->Is>()) precision = ElemTypeName(); else LogicError("Unexpected node type."); +#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31 + if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); +#endif fstream << precision; #endif fstream << nodePtr->OperationName(); @@ -265,6 +279,20 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs node = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); 
else if (precision == L"") // old file format: default to node = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); +#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31 + else if (precision == L"half,float") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"half,double") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"float,half") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"float,double") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"double,half") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"double,float") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); +#endif else RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 85d6922da..64975e585 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -1313,6 +1313,14 @@ template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"f template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"double"; } template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"half"; } +template static inline const wchar_t* ElemTypeName2(); +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"float,half"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"float,double"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"double,half"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"double,float"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"half,float"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"half,double"; } + // The following emits the class and enables the BaseMatrix to be available (used by EvalDll) // The corresponding Matrix is emitted in the SetDeviceId function above. template class Matrix; diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 113cdb22b..2d79ddfea 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -175,6 +175,13 @@ static shared_ptr> CreateNode(const std::wstring& node else return CreateStandardNode(nodeType, forward<_Types>(_Args)...); } +template +static shared_ptr> CreateNode2(const std::wstring& nodeType, _Types&&... 
_Args) +{ + // check more types + if (nodeType == OperationName2Of(CastNode)) return New>(forward<_Types>(_Args)...); + else RuntimeError("CreateNode2: unsupported nodeType - %S", nodeType.c_str()); +} // this function is called from SimpleNetworkBuilder and old NDL template /*static*/ shared_ptr> ComputationNetworkBuilder::NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name) @@ -189,6 +196,13 @@ template return CreateNode(nodeType, deviceId, name); } +template +template +/*static*/ shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name) +{ + return CreateNode2(nodeType, deviceId, name); +} + shared_ptr NewComputationNodeFromConfig(const Microsoft::MSR::ScriptableObjects::IConfigRecordPtr configp) { wstring precision = configp->Get(L"precision"); // dispatch on ElemType @@ -247,15 +261,17 @@ shared_ptr> ComputationNetworkBuilder::Creat } template -shared_ptr> ComputationNetworkBuilder::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName) +template +shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName) { - return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName)); + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName)); } template -shared_ptr> ComputationNetworkBuilder::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName) +template +shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName) { - return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName)); + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName)); } template @@ -318,6 +334,12 @@ shared_ptr> ComputationNetworkBuilder::Creat { return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), nodeName)); } +template +template +shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName) +{ + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), nodeName)); +} // this is the catch-all for all cases not covered as special cases above // Unlike the specialized ones above, this one creates nodes by type given as a string. 
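Note: the NewNode2/CreateNode2 path above mirrors the existing single-precision factory, but dispatches on an (ElemType, ElemType2) pair so that a node parameterized on two element types, such as the new CastNode, can be constructed by name. A minimal, self-contained sketch of that dispatch shape, assuming hypothetical names (FooNode and CreateNode2Sketch are illustrative, not CNTK's actual classes):

#include <memory>
#include <stdexcept>
#include <string>

template <class From, class To>
struct FooNode // stand-in for a node parameterized on two element types
{
    static std::wstring TypeName() { return L"Cast"; }
};

template <class From, class To>
std::shared_ptr<FooNode<From, To>> CreateNode2Sketch(const std::wstring& nodeType)
{
    // the real factory checks the requested type name before constructing
    if (nodeType == FooNode<From, To>::TypeName())
        return std::make_shared<FooNode<From, To>>();
    throw std::runtime_error("CreateNode2Sketch: unsupported nodeType");
}

The design point is that the (From, To) pair is fixed at the call site (from the precision tags), so each instantiation only needs to recognize the node types that actually exist for two precisions.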
@@ -997,4 +1019,37 @@ template shared_ptr> ComputationNetworkBuilder::Typ template shared_ptr> ComputationNetworkBuilder::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); template shared_ptr> ComputationNetworkBuilder::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> 
ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); + +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); }}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index 826c582c3..604dcab58 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -38,6 +38,8 @@ public: // TODO: move into a separate header/class, to decouple from this class which would then be only used by old NDL and SimpleNetworkBuilder. static ComputationNodePtr NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); static ComputationNodePtr NewNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); + template + static ComputationNodePtr NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); // The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs). // There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others. 
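Note: the header change below keeps the old CreateInputNode/CreateSparseInputNode signatures and turns them into thin wrappers over new template members, with ElemType2 defaulting to the builder's own ElemType. A minimal sketch of the pattern, with illustrative names (BuilderSketch is not the actual CNTK class):

#include <string>

template <class ElemType>
struct BuilderSketch
{
    template <class ElemType2>
    void TypedCreateInput(const std::wstring& /*name*/)
    {
        // the real code constructs an input node of element type ElemType2 here
    }

    // legacy entry point keeps its old signature: same precision as the builder
    void CreateInput(const std::wstring& name)
    {
        this->template TypedCreateInput<ElemType>(name); // 'template' keyword is required for a dependent member template
    }
};

This is why the diff writes `this->template TypedCreateInputNode<ElemType>(...)`: inside a class template, calling a member template through a dependent name needs the disambiguating `template` keyword.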
@@ -53,12 +55,25 @@ public: template shared_ptr> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2 + template + shared_ptr> CreateCastNode(const std::wstring& nodeName); + // sparse matrix size is optionally specified // ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0); ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L""); ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L""); - ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L""); - ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L""); + shared_ptr> CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"") + { + return this->template TypedCreateInputNode(inputName, sampleLayout, dynamicAxisName); + } + template + shared_ptr> TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + shared_ptr> CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"") + { + return this->template TypedCreateSparseInputNode(inputName, sampleLayout, dynamicAxisName); + } + template + shared_ptr> TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape, const std::vector& sharing, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 6c2e9e9ae..8d9e25d68 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -61,7 +61,8 @@ #define CNTK_MODEL_VERSION_28 28 // Padding op #define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS #define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30 +#define CNTK_MODEL_VERSION_31 31 // Cast node +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_31 // helper mode for debugging // If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations. @@ -95,6 +96,7 @@ struct /*interface*/ IComputationNode // TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing. virtual const std::wstring OperationName() const = 0; #define OperationNameOf(T) (T::TypeName()) // convenience macro +#define OperationName2Of(T) (T::TypeName()) // convenience macro virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps. 
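Note: CNTK_MODEL_VERSION_31 gates the new two-precision tags (e.g. L"float,half") written by SaveToFileImpl and read back in ReadPersistableParameters above; files older than v31 only carry single-precision tags. A small sketch of the version-gated dispatch idea (the helper name is illustrative):

#include <string>

// Decide whether a node's precision tag uses the dual-precision format,
// given the file's model version and the tag read from it.
bool UsesDualPrecisionTag(size_t modelVersion, const std::wstring& tag)
{
    const size_t kDualPrecisionVersion = 31;     // CNTK_MODEL_VERSION_31
    return modelVersion >= kDualPrecisionVersion
        && tag.find(L',') != std::wstring::npos; // e.g. L"float,half"
}

A true result routes the load to the NewNode2 factory; otherwise the legacy single-type NewNode path is used, which keeps old model files loadable.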
diff --git a/Source/EvalDll/CNTKEval.cpp b/Source/EvalDll/CNTKEval.cpp index 736b0e75f..f4058c161 100644 --- a/Source/EvalDll/CNTKEval.cpp +++ b/Source/EvalDll/CNTKEval.cpp @@ -4,6 +4,9 @@ // // CNTKEval.cpp : Defines the exported functions for the CNTK DLL. // +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#endif #define __STDC_FORMAT_MACROS #include diff --git a/Source/SGDLib/ASGDHelper.cpp b/Source/SGDLib/ASGDHelper.cpp index aeca17930..4ae5238d7 100644 --- a/Source/SGDLib/ASGDHelper.cpp +++ b/Source/SGDLib/ASGDHelper.cpp @@ -646,6 +646,20 @@ ASGDHelper* NewASGDHelper( #endif } +template<> ASGDHelper* NewASGDHelper( + const std::list & learnableNodes, + size_t nodeNumRanks, + bool useAsyncBuffer, + bool isSimulatedModelAveragingSGD, + AdjustLearningRateAtBeginning adjusttype, + double adjustCoef, + size_t adjustPerMinibatches, + int traceLevel, + int syncPerfStats) +{ + RuntimeError("NewASGDHelper - half not supported!"); +} + template ASGDHelper* NewASGDHelper( const std::list & learnableNodes, size_t nodeNumRanks, diff --git a/Source/SGDLib/AccumulatorAggregation.h b/Source/SGDLib/AccumulatorAggregation.h index 4c3ff6095..a24c1d3b9 100644 --- a/Source/SGDLib/AccumulatorAggregation.h +++ b/Source/SGDLib/AccumulatorAggregation.h @@ -18,6 +18,7 @@ #include "Matrix.h" #include "SimpleDistGradAggregator.h" #include "V2SimpleDistGradAggregator.h" +#include "SimpleDistGradAggregatorHelper.h" namespace Microsoft { namespace MSR { namespace CNTK { @@ -46,21 +47,12 @@ void AggregateAccumulatorValuesAndUpdateEvaluation( } // Prepare aggregator. - std::shared_ptr> distGradAgg; - if (Globals::UseV2Aggregator()) - distGradAgg = make_shared>( - mpi, - false /*useAsyncAggregation*/, - net->GetDeviceId(), - 0 /*syncStatsTrace*/, - ::CNTK::MPICommunicator(packThresholdSizeInBytes)); - else - distGradAgg = make_shared>( - mpi, - false /*useAsyncAggregation*/, - net->GetDeviceId(), - 0 /*syncStatsTrace*/, - packThresholdSizeInBytes); + std::shared_ptr> distGradAgg = GetSimpleDistGradAggregator( + mpi, + false /*useAsyncAggregation*/, + net->GetDeviceId(), + 0 /*syncStatsTrace*/, + packThresholdSizeInBytes); // Prepare header. 
const size_t c_evalNodes = 1; diff --git a/Source/SGDLib/DataReaderHelpers.h b/Source/SGDLib/DataReaderHelpers.h index 3fb8c750a..d4ceeb360 100644 --- a/Source/SGDLib/DataReaderHelpers.h +++ b/Source/SGDLib/DataReaderHelpers.h @@ -24,10 +24,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (const auto& iter : inputMatrices) matrices.insert(iter.second.matrix); for (auto& node : net->FeatureNodes()) - if (matrices.find(node->As>()->ValuePtr()) != matrices.end()) + if (matrices.find(node->ValuePtr()) != matrices.end()) node->NotifyFunctionValuesMBSizeModified(); for (auto& node : net->LabelNodes()) - if (matrices.find(node->As>()->ValuePtr()) != matrices.end()) + if (matrices.find(node->ValuePtr()) != matrices.end()) node->NotifyFunctionValuesMBSizeModified(); } diff --git a/Source/SGDLib/MASGD.h b/Source/SGDLib/MASGD.h index 1a828e990..a9f1708ee 100644 --- a/Source/SGDLib/MASGD.h +++ b/Source/SGDLib/MASGD.h @@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } virtual void OnEpochEnd(const std::list& LearnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t samplesSinceLastSync ) { @@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (read2sync) { m_numSyncPerformed++; - ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication); + ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication); m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication); } @@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual bool OnArrivingAtSyncPoint( const std::list& LearnableNodes, /* input/output: */ - std::list>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/ + std::list& smoothedGradients, /* input/output: under some setup, it will reset to zero*/ size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */ ) { @@ -190,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (read2Sync) { m_numSyncPerformed++; - ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication); + ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication); m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication); } return read2Sync; @@ -199,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void ModelAggregationProcessing( size_t samplesSinceLastSync, /* in: */ const std::list& learnableNodes, /* in/out */ - std::list>& smoothedGradient, /* in/out */ + std::list& smoothedGradients, /* in/out */ size_t& totalSamplesProcessed, /* out */ float& secondsOnCommunication /* out */) = 0; @@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ModelAggregationProcessing( size_t samplesSinceLastSync, /* in */ const std::list& learnableNodes, /* in/out */ - std::list>& smoothedGradient, /* in/out */ + std::list& smoothedGradients, /* in/out */ size_t& totalSamplesProcessed, /* out */ float& secondsOnCommunication /* out */) override // NOTE: the variable type is determined by the interface in SGD::TrainOneEpoch diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 9a09a9698..dcee286fb 100644 --- a/Source/SGDLib/SGD.cpp +++ 
b/Source/SGDLib/SGD.cpp @@ -31,6 +31,7 @@ #include "ASGDHelper.h" #include "CNTKLibraryInternals.h" +#include "SimpleDistGradAggregatorHelper.h" #include "SimpleDistGradAggregator.h" #include "V2SimpleDistGradAggregator.h" #include "ProgressTracing.h" @@ -47,8 +48,10 @@ using namespace std; // class SGD // ======================================================================= +template SGD::SGD(const ConfigParameters&); template SGD::SGD(const ConfigParameters&); template SGD::SGD(const ConfigParameters&); +template SGD::SGD(const ScriptableObjects::IConfigRecord&); template SGD::SGD(const ScriptableObjects::IConfigRecord&); template SGD::SGD(const ScriptableObjects::IConfigRecord&); @@ -223,6 +226,11 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, } } + if (criterionNodes.front()->template Is>()) + { + InvalidArgument("TrainOrAdaptModel: using Float16 for loss function may cause overflow, please cast to float."); + } + // This code is only relevant for the new (V2) readers. It exists because of // a shortcoming in DecimateMinibatchInPlace, which does not yet work when inputs // in the same minibatch have different layouts, which is something only V2 readers can @@ -333,7 +341,7 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, // initializing weights and gradient holder // only one criterion so far TODO: support multiple ones? auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]); - list> smoothedGradients; + list smoothedGradients; vector smoothedCounts; // currently used by FSAdaGradUpdate() size_t numParameters = 0; @@ -344,9 +352,30 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, // Note: We don't actually need the smoothedGradients if !IsParameterUpdateRequired(). // However, this is hard to fix since lots of code assumes smoothedGradients to be in the same order as learnableNodes. // V2 API fixes this. - smoothedGradients.push_back(Matrix(node->Value().GetNumRows(), - node->Value().GetNumCols(), - net->GetDeviceId())); + MatrixBasePtr smoothedGradientPtr; + size_t numRows = node->Value().GetNumRows(); + size_t numCols = node->Value().GetNumCols(); + if (std::is_same()) + { + // For half parameters, we use float smoothed gradients + // Allocate 3 times the size for casting parameter and gradients to float + const size_t c_smoothed_gradients_factor = 3; + shared_ptr> compoundMatrixPtr = std::make_shared>(numRows, + numCols * c_smoothed_gradients_factor, + net->GetDeviceId()); + // Initialize float parameters + auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols); + parameterMatrix.CastAssignValuesOf(node->Value()); + + smoothedGradientPtr = compoundMatrixPtr; + } + else + { + smoothedGradientPtr = std::make_shared>(numRows, + numCols, + net->GetDeviceId()); + } + smoothedGradients.push_back(smoothedGradientPtr); smoothedCounts.push_back(0); if (node->IsParameterUpdateRequired()) { @@ -987,7 +1016,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer? 
const std::list& learnableNodes, - std::list>& smoothedGradients, vector& smoothedCounts, + std::list& smoothedGradients, vector& smoothedCounts, /*out*/ EpochCriterion& epochCriterion, /*out*/ std::vector& epochEvalErrors, const std::string& prefixMsg, @@ -1389,7 +1418,25 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, if (node->IsParameterUpdateRequired()) { #ifdef _DEBUG - if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): ")) + bool hasNan = false; + if (std::is_same()) + { + // Get metrix from compound metrix + auto compoundMatrixPtr = dynamic_pointer_cast> (*smoothedGradientIter); + if (compoundMatrixPtr) + { + size_t numCols = dynamic_pointer_cast>(node)->Value().GetNumCols(); + + auto smoothedGradient = compoundMatrixPtr->ColumnSlice(0, numCols); + hasNan = smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): "); + } + } + else + { + auto smoothedGradient = dynamic_pointer_cast> (*smoothedGradientIter); + hasNan = smoothedGradient && smoothedGradient->HasNan("TrainOneEpoch/UpdateWeights(): "); + } + if (hasNan) LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str()); #endif double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier(); @@ -1811,7 +1858,7 @@ double SGD::SearchForBestLearnRate(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, const bool learnRateInitialized, const double largestPrevLearnRatePerSample) { @@ -1985,7 +2032,7 @@ size_t SGD::AdaptiveMinibatchSizing(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, const double learningRateAdjustmentFactor) { size_t minMinibatchSize = initialMinibatchSize; @@ -2086,7 +2133,7 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, std::vector smoothedCounts, + std::list& smoothedGradients, std::vector smoothedCounts, const size_t minMinibatchSize, const size_t maxMinibatchSize) { // may happen for automatically reduced learning rates @@ -2190,7 +2237,7 @@ void SGD::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, /*out*/ EpochCriterion& epochCriterion, /*out*/ std::vector& epochEvalErrors, std::string prefixMsg, @@ -2264,6 +2311,24 @@ void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net } } +template +std::shared_ptr> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace) +{ + if (Globals::UseV2Aggregator()) + { + auto communicator = ::CNTK::QuantizedMPICommunicator(zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, nBits); + return std::make_shared>(communicator, useAsyncAggregation, traceLevel, syncStatsTrace); + } + else + return std::make_shared>(mpi, nBits, zeroThresholdFor1Bit, true 
/*useQuantizationForSelfStripe*/, useAsyncAggregation, traceLevel, syncStatsTrace); +} + +template <> +std::shared_ptr> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace) +{ + RuntimeError("SGD - half not supported for quantization!"); +} + template void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel) { @@ -2274,13 +2339,7 @@ void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d if (traceLevel > 0) fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits); #ifdef CNTK_PARALLEL_TRAINING_SUPPORT - if (Globals::UseV2Aggregator()) - { - auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits); - m_distGradAgg = std::make_shared>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); - } - else - m_distGradAgg = std::make_shared>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); + m_distGradAgg = _GetAllReduceDistGradAggregator(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); #else RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!"); #endif // !CNTK_PARALLEL_TRAINING_SUPPORT @@ -2289,15 +2348,38 @@ void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d { if (traceLevel > 0) fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits); - if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines. 
- m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator(m_packThresholdSizeInBytes, m_useFP16AllReduce)); - else - m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes); + m_distGradAgg = GetSimpleDistGradAggregator(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes, m_useFP16AllReduce); } m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); }); } +template +shared_ptr> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize) +{ + assert(!Globals::UseV2Aggregator()); + return make_shared>(mpi, traceLevel, devID, useNesterovBlockMomentum, resetSGDMomentum, blockLearningRate, blockMomentumAsTimeConstant, modelAggregationBlockSize); +} + +template <> +shared_ptr> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize) +{ + assert(!Globals::UseV2Aggregator()); + RuntimeError("SGD - half not supported when useV2Aggregator is false!"); +} + +template +shared_ptr> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID) +{ + return make_shared>(mpi, traceLevel, devID); +} + +template <> +shared_ptr> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID) +{ + RuntimeError("SGD - half not supported for modelAveragingSGD"); +} + template void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID) { @@ -2307,7 +2389,7 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de } if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD) { - m_pMASGDHelper = make_shared>(m_mpi, traceLevel, devID); + m_pMASGDHelper = _GetBasicModelAveragingSGD(m_mpi, traceLevel, devID); } else if (GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) { @@ -2329,7 +2411,7 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de m_modelAggregationBlockSize); } else - m_pMASGDHelper = make_shared>(m_mpi, traceLevel, devID, + m_pMASGDHelper = _GetBlockMomentumSGD(m_mpi, traceLevel, devID, m_useNesterovBlockMomentum, m_resetSGDMomentum, m_blockLearningRate, m_blockMomentumAsTimeConstant, m_modelAggregationBlockSize); @@ -2341,6 +2423,47 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de // UpdateWeights() - actual weight update, implementing various update rules template void SGD::UpdateWeights(Matrix& functionValues, Matrix& gradientValues, + MatrixBasePtr& smoothedGradientValues, double& smoothedCount, + const double learnRatePerSample, const double momentumPerSample, + size_t actualMBSize, + const double L2RegWeight, const double L1RegWeight, + const bool needAveMultiplier, + const bool useNesterovMomentum) const +{ + if (std::is_same()) + { + // Get metrix from compound metrix + auto compoundMatrixPtr = dynamic_pointer_cast> (smoothedGradientValues); + size_t numCols = functionValues.GetNumCols(); + + auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols); + auto tempGradientMatrix = 
+
+template <class ElemType>
+template <typename GradType>
+void SGD<ElemType>::TypedUpdateWeights(Matrix<GradType>& functionValues, Matrix<GradType>& gradientValues,
+                                       Matrix<GradType>& smoothedGradientValues, double& smoothedCount,
                                   const double learnRatePerSample, const double momentumPerSample,
                                   size_t actualMBSize,
@@ -2363,7 +2486,7 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<Elem
     assert(actualMBSize > 0);
 
     // clipping gradients to prevent outliers
-    ClipGradient(gradientValues, actualMBSize);
+    ClipGradient<GradType>(gradientValues, actualMBSize);
 
     GradientsUpdateType adpType = GradUpdateType();
     double noiseStd = GradientUpdateNoiseStd();
@@ -2453,8 +2576,9 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<Elem
     }
 }
 
 template <class ElemType>
-void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+template <typename GradType>
+void SGD<ElemType>::ClipGradient(Matrix<GradType>& gradient, const size_t actualMBSize) const
 {
     if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
     {
@@ -2474,10 +2598,30 @@ void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actual
     }
 }
 
+template <class ElemType>
+static void SaveSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
+{
+    auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
+    if (!smoothedGradientPtr)
+        RuntimeError("Failed to cast, type mismatch");
+    const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
+    fstream << smoothedGradientValues;
+}
+
+template <class ElemType>
+static void LoadSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
+{
+    auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
+    if (!smoothedGradientPtr)
+        RuntimeError("Failed to cast, type mismatch");
+    Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
+    fstream >> smoothedGradientValues;
+}
+
 template <class ElemType>
 void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
                                        const double learnRatePerSample,
-                                       const std::list<Matrix<ElemType>>& smoothedGradients,
+                                       const std::list<MatrixBasePtr>& smoothedGradients,
                                        const std::vector<double>& smoothedCounts,
                                        const double prevCriterion,
                                        const size_t minibatchSize)
@@ -2510,10 +2654,12 @@ void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSam
 
         fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-        for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+        for (auto smoothedGradient : smoothedGradients)
         {
-            const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
-            fstream << smoothedGradientValues;
+            if (std::is_same<ElemType, half>())
+                SaveSmoothedGradient<float>(fstream, smoothedGradient);
+            else
+                SaveSmoothedGradient<ElemType>(fstream, smoothedGradient);
         }
 
         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
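Because an fp16 model keeps its smoothed gradients in float, a checkpoint written by a half model serializes float matrices; that on-disk change is what CNTK_CHECKPOINT_VERSION_3 marks. A standalone sketch of the element-type choice (SmoothedGradientType is an illustrative alias, not part of the sources):

    #include <type_traits>

    struct half {}; // stand-in for CNTK's half

    // For half parameters the optimizer state is kept and serialized as float;
    // float and double models keep their native precision.
    template <class ElemType>
    using SmoothedGradientType =
        typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type;

    static_assert(std::is_same<SmoothedGradientType<half>, float>::value, "half -> float state");
    static_assert(std::is_same<SmoothedGradientType<float>, float>::value, "native otherwise");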
@@ -2553,7 +2699,7 @@ template <class ElemType>
 bool SGD<ElemType>::TryLoadCheckPointInfo(const size_t epochNumber,
                                           /*out*/ size_t& totalSamplesSeen,
                                           /*out*/ double& learnRatePerSample,
-                                          std::list<Matrix<ElemType>>& smoothedGradients,
+                                          std::list<MatrixBasePtr>& smoothedGradients,
                                           std::vector<double>& smoothedCounts,
                                           /*out*/ double& prevCriterion,
                                           /*out*/ size_t& minibatchSize)
@@ -2582,7 +2728,7 @@ template <class ElemType>
 void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
                                        /*out*/ size_t& totalSamplesSeen,
                                        /*out*/ double& learnRatePerSample,
-                                       std::list<Matrix<ElemType>>& smoothedGradients,
+                                       std::list<MatrixBasePtr>& smoothedGradients,
                                        std::vector<double>& smoothedCounts,
                                        /*out*/ double& prevCriterion,
                                        /*out*/ size_t& minibatchSize)
@@ -2600,6 +2746,9 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
         fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
     }
 
+    if (ckpVersion > CURRENT_CNTK_CHECKPOINT_VERSION)
+        RuntimeError("The checkpoint file has a newer format version (%d) than this CNTK version can handle (%d).", (int)ckpVersion, (int)CURRENT_CNTK_CHECKPOINT_VERSION);
+
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
 
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
@@ -2618,10 +2767,12 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
 
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-    for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+    for (auto smoothedGradient : smoothedGradients)
     {
-        Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
-        fstream >> smoothedGradientValues;
+        if (std::is_same<ElemType, half>())
+            LoadSmoothedGradient<float>(fstream, smoothedGradient);
+        else
+            LoadSmoothedGradient<ElemType>(fstream, smoothedGradient);
     }
 
     fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2824,6 +2975,7 @@ void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNet
         nodeIter->SetEvalTimeStampOutdatedWrtAll();
 }
 
+template class SGD<half>;
 template class SGD<float>;
 template class SGD<double>;
 
@@ -2881,7 +3033,7 @@ static AdjustLearningRateAtBeginning AdjustLearningRateAtBeginningType(const wst
     else
         InvalidArgument("AdjustLearningRateatBeginningType: Invalid Type. Valid values are (None | Linearly | Staircase)");
 }
 #endif
-    
+
 template <class ConfigRecordType>
 SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
 {
@@ -3306,12 +3458,14 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
 static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
 {
     wstring precision = configp->Get(L"precision");
-    if (precision == L"float")
+    if (precision == L"float16")
+        return sizeof(half);
+    else if (precision == L"float")
         return sizeof(float);
     else if (precision == L"double")
         return sizeof(double);
     else
-        RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
+        RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
 }
 
 SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 623f1f114..b95419400 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -25,7 +25,8 @@ using namespace std; // ugh! TODO: get rid of this from .h files!!!
 
 #define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
 #define CNTK_CHECKPOINT_VERSION_2 2
-#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_2
+#define CNTK_CHECKPOINT_VERSION_3 3 // float smoothed gradients for float16/half parameters
+#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_3
 
 namespace CNTK { namespace Internal {
 // Forward declarations.
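The GetSizeOfPrecision() change above simply maps the new precision value "float16" onto sizeof(half), i.e. two bytes. A self-contained sketch of the same mapping (the half struct is a stand-in for CNTK's half type):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    struct half { unsigned short bits; }; // 2-byte stand-in for CNTK's half

    size_t SizeOfPrecision(const std::wstring& precision)
    {
        if (precision == L"float16") return sizeof(half);   // 2
        if (precision == L"float")   return sizeof(float);  // 4
        if (precision == L"double")  return sizeof(double); // 8
        throw std::invalid_argument("precision must be 'float16', 'float' or 'double'");
    }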
@@ -442,7 +443,7 @@ protected:
                            const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                            StreamMinibatchInputs* inputMatrices,
                            const std::list<ComputationNodeBasePtr>& learnableNodes,
-                           std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                           std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                            const bool learnRateInitialized,
                            const double largestPrevLearnRatePerSample);
@@ -458,7 +459,7 @@ protected:
                                 const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                 StreamMinibatchInputs* inputMatrices,
                                 const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                 /*out*/ EpochCriterion& epochCriterion,
                                 /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                                 std::string prefixMsg,
@@ -478,7 +479,7 @@ protected:
                                        const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                        StreamMinibatchInputs* inputMatrices,
                                        const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                       std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                       std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                        const double learningRateAdjustmentFactor);
 
     // uses a small percentage of training data of minibatch to
@@ -496,7 +497,7 @@ protected:
                                              const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                              StreamMinibatchInputs* inputMatrices,
                                              const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                             std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                             std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                              const size_t minMinibatchSize, const size_t maxMinibatchSize);
 
     // Attempts to compute the error signal for the whole utterance, which will
@@ -523,7 +524,7 @@ protected:
                             const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                             StreamMinibatchInputs* inputMatrices,
                             const std::list<ComputationNodeBasePtr>& learnableNodes,
-                            std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double>& smoothedCounts,
+                            std::list<MatrixBasePtr>& smoothedGradients, std::vector<double>& smoothedCounts,
                             /*out*/ EpochCriterion& epochCriterion,
                             /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                             const std::string& prefixMsg = "",
@@ -534,26 +535,37 @@ protected:
     void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
     void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
 
-public:
+private:
     // UpdateWeights() - actual weight update, implementing various update rules
     void UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
-                       Matrix<ElemType>& smoothedGradient, double& smoothedCount,
+                       MatrixBasePtr& smoothedGradient, double& smoothedCount,
+                       const double learnRatePerSample, const double momentumPerSample,
+                       size_t actualMBSize,
+                       const double L2RegWeight, const double L1RegWeight,
+                       const bool needAveMultiplier,
+                       const bool useNesterovMomentum) const;
+
+    template <typename GradType>
+    void TypedUpdateWeights(Matrix<GradType>& functionValues, Matrix<GradType>& gradientValues,
+                            Matrix<GradType>& smoothedGradient, double& smoothedCount,
                        const double learnRatePerSample, const double momentumPerSample,
                        size_t actualMBSize,
                        const double L2RegWeight, const double L1RegWeight,
                        const bool needAveMultiplier,
                        const bool useNesterovMomentum) const;
 
+public:
     // return -1 if nothing exists
     int DetermineStartEpoch(const bool makeMode);
 
     wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false) const;
 
 protected:
-    void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
+    template <typename GradType>
+    void ClipGradient(Matrix<GradType>& gradient, const size_t actualMBSize) const;
 
     void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
                             const double learnRatePerSample,
-                            const std::list<Matrix<ElemType>>& smoothedGradients,
+                            const std::list<MatrixBasePtr>& smoothedGradients,
                             const std::vector<double>& smoothedCounts,
                             const double prevCriterion,
                             const size_t minibatchSize);
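ClipGradient and TypedUpdateWeights are now member templates of a class template, so their out-of-class definitions in SGD.cpp need two template parameter lists, with the enclosing class's list first. A generic sketch of that syntax (not the CNTK class):

    template <class ElemType>
    struct SGDLike
    {
        template <typename GradType>
        void Clip(GradType& gradient) const;
    };

    template <class ElemType>    // enclosing class template parameters first
    template <typename GradType> // then the member template's own parameters
    void SGDLike<ElemType>::Clip(GradType& gradient) const
    {
        // clipping logic would go here
    }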
@@ -561,14 +573,14 @@ protected:
     bool TryLoadCheckPointInfo(const size_t epochNumber,
                                /*out*/ size_t& totalSamplesSeen,
                                /*out*/ double& learnRatePerSample,
-                               std::list<Matrix<ElemType>>& smoothedGradients,
+                               std::list<MatrixBasePtr>& smoothedGradients,
                                std::vector<double>& smoothedCounts,
                                /*out*/ double& prevCriterion,
                                /*out*/ size_t& minibatchSize);
     void LoadCheckPointInfo(const size_t epochNumber,
                             /*out*/ size_t& totalSamplesSeen,
                             /*out*/ double& learnRatePerSample,
-                            std::list<Matrix<ElemType>>& smoothedGradients,
+                            std::list<MatrixBasePtr>& smoothedGradients,
                             std::vector<double>& smoothedCounts,
                             /*out*/ double& prevCriterion,
                             /*out*/ size_t& minibatchSize);
diff --git a/Source/SGDLib/SGDLib.vcxproj b/Source/SGDLib/SGDLib.vcxproj
index e55d6a427..cacc70a99 100644
--- a/Source/SGDLib/SGDLib.vcxproj
+++ b/Source/SGDLib/SGDLib.vcxproj
@@ -137,6 +137,7 @@
+    <ClInclude Include="SimpleDistGradAggregatorHelper.h" />
@@ -149,6 +150,7 @@
+    <ClCompile Include="SimpleDistGradAggregatorHelper.cpp" />
diff --git a/Source/SGDLib/SGDLib.vcxproj.filters b/Source/SGDLib/SGDLib.vcxproj.filters
index 16d52d17c..0133ff25d 100644
--- a/Source/SGDLib/SGDLib.vcxproj.filters
+++ b/Source/SGDLib/SGDLib.vcxproj.filters
@@ -16,6 +16,9 @@
       <Filter>Parallelization</Filter>
     </ClCompile>
+    <ClCompile Include="SimpleDistGradAggregatorHelper.cpp">
+      <Filter>Parallelization</Filter>
+    </ClCompile>
@@ -144,6 +147,9 @@
       <Filter>Parallelization</Filter>
     </ClInclude>
+    <ClInclude Include="SimpleDistGradAggregatorHelper.h">
+      <Filter>Parallelization</Filter>
+    </ClInclude>
diff --git a/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp b/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp
new file mode 100644
index 000000000..4e9b84f7b
--- /dev/null
+++ b/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#pragma warning(disable : 4267) // conversion from size_t to int or other types
+
+#include "Basics.h"
+#include "MPIWrapper.h"
+#include "Matrix.h"
+#include "SimpleDistGradAggregatorHelper.h"
+#include "DistGradHeader.h"
+#include "IDistGradAggregator.h"
+#include "SimpleDistGradAggregator.h"
+#include "V2SimpleDistGradAggregator.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template <class ElemType>
+std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce)
+{
+    if (Globals::UseV2Aggregator())
+        return std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            ::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
+    else
+        return std::make_shared<SimpleDistGradAggregator<ElemType>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            packThresholdSizeInBytes);
+}
+
+template <>
+std::shared_ptr<IDistGradAggregator<half>> GetSimpleDistGradAggregator<half>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce)
+{
+    if (Globals::UseV2Aggregator())
+        return std::make_shared<V2SimpleDistGradAggregator<half>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            ::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
+    else
+        RuntimeError("SGD - half not supported when useV2Aggregator is false!");
+}
+
+template std::shared_ptr<IDistGradAggregator<float>> GetSimpleDistGradAggregator<float>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce);
+
+template std::shared_ptr<IDistGradAggregator<double>> GetSimpleDistGradAggregator<double>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce);
+
+}}}
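A hypothetical call site for the helper defined above; it assumes an MPIWrapperPtr named mpi is in scope, and lets the trailing parameters fall back to the defaults declared in SimpleDistGradAggregatorHelper.h:

    auto aggregator = GetSimpleDistGradAggregator<float>(
        mpi,
        /*useAsyncAggregation=*/false,
        /*deviceId=*/0,
        /*syncStatsTrace=*/0); // packThresholdSizeInBytes and useFP16AllReduce use defaults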
diff --git a/Source/SGDLib/SimpleDistGradAggregatorHelper.h b/Source/SGDLib/SimpleDistGradAggregatorHelper.h
new file mode 100644
index 000000000..21302980a
--- /dev/null
+++ b/Source/SGDLib/SimpleDistGradAggregatorHelper.h
@@ -0,0 +1,24 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#pragma once
+
+#include "Constants.h"
+#include "IDistGradAggregator.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template <class ElemType>
+std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES,
+    bool useFP16AllReduce = false);
+
+}}}
diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h
index 2941c26d4..eadd2f2ee 100644
--- a/Source/SGDLib/SimpleEvaluator.h
+++ b/Source/SGDLib/SimpleEvaluator.h
@@ -5,8 +5,6 @@
 
 #pragma once
 
-#include "V2SimpleDistGradAggregator.h"
-
 #include "AccumulatorAggregation.h"
 #include "Basics.h"
 #include "DataReader.h"
@@ -18,7 +16,7 @@
 #include "ProgressTracing.h"
 #include "DistGradHeader.h"
 #include "IDistGradAggregator.h"
-#include "SimpleDistGradAggregator.h"
+#include "SimpleDistGradAggregatorHelper.h"
 #include "Criterion.h"
 #include "Globals.h"
 
@@ -167,10 +165,7 @@ public:
                 DistGradHeader::Destroy(ptr);
             });
 
-            if (Globals::UseV2Aggregator())
-                m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
-            else
-                m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
+            m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
         }
 
         m_gradHeader->numEvalNode = evalNodes.size();
diff --git a/Source/SGDLib/V2SimpleDistGradAggregator.h b/Source/SGDLib/V2SimpleDistGradAggregator.h
index 1ca3569db..586626e46 100644
--- a/Source/SGDLib/V2SimpleDistGradAggregator.h
+++ b/Source/SGDLib/V2SimpleDistGradAggregator.h
@@ -109,7 +109,7 @@ public:
 
         // Synchronize the Quantization compute stream with the completion of
         // compute of the gradient matrices on the main compute stream
-        mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
+        mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
         delete mainStreamSyncEvent;
 
         AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
@@ -185,7 +185,7 @@ private:
         if (m_useAsyncAggregation)
        {
             std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
-            mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
+            mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
         }
     }
diff --git a/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp b/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
index ca6b76f58..9e07b7495 100644
--- a/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
+++ b/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
@@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
     // This is a watch guard to make sure that any change in the model version will be detected.
     // If you change the CNTK model version, please do not silently adapt this test.
     // Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
-    BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 30, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
+    BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 31, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
 }
 
 BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)
diff --git a/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj b/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
index acdb12c04..26397e43a 100644
--- a/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
+++ b/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
@@ -61,7 +61,7 @@
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;Cntk.SGD-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
       <Profile>true</Profile>
       <AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(BOOST_LIB_PATH);$(NvmlLibPath)</AdditionalLibraryDirectories>
       <DelayLoadDLLs>Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll