Add v2 Learners (standalone)

Parent: 271476466a
Commit: 1b0548fdde

Makefile | 2 ++

@@ -375,6 +375,8 @@ CNTKLIBRARY_SRC =\
 	$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
 	$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
 	$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
+	$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \

 CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
 CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)

@@ -285,6 +285,7 @@ namespace CNTK
     class NDArrayView final : public std::enable_shared_from_this<NDArrayView>
     {
         friend class CompositeFunction;
+        friend class LearnerBase;

         template <typename T, typename ...CtorArgTypes>
         friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
@@ -1396,4 +1397,342 @@ namespace CNTK
     /// of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output.
     ///
     CNTK_API FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name = L"");
+
+    ///
+    /// A serializable value represents one of:
+    /// a) Boolean
+    /// b) Signed long integer
+    /// c) Single and double precision floating point values
+    /// d) NDShape
+    /// e) vector<DictionaryValue>
+    ///
+    /// TODO: we need to have native support for DictionaryValue<vector> and DictionaryValue<NDArrayView>.
+    class CNTK_API DictionaryValue final
+    {
+    public:
+        enum class Type : unsigned int
+        {
+            None,
+            Bool,
+            SizeT,
+            Float,
+            Double,
+            NDShape,
+            Vector
+        };
+
+        static const char* TypeName(Type type)
+        {
+            switch (type)
+            {
+            case Type::None:
+                return "None";
+            case Type::Bool:
+                return "Bool";
+            case Type::SizeT:
+                return "SizeT";
+            case Type::Float:
+                return "Float";
+            case Type::Double:
+                return "Double";
+            case Type::NDShape:
+                return "NDShape";
+            case Type::Vector:
+                return "Vector";
+            default:
+                LogicError("Unknown DictionaryValue::Type");
+            }
+        }
+
+    public:
+        DictionaryValue() : m_valueType(Type::None)
+        {
+        }
+
+        DictionaryValue(bool value) : m_valueType(GetValueType<bool>())
+        {
+            m_data.m_boolean = value;
+        }
+
+        DictionaryValue(size_t value) : m_valueType(GetValueType<size_t>())
+        {
+            m_data.m_sizeT = value;
+        }
+
+        DictionaryValue(float value) : m_valueType(GetValueType<float>())
+        {
+            m_data.m_float = value;
+        }
+
+        DictionaryValue(double value) : m_valueType(GetValueType<double>())
+        {
+            m_data.m_double = value;
+        }
+
+        template <typename T>
+        DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
+        {
+            static_assert(std::is_same<T, NDShape>::value ||
+                          std::is_same<T, std::vector<DictionaryValue>>::value,
+                          "Unsupported ValueType");
+
+            AllocateDataPtr(value);
+        }
+
+        DictionaryValue(const DictionaryValue& other) : m_valueType(Type::Bool)
+        {
+            // The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
+            // the underlying uninitialized value as a ptr and free it.
+            *this = other;
+        }
+
+        DictionaryValue& operator=(const DictionaryValue& other)
+        {
+            if (this != &other)
+            {
+                FreeDataPtr();
+
+                m_valueType = other.m_valueType;
+                m_data = other.m_data;
+
+                if (other.m_valueType == Type::NDShape)
+                    AllocateDataPtr(other.GetValue<NDShape>());
+                else if (other.m_valueType == Type::Vector)
+                    AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
+            }
+
+            return *this;
+        }
+
+        ~DictionaryValue()
+        {
+            FreeDataPtr();
+        }
+
+        template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
+        const T& GetValue() const
+        {
+            VerifyType<T>();
+            return m_data.m_boolean;
+        }
+
+        template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
+        const T& GetValue() const
+        {
+            VerifyType<T>();
+            return m_data.m_sizeT;
+        }
+
+        template <typename T, typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
+        const T& GetValue() const
+        {
+            VerifyType<T>();
+            return m_data.m_float;
+        }
+
+        template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
+        const T& GetValue() const
+        {
+            VerifyType<T>();
+            return m_data.m_double;
+        }
+
+        template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value>::type* = nullptr>
+        const T& GetValue() const
+        {
+            VerifyType<T>();
+            return *(reinterpret_cast<T*>(m_data.m_ptr));
+        }
+
+        bool HasValue() const
+        {
+            return m_valueType != Type::None;
+        }
+
+        Type ValueType() const
+        {
+            return m_valueType;
+        }
+
+        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
+        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
+
+    private:
+        template <typename T>
+        static Type GetValueType()
+        {
+            static_assert(std::is_same<T, bool>::value ||
+                          std::is_same<T, size_t>::value ||
+                          std::is_same<T, float>::value ||
+                          std::is_same<T, double>::value ||
+                          std::is_same<T, NDShape>::value ||
+                          std::is_same<T, std::vector<DictionaryValue>>::value,
+                          "Unsupported ValueType");
+
+            if (std::is_same<T, bool>::value) return Type::Bool;
+            if (std::is_same<T, size_t>::value) return Type::SizeT;
+            if (std::is_same<T, float>::value) return Type::Float;
+            if (std::is_same<T, double>::value) return Type::Double;
+            if (std::is_same<T, NDShape>::value) return Type::NDShape;
+            if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
+        }
+
+        template <typename T>
+        void VerifyType() const
+        {
+            if (GetValueType<T>() != m_valueType)
+                RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
+        }
+
+        template <typename T>
+        void AllocateDataPtr(const T& value);
+
+        template <typename T>
+        void FreePtrAsType();
+
+        void FreeDataPtr();
+
+        Type m_valueType;
+
+        union ValueData
+        {
+            bool m_boolean;
+            size_t m_sizeT;
+            float m_float;
+            double m_double;
+            void* m_ptr;
+        } m_data;
+
+        const size_t version = 1;
+    };
+
+    ///
+    /// A type denoting a dictionary (keyed by Unicode strings) of serializable values (dynamically typed).
+    ///
+    class CNTK_API Dictionary final
+    {
+    public:
+        Dictionary();
+        ~Dictionary();
+
+        // Disallow copy construction and assignment
+        Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;
+
+        Dictionary(Dictionary&& other);
+        Dictionary& operator=(Dictionary&& other);
+
+        DictionaryValue& operator[](const std::wstring& key)
+        {
+            return operator[](key.c_str());
+        }
+
+        DictionaryValue& operator[](const wchar_t* key);
+
+        DictionaryValue operator[](const std::wstring& key) const
+        {
+            return operator[](key.c_str());
+        }
+
+        DictionaryValue operator[](const wchar_t* key) const;
+
+        bool Contains(const std::wstring& key) const
+        {
+            return Contains(key.c_str());
+        }
+
+        bool Contains(const wchar_t* key) const;
+
+        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
+        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
+
+    private:
+        std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
+        const size_t version = 1;
+    };
+
+    ///
+    /// Abstraction for learning a subset of parameters of a learnable function using first order gradient values.
+    /// E.g., momentum, AdaGrad, RMSProp, etc. are different types of learners with their own algorithms for
+    /// learning parameter values using first order gradients.
+    ///
+    class Learner : public std::enable_shared_from_this<Learner>
+    {
+    public:
+        //
+        // Method to update the parameters associated with this learner. By returning false, this method indicates that
+        // learning has stopped for all of the parameters associated with this learner.
+        //
+        CNTK_API virtual bool Update(const std::unordered_map<Variable, ValuePtr>& parameterValues,
+                                     const std::unordered_map<Variable, const ValuePtr>& gradientValues,
+                                     size_t trainingSampleCount) = 0;
+
+        ///
+        /// Returns the set of parameters associated with this learner.
+        ///
+        const std::unordered_set<Variable>& Parameters() const { return m_parameters; }
+
+        // TODO: move the following two methods into ISerializable interface, make
+        // Learner (and all other entities that need checkpointing capability) implement it.
+        ///
+        /// Optionally overridable method to checkpoint the learner's state.
+        ///
+        CNTK_API virtual Dictionary GetCheckpointState() const = 0;
+
+        ///
+        /// Optionally overridable method to restore the learner's state from a previous checkpoint.
+        ///
+        CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) = 0;
+
+        virtual ~Learner()
+        {
+        }
+
+    protected:
+        Learner(const std::unordered_set<Variable>& parameters)
+            : m_parameters(parameters)
+        {
+        }
+
+        std::unordered_set<Variable> m_parameters;
+    };
+
+    ///
+    /// Create an instance of the CNTK built-in SGD learner.
+    ///
+    /// TODO: add additional SGD parameters here (a collection of learning rate values)
+    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Variable>& parameters,
+                                   const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    ///
+    /// Create an instance of the CNTK built-in Momentum SGD learner.
+    ///
+    /// TODO: add additional Momentum parameters here (a collection of momentum rate values)
+    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Variable>& parameters,
+                                           const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    ///
+    /// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
+    ///
+    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Variable>& parameters,
+                                        const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    ///
+    /// Create an instance of the CNTK built-in AdaGrad learner.
+    ///
+    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Variable>& parameters, bool needAveMultiplier = true,
+                                       const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    ///
+    /// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
+    ///
+    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Variable>& parameters,
+                                         const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    ///
+    /// Create an instance of the CNTK built-in RMSProp learner.
+    ///
+    CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Variable>& parameters,
+                                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier = true,
+                                       const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
 }

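For illustration, a minimal usage sketch of the DictionaryValue/Dictionary API declared above. This is not part of the commit; it assumes the surrounding v2 library headers are available, and the key names are hypothetical:

#include "CNTKLibrary.h"

void DictionaryUsageSketch()
{
    CNTK::Dictionary config;
    config[L"epochs"] = CNTK::DictionaryValue(size_t(10)); // stored as Type::SizeT
    config[L"learningRate"] = CNTK::DictionaryValue(0.01); // stored as Type::Double

    if (config.Contains(L"epochs"))
    {
        // GetValue<T>() verifies the runtime type tag and raises a RuntimeError on mismatch.
        size_t epochs = config[L"epochs"].GetValue<size_t>();
        (void)epochs;
    }
}
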
@@ -47,6 +47,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template <typename ElementType>
 class ComputationNode;
+
+class File;
 }}}

 // TODO: The following should be reconciled with the equivalent code in the CNTK implementation

@@ -158,4 +160,7 @@ namespace CNTK
     class Function;
    typedef std::shared_ptr<Function> FunctionPtr;
+
+    class Learner;
+    typedef std::shared_ptr<Learner> LearnerPtr;
 }

@@ -128,6 +128,7 @@
     <ClInclude Include="API\CNTKLibrary.h" />
     <ClInclude Include="API\CNTKLibraryInternals.h" />
     <ClInclude Include="Function.h" />
+    <ClInclude Include="Learner.h" />
     <ClInclude Include="Utils.h" />
     <ClInclude Include="stdafx.h" />
     <ClInclude Include="targetver.h" />

@@ -140,6 +141,7 @@
     </PrecompiledHeader>
     </ClCompile>
     <ClCompile Include="Function.cpp" />
+    <ClCompile Include="Learner.cpp" />
     <ClCompile Include="NDArrayView.cpp" />
     <ClCompile Include="NDMask.cpp" />
     <ClCompile Include="stdafx.cpp">

@@ -10,6 +10,7 @@
     <ClCompile Include="Variable.cpp" />
     <ClCompile Include="Utils.cpp" />
     <ClCompile Include="NDMask.cpp" />
+    <ClCompile Include="Learner.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="stdafx.h" />

@@ -22,6 +23,7 @@
       <Filter>API</Filter>
     </ClInclude>
     <ClInclude Include="Function.h" />
+    <ClInclude Include="Learner.h" />
   </ItemGroup>
   <ItemGroup>
     <Filter Include="API">

@@ -0,0 +1,464 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#include "Learner.h"
+#include "TensorView.h"
+#include "Utils.h"
+
+#define UPDATE_FUNCTION \
+    switch (smoothedGradientValue->Data()->GetDataType()) \
+    { \
+    case DataType::Float: \
+        Update<float>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
+        break; \
+    case DataType::Double: \
+        Update<double>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
+        break; \
+    default: \
+        NOT_IMPLEMENTED; \
+    }
+
+using namespace Microsoft::MSR::CNTK;
+using namespace std;
+
+namespace CNTK
+{
+    template <typename ElementType>
+    /*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr arrayView)
+    {
+        return arrayView->GetMatrix<ElementType>();
+    }
+
+    template <typename ElementType>
+    /*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(NDArrayViewPtr arrayView)
+    {
+        return arrayView->GetWritableMatrix<ElementType>();
+    }
+
+    template <typename ElementType>
+    /*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr arrayView)
+    {
+        return arrayView->GetTensorView<ElementType>();
+    }
+
+    /*static*/ bool LearnerBase::HasNan(const ValuePtr& value, const char* name)
+    {
+        const auto& data = value->Data();
+        switch (data->GetDataType())
+        {
+        case DataType::Float:
+            return data->GetMatrix<float>()->HasNan(name);
+        case DataType::Double:
+            return data->GetMatrix<double>()->HasNan(name);
+        default:
+            LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
+        }
+    }
+
+    /*static*/ void LearnerBase::Print(const ValuePtr& value, const char* msg)
+    {
+        const auto& data = value->Data();
+        switch (data->GetDataType())
+        {
+        case DataType::Float:
+            data->GetMatrix<float>()->Print(msg);
+            break;
+        case DataType::Double:
+            data->GetMatrix<double>()->Print(msg);
+            break;
+        default:
+            LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
+        }
+    }
+
+    // Clipping gradients to prevent outliers.
+    template <typename ElementType>
+    void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
+    {
+        if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
+        {
+            double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
+            if (m_additionalOptions.gradientClippingWithTruncation)
+                gradient.InplaceTruncate(ElementType(maxGradientPerMB));
+            else
+            {
+                // norm2 normalized
+                double gradientNorm = gradient.FrobeniusNorm();
+                if (gradientNorm > maxGradientPerMB)
+                {
+                    double normFactor = maxGradientPerMB / gradientNorm;
+                    gradient *= ElementType(normFactor);
+                }
+            }
+        }
+    }
+
+    // Performs additional preprocessing before calling the update method
+    // (gradient clipping and L2 regularization depending on the additional learning parameters).
+    template <typename ElementType>
+    void LearnerBase::PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const
+    {
+        const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
+
+        // clipping gradients to prevent outliers
+        ClipGradient<ElementType>(*gradientMatrix, actualMBSize);
+
+        // L2 regularizer
+        if (m_additionalOptions.l2RegularizationWeight > 0)
+        {
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
+            const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
+            Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
+        }
+    }
+
+    // Performs additional postprocessing after the update method has been executed
+    // (noise injection and L1 regularization specified by the additional learning parameters).
+    template <typename ElementType>
+    void LearnerBase::PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
+                                  const ValuePtr& parameterValue, size_t actualMBSize) const
+    {
+        const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
+        if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
+        {
+            const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
+
+            Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());
+
+            // get the gradient structure since gradient is sparse
+            sgdUpdateNoise.SetValue(*gradientMatrix);
+
+            auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);
+
+            // reset its value to random
+            sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);
+
+            Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
+        }
+
+        // L1 regularizer with proximal gradient descent method
+        if (m_additionalOptions.l1RegularizationWeight > 0)
+        {
+            auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
+            parameterValue->Data()->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
+        }
+    }
+
+    template <typename ElementType>
+    /*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(NDArrayViewPtr arrayView)
+    {
+        return arrayView->GetWritableTensorView<ElementType>();
+    }
+
+    LearnerBase::LearnerBase(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+        : Learner(parameters),
+        m_learningRatePerSample(0.0),
+        m_sampleCount(0)
+    {
+        const unordered_set<Variable>& parameterSet = parameters;
+        for (const auto& parameter : parameterSet)
+        {
+            // TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
+            // Should the device be specified on the per-parameter basis?
+            NDArrayViewPtr view;
+            if (parameter.GetDataType() == DataType::Float)
+            {
+                view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), device);
+            }
+            else
+            {
+                view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), device);
+            }
+
+            m_smoothedGradientValues.insert(make_pair(parameter, MakeSharedObject<Value>(view)));
+            m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
+        }
+    }
+
+    void LearnerBase::ResetSmoothedGradients()
+    {
+        for (const auto& parameter : Parameters())
+        {
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            const auto& data = smoothedGradientValue->Data();
+            switch (data->GetDataType())
+            {
+            case DataType::Float:
+                data->SetValue(0.0f);
+                break;
+            case DataType::Double:
+                data->SetValue(0.0);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
+            }
+        }
+    }
+
+    /*virtual*/ bool LearnerBase::Update(const unordered_map<Variable, ValuePtr>& parameterValues,
+                                         const unordered_map<Variable, const ValuePtr>& gradientValues,
+                                         size_t trainingSampleCount) /*override*/
+    {
+        // make sure trainingSampleCount is a valid value
+        assert(trainingSampleCount > 0);
+
+        for (const auto& parameter : Parameters())
+        {
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            const auto& gradientValue = gradientValues.at(parameter);
+            const auto& parameterValue = parameterValues.at(parameter);
+
+            // TODO: make this a runtime parameter.
+#if DUMPOUTPUT
+            LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
+#endif
+
+#ifdef _DEBUG
+            if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
+                LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
+#endif
+
+#if DUMPOUTPUT
+            LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
+                      m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
+            LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
+                      LearnerType().c_str(), m_GaussianNoiseInjectStd);
+            Print(gradientValue, "Gradient Update");
+            Print(smoothedGradientValue, "Smoothed Gradient Input");
+#endif
+            UPDATE_FUNCTION;
+
+#if DUMPOUTPUT
+            Print(parameterValue, "Parameter Update");
+#endif
+
+#ifdef _DEBUG
+            if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
+                LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
+#endif
+        }
+        m_sampleCount += trainingSampleCount;
+        return false;
+    }
+
+    template <typename ElementType>
+    void LearnerBase::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                             const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
+    {
+        PreProcess<ElementType>(gradientValue, parameterValue, trainingSampleCount);
+        Update(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount);
+        PostProcess<ElementType>(parameter, gradientValue, parameterValue, trainingSampleCount);
+    }
+
+    string LearnerBase::LearnerType() const
+    {
+        auto name = typeid(*this).name();
+        if (strncmp(name, "class ", 6) == 0)
+        {
+            // On Windows, the type name contains "class" prefix.
+            // Return the actual name, omitting the prefix.
+            return &name[6];
+        }
+        return name;
+    }
+
+    /*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
+    {
+        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        Dictionary checkpoint;
+
+        for (const auto& parameter : Parameters())
+        {
+            // TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
+            // need to expose "UId" property -- a persistent unique internal name.
+            // Switch to UId as soon as it's available.
+            if (checkpoint.Contains(parameter.Name()))
+            {
+                LogicError("Parameter names must be unique");
+            }
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+
+            // Potentially, could store things like dimensions, element size, format, etc., but
+            // that seems to be redundant, since all of that is passed in the constructor.
+            checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue->Data());
+        }
+        return checkpoint;
+    }
+
+    /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
+    {
+        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        for (const auto& parameter : Parameters())
+        {
+            if (!checkpoint.Contains(parameter.Name()))
+            {
+                LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
+            }
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+
+            const DictionaryValue& state = checkpoint[parameter.Name()];
+
+            const auto& data = smoothedGradientValue->Data();
+
+            DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
+        }
+    }
+
+    /*virtual*/ void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
+    {
+        UPDATE_FUNCTION;
+    }
+
+    template <typename ElementType>
+    void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
+    {
+        UNUSED(trainingSampleCount);
+
+        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
+        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
+        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
+
+        const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
+
+        // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
+        // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
+        smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
+                                           learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
+    }
+
+    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
+        : LearnerBase(parameters, device),
+        m_needAveMultiplier(needAveMultiplier)
+    {
+    }
+
+    /*virtual*/ void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
+    {
+        UPDATE_FUNCTION;
+    }
+
+    template <typename ElementType>
+    void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
+    {
+        UNUSED(trainingSampleCount);
+
+        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
+        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
+        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
+
+        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+
+        auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
+        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
+    }
+
+    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+        : LearnerMomentumSGD(parameters, device)
+    {
+    }
+
+    /*virtual*/ void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                              const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
+    {
+        UPDATE_FUNCTION;
+    }
+
+    template <typename ElementType>
+    void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                  const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
+    {
+        UNUSED(trainingSampleCount);
+
+        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
+        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
+        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
+
+        //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
+
+        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+
+        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
+                                          learningRate, ElementType(m_momentumPerSample));
+    }
+
+    LearnerRMSProp::LearnerRMSProp(const unordered_set<Variable>& parameters,
+                                   double gamma, double inc, double dec, double max, double min,
+                                   bool needAveMultiplier, const DeviceDescriptor& device)
+        : LearnerBase(parameters, device),
+        m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
+        m_needAveMultiplier(needAveMultiplier)
+    {
+    }
+
+    /*virtual*/ void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
+    {
+        UPDATE_FUNCTION;
+    }
+
+    template <typename ElementType>
+    void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
+    {
+        UNUSED(trainingSampleCount);
+
+        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
+        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
+        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
+
+        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+
+        auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
+                                                             ElementType(m_gamma), ElementType(m_inc),
+                                                             ElementType(m_max), ElementType(m_dec),
+                                                             ElementType(m_min), m_needAveMultiplier);
+        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
+    }
+
+    // Explicit template instantiations
+    template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr arrayView);
+    template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr arrayView);
+
+    LearnerPtr SGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerSGD>(parameters, device);
+    }
+
+    LearnerPtr MomentumSGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerMomentumSGD>(parameters, device);
+    }
+
+    LearnerPtr NesterovLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerNesterov>(parameters, device);
+    }
+
+    LearnerPtr AdaGradLearner(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier, device);
+    }
+
+    LearnerPtr FSAdaGradLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerFSAdaGrad>(parameters, device);
+    }
+
+    LearnerPtr RMSPropLearner(const unordered_set<Variable>& parameters,
+                              double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
+                              const DeviceDescriptor& device)
+    {
+        return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier, device);
+    }
+}

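The UPDATE_FUNCTION macro above implements a simple runtime-to-compile-time type dispatch: the DataType tag stored with the data selects the statically typed Update<ElementType> overload. A self-contained sketch of the same pattern, with illustrative names only:

#include <cstdio>

enum class DataType { Float, Double };

template <typename ElementType>
void UpdateImpl()
{
    // In the learner code this would operate on Matrix<ElementType>.
    std::printf("update with %zu-byte elements\n", sizeof(ElementType));
}

void Update(DataType type)
{
    switch (type)
    {
    case DataType::Float:  UpdateImpl<float>();  break;
    case DataType::Double: UpdateImpl<double>(); break;
    }
}

int main()
{
    Update(DataType::Float);  // dispatches to UpdateImpl<float>
    Update(DataType::Double); // dispatches to UpdateImpl<double>
}
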
@@ -0,0 +1,224 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#include "stdafx.h"
+#include "CNTKLibrary.h"
+
+namespace CNTK
+{
+    // A collection of additional options that are applicable for all standard learners
+    // (after these options are set, they retain their value for the entire lifespan of a learner).
+    struct AdditionalLearningOptions
+    {
+        double l1RegularizationWeight = 0.0;
+        double l2RegularizationWeight = 0.0;
+        double gaussianNoiseInjectionStdDev = 0.0;
+        bool gradientClippingWithTruncation = false;
+        double gradientClippingThresholdPerSample = 0.0;
+        std::unordered_map<Variable, double> learningRateMultipliers;
+    };
+
+    // An abstract base class at the root of the standard learners hierarchy.
+    // It implements most of the learner functionality, except for the actual update function,
+    // and adds a few pre-/postprocessing methods (which are invoked before and after the update).
+    class LearnerBase : public Learner
+    {
+    public:
+
+        CNTK_API virtual bool Update(const std::unordered_map<Variable, ValuePtr>& parameterValues,
+                                     const std::unordered_map<Variable, const ValuePtr>& gradientValues,
+                                     size_t trainingSampleCount) override final;
+
+        CNTK_API virtual Dictionary GetCheckpointState() const override;
+
+        CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override;
+
+        CNTK_API void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
+        {
+            m_additionalOptions = additionalOptions;
+        }
+
+        // TODO: should this be called ResetMomentum?
+        // Needed for BlockMomentumSGD to reset SGD momentum after aggregation.
+        CNTK_API void ResetSmoothedGradients();
+
+        // TODO: move learning rate and momentum scheduling and adjustment functionality
+        // inside the learner and drop these setters.
+        void SetLearningRate(double value) { m_learningRatePerSample = value; }
+
+    protected:
+        LearnerBase(const std::unordered_set<Variable>& parameters,
+                    const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+        virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const = 0;
+
+        double ParameterDependentLearningRate(const Variable& parameter) const
+        {
+            return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
+        }
+
+        std::string LearnerType() const;
+
+        double m_learningRatePerSample;
+
+        AdditionalLearningOptions m_additionalOptions;
+
+        std::unordered_map<Variable, ValuePtr> m_smoothedGradientValues;
+
+        // The following four static protected methods expose private methods of NDArrayView class
+        // (which declares LearnerBase as friend class), so that they are available to subclasses.
+        template <typename ElementType>
+        static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr arrayView);
+
+        template <typename ElementType>
+        static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(NDArrayViewPtr arrayView);
+
+        template <typename ElementType>
+        static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr arrayView);
+
+        template <typename ElementType>
+        static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(NDArrayViewPtr arrayView);
+
+        template <typename ElementType>
+        void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;
+
+        // Performs additional preprocessing before calling the update method
+        // (gradient clipping and L2 regularization depending on the additional learning parameters).
+        template <typename ElementType>
+        void PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const;
+
+        // Performs additional postprocessing after the update method has been executed
+        // (noise injection and L1 regularization specified by the additional learning parameters).
+        template <typename ElementType>
+        void PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
+                         const ValuePtr& parameterValue, size_t actualMBSize) const;
+    private:
+        // Templatized update function; it invokes preprocess and postprocess using the provided
+        // template parameter and also invokes the virtual Update method implemented in one of the subclasses.
+        template <typename ElementType>
+        void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
+
+        // TODO: make these functions friends of NDArrayView and move to Utils?
+        static bool HasNan(const ValuePtr& value, const char* name);
+        static void Print(const ValuePtr& value, const char* msg);
+
+        size_t m_sampleCount;
+    };
+
+    // Vanilla gradient descent optimization algorithm.
+    class LearnerSGD : public LearnerBase
+    {
+    public:
+
+        LearnerSGD(const std::unordered_set<Variable>& parameters,
+                   const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
+            : LearnerBase(parameters, device),
+            m_momentumPerSample(0.0),
+            m_useNesterovAcceleration(false)
+        {
+        }
+
+    protected:
+
+        virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
+
+        template <typename ElementType>
+        void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
+
+        double m_momentumPerSample;
+        bool m_useNesterovAcceleration;
+    };
+
+    // SGD optimization with momentum.
+    class LearnerMomentumSGD : public LearnerSGD
+    {
+    public:
+
+        LearnerMomentumSGD(const std::unordered_set<Variable>& parameters,
+                           const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
+            : LearnerSGD(parameters, device)
+        {
+        }
+
+        void SetMomentum(double value) { m_momentumPerSample = value; }
+    };
+
+    // Nesterov's accelerated gradient descent.
+    class LearnerNesterov : public LearnerSGD
+    {
+    public:
+
+        LearnerNesterov(const std::unordered_set<Variable>& parameters,
+                        const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
+            : LearnerSGD(parameters, device)
+        {
+            m_useNesterovAcceleration = true;
+        }
+    };
+
+    class LearnerAdaGrad : public LearnerBase
+    {
+    public:
+
+        LearnerAdaGrad(const std::unordered_set<Variable>& parameters, bool needAveMultiplier,
+                       const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    protected:
+        bool m_needAveMultiplier;
+
+        virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
+
+        template <typename ElementType>
+        void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
+    };
+
+    class LearnerFSAdaGrad : public LearnerMomentumSGD
+    {
+    public:
+
+        LearnerFSAdaGrad(const std::unordered_set<Variable>& parameters,
+                         const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    protected:
+
+        virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
+
+        template <typename ElementType>
+        void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
+    };
+
+    class LearnerRMSProp : public LearnerBase
+    {
+    public:
+
+        LearnerRMSProp(const std::unordered_set<Variable>& parameters,
+                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
+                       const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
+
+    protected:
+
+        double m_gamma;
+        double m_inc;
+        double m_dec;
+        double m_max;
+        double m_min;
+        bool m_needAveMultiplier;
+
+        virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
+
+        template <typename ElementType>
+        void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
+                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
+    };
+}

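Putting the pieces together, a hypothetical training-step sketch using only the API introduced in this commit; the maps of parameter and gradient Values are assumed to be produced elsewhere by the training loop:

#include "CNTKLibrary.h"

void TrainingStepSketch(const std::unordered_set<CNTK::Variable>& parameters,
                        std::unordered_map<CNTK::Variable, CNTK::ValuePtr>& parameterValues,
                        std::unordered_map<CNTK::Variable, const CNTK::ValuePtr>& gradientValues,
                        size_t minibatchSampleCount)
{
    // Create a built-in SGD learner over the parameter set on the default device.
    CNTK::LearnerPtr learner = CNTK::SGDLearner(parameters);

    // Apply one update; a return value of false signals that learning has stopped.
    bool keepLearning = learner->Update(parameterValues, gradientValues, minibatchSampleCount);
    (void)keepLearning;
}
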
@@ -338,8 +338,10 @@ namespace CNTK
     template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
     template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;

-    template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
-    template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
+    template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
+    template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
+    template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
+    template TensorView<double>* NDArrayView::GetWritableTensorView<double>();

     template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
     template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);

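The explicit instantiations above exist because these member templates are defined in a .cpp file rather than in the header; without them, callers in other translation units would compile but fail to link. A minimal self-contained example of the same idiom, with hypothetical names:

// twice.h -- declaration only; the definition lives in twice.cpp.
template <typename T> T Twice(T value);

// twice.cpp -- definition plus explicit instantiations for the supported types.
template <typename T> T Twice(T value) { return value + value; }
template float  Twice<float>(float);
template double Twice<double>(double);
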
@@ -6,11 +6,138 @@
 #include "stdafx.h"
 #include "CNTKLibrary.h"
 #include "Utils.h"
+#include "File.h"
+
+using namespace std;

 namespace CNTK
 {
+    template <typename T>
+    void DictionaryValue::AllocateDataPtr(const T& value)
+    {
+        static_assert(is_same<T, NDShape>::value || is_same<T, vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
+        m_data.m_ptr = new T(value);
+    }
+
+    template <typename T>
+    void DictionaryValue::FreePtrAsType()
+    {
+        T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
+        delete typedPtr;
+
+        m_data.m_ptr = nullptr;
+    }
+
+    void DictionaryValue::FreeDataPtr()
+    {
+        if (m_valueType == Type::NDShape)
+            FreePtrAsType<NDShape>();
+        else if (m_valueType == Type::Vector)
+            FreePtrAsType<vector<DictionaryValue>>();
+    }
+
+    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
+    {
+        size_t version;
+        stream >> version;
+
+        stream >> us.m_valueType;
+
+        switch (us.ValueType())
+        {
+        case DictionaryValue::Type::Bool:
+            stream >> us.m_data.m_boolean;
+            break;
+        case DictionaryValue::Type::SizeT:
+            stream >> us.m_data.m_sizeT;
+            break;
+        case DictionaryValue::Type::Float:
+            stream >> us.m_data.m_float;
+            break;
+        case DictionaryValue::Type::Double:
+            stream >> us.m_data.m_double;
+            break;
+        case DictionaryValue::Type::NDShape:
+        {
+            size_t size;
+            stream >> size;
+            vector<size_t> dims(size);
+            for (auto i = 0; i < size; i++)
+            {
+                stream >> dims[i];
+            }
+            us.AllocateDataPtr(NDShape(dims));
+            break;
+        }
+        case DictionaryValue::Type::Vector:
+        {
+            size_t size;
+            stream >> size;
+            vector<DictionaryValue> values(size);
+            for (auto i = 0; i < size; i++)
+            {
+                stream >> values[i];
+            }
+            us.AllocateDataPtr(values);
+            break;
+        }
+        default:
+            NOT_IMPLEMENTED;
+        }
+        return stream;
+    }
+
+    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
+    {
+        stream << us.version;
+
+        stream << us.ValueType();
+
+        switch (us.ValueType())
+        {
+        case DictionaryValue::Type::Bool:
+            stream << us.m_data.m_boolean;
+            break;
+        case DictionaryValue::Type::SizeT:
+            stream << us.m_data.m_sizeT;
+            break;
+        case DictionaryValue::Type::Float:
+            stream << us.m_data.m_float;
+            break;
+        case DictionaryValue::Type::Double:
+            stream << us.m_data.m_double;
+            break;
+        case DictionaryValue::Type::NDShape:
+        {
+            NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
+            auto size = shapePtr->NumAxes();
+            stream << size;
+            for (auto i = 0; i < size; i++)
+            {
+                stream << shapePtr->operator[](i);
+            }
+            break;
+        }
+        case DictionaryValue::Type::Vector:
+        {
+            vector<DictionaryValue>* vectorPtr =
+                reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
+            auto size = vectorPtr->size();
+            stream << size;
+            for (auto i = 0; i < size; i++)
+            {
+                stream << vectorPtr->operator[](i);
+            }
+            break;
+        }
+        default:
+            NOT_IMPLEMENTED;
+        }
+        return stream;
+    }
+
     Dictionary::Dictionary()
-        : m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
+        : m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
     {
     }

@@ -22,7 +149,7 @@ namespace CNTK
    Dictionary::Dictionary(Dictionary&& other)
        : m_dictionaryData(nullptr)
    {
-        *this = std::move(other);
+        *this = move(other);
    }

    Dictionary& Dictionary::operator=(Dictionary&& other)
@@ -51,4 +178,130 @@ namespace CNTK
    {
        return (m_dictionaryData->find(key) != m_dictionaryData->end());
    }

    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
    {
        stream << us.version;
        stream << us.m_dictionaryData->size();
        for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
        {
            stream << it->first;
            stream << it->second;
        }
        return stream;
    }

    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
    {
        size_t version;
        stream >> version;
        size_t size;
        stream >> size;
        us.m_dictionaryData->reserve(size);
        for (size_t i = 0; i < size; i++)
        {
            wstring key;
            stream >> key;
            DictionaryValue value;
            stream >> value;
            us.m_dictionaryData->insert(make_pair(key, value));
        }
        return stream;
    }

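For Dictionary itself the on-disk layout is simply a length-prefixed list of pairs; since no terminator or checksum follows, the reader must consume entries in exactly the order they were written:

    version : size_t
    count   : size_t
    count x (key : wstring, value : DictionaryValue)
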
    template <typename T>
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    {
        if (viewPtr->IsSparse())
        {
            LogicError("Sparse NDArrayView cannot be serialized into a vector.");
        }

        auto numElements = viewPtr->Shape().TotalSize();

        vector<DictionaryValue> values(numElements);

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
            cpuDataViewPtr->CopyFrom(*viewPtr);
        }

        const T* buffer = cpuDataViewPtr->DataBuffer<T>();
        for (size_t i = 0; i < numElements; ++i)
        {
            T v = buffer[i];
            values[i] = DictionaryValue(v);
        }

        return values;
    }

    template <typename T>
    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        if (viewPtr->IsSparse())
        {
            LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
        }

        auto numElements = viewPtr->Shape().TotalSize();

        if (values.size() != numElements)
        {
            LogicError("Number of elements (%zu) in the deserialized representation does not match the expected value (%zu)",
                       values.size(), numElements);
        }

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
        }

        T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
        for (size_t i = 0; i < numElements; ++i)
        {
            buffer[i] = values[i].GetValue<T>();
        }

        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            viewPtr->CopyFrom(*cpuDataViewPtr);
        }
    }

    // TODO: we store the type info for every element in the vector, which is extremely redundant.
    // Instead, it'd be nice to introduce some sort of DictionaryValueVector.
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            return SerializeToVector<float>(viewPtr);
        case DataType::Double:
            return SerializeToVector<double>(viewPtr);
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }

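One possible shape for the DictionaryValueVector mentioned in the TODO above, sketched purely as an illustration (the class and all of its members are hypothetical, not part of this commit): record the element Type once and keep an untagged payload, so serialization writes one tag per vector rather than one per element.

    // Hypothetical sketch, not part of this commit.
    class DictionaryValueVector
    {
    public:
        explicit DictionaryValueVector(DictionaryValue::Type elementType)
            : m_elementType(elementType)
        {}

        DictionaryValue::Type ElementType() const { return m_elementType; }
        void Append(double value) { m_payload.push_back(value); }
        size_t Size() const { return m_payload.size(); }
        double operator[](size_t i) const { return m_payload[i]; }

    private:
        DictionaryValue::Type m_elementType; // serialized once for the whole vector
        vector<double> m_payload;            // raw values, no per-element tags
    };
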
    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            DeserializeFromVector<float>(viewPtr, values);
            break;
        case DataType::Double:
            DeserializeFromVector<double>(viewPtr, values);
            break;
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }

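Together the typed templates and these dispatchers give a dense-tensor round trip through the DictionaryValue representation. A usage sketch (the constructor arguments mirror the MakeSharedObject call inside SerializeToVector; the shape and the fill step are illustrative assumptions):

    // Round-trip a small float tensor through the DictionaryValue representation.
    NDArrayViewPtr view = MakeSharedObject<NDArrayView>(DataType::Float, NDShape(vector<size_t>({ 2, 3 })), DeviceDescriptor::CPUDevice());
    // ... fill view->WritableDataBuffer<float>() with 6 values ...
    vector<DictionaryValue> blob = SerializeToVector(view); // one tagged entry per element
    DeserializeFromVector(view, blob);                      // writes the same values back in place
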
    template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
    template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
}

@@ -15,245 +15,6 @@ namespace CNTK
    // Forward declarations
    class Dictionary;

    class DictionaryValue
    {
    public:
        enum class Type : unsigned int
        {
            None,
            Bool,
            SizeT,
            Double,
            NDShape,
            Vector
        };

        static const char* TypeName(Type type)
        {
            if (type == Type::None)
                return "None";
            else if (type == Type::Bool)
                return "Bool";
            else if (type == Type::SizeT)
                return "SizeT";
            else if (type == Type::Double)
                return "Double";
            else if (type == Type::NDShape)
                return "NDShape";
            else if (type == Type::Vector)
                return "Vector";
            else
                LogicError("Unknown DictionaryValue::Type");
        }

    public:
        DictionaryValue()
            : m_valueType(Type::None)
        {
        }

        DictionaryValue(bool value)
            : m_valueType(GetValueType<bool>())
        {
            m_data.m_boolean = value;
        }

        DictionaryValue(size_t value)
            : m_valueType(GetValueType<size_t>())
        {
            m_data.m_sizeT = value;
        }

        DictionaryValue(double value)
            : m_valueType(GetValueType<double>())
        {
            m_data.m_double = value;
        }

        template <typename T>
        DictionaryValue(const T& value)
            : m_valueType(GetValueType<T>())
        {
            static_assert(std::is_same<T, NDShape>::value ||
                          std::is_same<T, std::vector<DictionaryValue>>::value,
                          "Unsupported ValueType");

            AllocateDataPtr(value);
        }

        DictionaryValue(const DictionaryValue& other)
            : m_valueType(Type::Bool)
        {
            // m_valueType must have been set to a non-pointer type to prevent an attempt to interpret
            // the underlying uninitialized value as a pointer and free it.
            *this = other;
        }

        DictionaryValue& operator=(const DictionaryValue& other)
        {
            if (this != &other)
            {
                FreeDataPtr();

                m_valueType = other.m_valueType;
                m_data = other.m_data;

                if (other.m_valueType == Type::NDShape)
                    AllocateDataPtr(other.GetValue<NDShape>());
                else if (other.m_valueType == Type::Vector)
                    AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
            }

            return *this;
        }

        ~DictionaryValue()
        {
            FreeDataPtr();
        }

        template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
        const T& GetValue() const
        {
            VerifyType<T>();
            return m_data.m_boolean;
        }

        template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
        const T& GetValue() const
        {
            VerifyType<T>();
            return m_data.m_sizeT;
        }

        template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
        const T& GetValue() const
        {
            VerifyType<T>();
            return m_data.m_double;
        }

        template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value>::type* = nullptr>
        const T& GetValue() const
        {
            VerifyType<T>();
            return *(reinterpret_cast<T*>(m_data.m_ptr));
        }

        bool HasValue() const
        {
            return m_valueType != Type::None;
        }

        Type ValueType() const
        {
            return m_valueType;
        }

    private:
        template <typename T>
        static Type GetValueType()
        {
            static_assert(std::is_same<T, bool>::value ||
                          std::is_same<T, size_t>::value ||
                          std::is_same<T, double>::value ||
                          std::is_same<T, NDShape>::value ||
                          std::is_same<T, std::vector<DictionaryValue>>::value ||
                          std::is_same<T, CNTK::Dictionary>::value,
                          "Unsupported ValueType");

            if (std::is_same<T, bool>::value)
                return Type::Bool;
            else if (std::is_same<T, size_t>::value)
                return Type::SizeT;
            else if (std::is_same<T, double>::value)
                return Type::Double;
            else if (std::is_same<T, NDShape>::value)
                return Type::NDShape;
            else if (std::is_same<T, std::vector<DictionaryValue>>::value)
                return Type::Vector;
        }

        template <typename T>
        void VerifyType() const
        {
            if (GetValueType<T>() != m_valueType)
                RuntimeError("Reading a DictionaryValue as the wrong type; reading as type %s when the actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
        }

        template <typename T>
        void AllocateDataPtr(const T& value)
        {
            static_assert(std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
            m_data.m_ptr = new T(value);
        }

        template <typename T>
        void FreePtrAsType()
        {
            T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
            delete typedPtr;

            m_data.m_ptr = nullptr;
        }

        void FreeDataPtr()
        {
            if (m_valueType == Type::NDShape)
                FreePtrAsType<NDShape>();
            else if (m_valueType == Type::Vector)
                FreePtrAsType<std::vector<DictionaryValue>>();
        }

    private:
        Type m_valueType;

        union ValueData
        {
            bool m_boolean;
            size_t m_sizeT;
            double m_double;
            void* m_ptr;
        } m_data;
    };

    class Dictionary
    {
    public:
        Dictionary();
        ~Dictionary();

        // Disallow copy construction and assignment
        Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;

        Dictionary(Dictionary&& other);
        Dictionary& operator=(Dictionary&& other);

        DictionaryValue& operator[](const std::wstring& key)
        {
            return operator[](key.c_str());
        }

        DictionaryValue& operator[](const wchar_t* key);

        DictionaryValue operator[](const std::wstring& key) const
        {
            return operator[](key.c_str());
        }

        DictionaryValue operator[](const wchar_t* key) const;

        bool Contains(const std::wstring& key) const
        {
            return Contains(key.c_str());
        }

        bool Contains(const wchar_t* key) const;

    private:
        std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
    };

    // Helper to get the size of an element of the specified DataType
    inline size_t ElementSize(DataType dataType)
    {
@@ -363,4 +124,8 @@ namespace CNTK
    {
        return var.IsInput() && var.IsSparse();
    }

    std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);

    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
}