Improve V2 Learner API, add basic serialization.
FSAdaGrad, RMSProp: pre-allocate smoothed gradients with the expected number of columns. Improve configuration of the built-in learners: add learning rate and momentum schedules. Add NDArrayView as a DictionaryValue type. Add tests for serialization and basic learner functionality.
This commit is contained in:
Parent: 32f97fd266
Commit: 9bd9308d2e

Makefile | 2
@@ -409,6 +409,8 @@ CNTKLIBRARY_TESTS_SRC =\
 	Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
 	Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
 	Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
+	Tests/UnitTests/V2LibraryTests/SerializationTests.cpp \
+	Tests/UnitTests/V2LibraryTests/LearnerTests.cpp \
 
 CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
 CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@@ -20,10 +20,12 @@
 #include <array>
 #include <stdarg.h>
 #include <assert.h>
+#include <map>
 #include <unordered_map>
 #include <unordered_set>
 #include <string>
 #include <sstream>
+#include <iosfwd>
 #include<algorithm>
 
 namespace CNTK
@@ -242,7 +244,7 @@ namespace CNTK
     }
 
     ///
-    /// Creates and returns a new shape contructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
+    /// Creates and returns a new shape constructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
     ///
     NDShape AppendShape(const NDShape& shape) const
     {
@@ -1645,6 +1647,7 @@ namespace CNTK
         NDShape,
         Vector,
         Dictionary,
+        NDArrayView,
     };
 
     static const char* TypeName(Type type)
@@ -1669,6 +1672,8 @@ namespace CNTK
             return "Vector";
         case Type::Dictionary:
             return "Dictionary";
+        case Type::NDArrayView:
+            return "NDArrayView";
         default:
             LogicError("Unknown DictionaryValue::Type");
         }
@@ -1715,8 +1720,9 @@ namespace CNTK
         static_assert((std::is_same<T, NDShape>::value ||
                        std::is_same<T, std::wstring>::value ||
                        std::is_same<T, std::vector<DictionaryValue>>::value ||
-                       std::is_same<T, Dictionary>::value),
-                      "Unsupported ValueType");
+                       std::is_same<T, Dictionary>::value ||
+                       std::is_same<T, NDArrayView>::value),
+                      "Unsupported ValueType");
 
         AllocateDataPtr(value);
     }
@@ -1728,6 +1734,13 @@ namespace CNTK
         *this = other;
     }
 
+    DictionaryValue(DictionaryValue&& other) : m_valueType(Type::Bool)
+    {
+        // The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
+        // the underlying uninitialized value as a ptr and free it.
+        *this = std::move(other);
+    }
+
     DictionaryValue& operator=(const DictionaryValue& other)
     {
         if (this != &other)
@@ -1745,11 +1758,34 @@ namespace CNTK
                 AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
             else if (other.m_valueType == Type::Dictionary)
                 AllocateDataPtr(other.GetValue<Dictionary>());
+            else if (other.m_valueType == Type::NDArrayView)
+                AllocateDataPtr(other.GetValue<NDArrayView>());
         }
 
         return *this;
     }
 
+    DictionaryValue& operator=(DictionaryValue&& other)
+    {
+        FreeDataPtr();
+
+        m_valueType = other.m_valueType;
+        m_data = other.m_data;
+
+        if (other.m_valueType == Type::String ||
+            other.m_valueType == Type::NDShape ||
+            other.m_valueType == Type::Vector ||
+            other.m_valueType == Type::Dictionary ||
+            other.m_valueType == Type::NDArrayView)
+        {
+            other.m_data.m_ptr = nullptr;
+        }
+
+        other.m_valueType = Type::None;
+
+        return *this;
+    }
+
     ~DictionaryValue()
     {
         FreeDataPtr();
@@ -1786,7 +1822,8 @@ namespace CNTK
     template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
                                                   std::is_same<T, std::wstring>::value ||
                                                   std::is_same<T, std::vector<DictionaryValue>>::value ||
-                                                  std::is_same<T, Dictionary>::value>::type* = nullptr>
+                                                  std::is_same<T, Dictionary>::value ||
+                                                  std::is_same<T, NDArrayView>::value>::type* = nullptr>
     const T& GetValue() const
     {
         VerifyType<T>();
@@ -1803,8 +1840,11 @@ namespace CNTK
         return m_valueType;
     }
 
-    friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
-    friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
+    CNTK_API bool operator==(const DictionaryValue& other) const;
+    CNTK_API bool operator!=(const DictionaryValue& other) const;
+
+    friend CNTK_API std::istream& operator>>(std::istream& stream, DictionaryValue& us);
+    friend CNTK_API std::ostream& operator<<(std::ostream& stream, const DictionaryValue& us);
 
 private:
     template <typename T>
@@ -1816,8 +1856,9 @@ namespace CNTK
                       std::is_same<T, double>::value ||
                       std::is_same<T, std::wstring>::value ||
                       std::is_same<T, NDShape>::value ||
-                      std::is_same<T, std::vector<DictionaryValue>>::value ||
-                      std::is_same<T, Dictionary>::value),
+                      std::is_same<T, std::vector<DictionaryValue>>::value ||
+                      std::is_same<T, Dictionary>::value ||
+                      std::is_same<T, NDArrayView>::value),
                       "Unsupported ValueType");
 
         if (std::is_same<T, bool>::value) return Type::Bool;
@@ -1828,6 +1869,7 @@ namespace CNTK
         if (std::is_same<T, NDShape>::value) return Type::NDShape;
         if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
         if (std::is_same<T, Dictionary>::value) return Type::Dictionary;
+        if (std::is_same<T, NDArrayView>::value) return Type::NDArrayView;
     }
 
     template <typename T>
@@ -1853,6 +1895,8 @@ namespace CNTK
             FreePtrAsType<std::vector<DictionaryValue>>();
         else if (m_valueType == Type::Dictionary)
             FreePtrAsType<Dictionary>();
+        else if (m_valueType == Type::NDArrayView)
+            FreePtrAsType<NDArrayView>();
     }
 
     Type m_valueType;
@@ -1906,9 +1950,11 @@ namespace CNTK
         return Contains(key.c_str());
     }
 
+    CNTK_API bool operator==(const Dictionary& other) const;
+    CNTK_API bool operator!=(const Dictionary& other) const;
+
-    friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
-    friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
+    friend CNTK_API std::istream& operator>>(std::istream& stream, Dictionary& us);
+    friend CNTK_API std::ostream& operator<<(std::ostream& stream, const Dictionary& us);
 
 private:
     std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
@@ -1957,37 +2003,127 @@ namespace CNTK
 
     };
 
+    ///
+    /// A collection of key-value pairs that represents a training parameter schedule in
+    /// terms of the number of processed samples.
+    /// This class provides a number of convenience constructors to allow easy conversion
+    /// from a single value, a vector of values and a list of pairs to the training schedule.
+    ///
+    template <typename T>
+    class TrainingParameterSchedule
+    {
+    public:
+        ///
+        /// Create a schedule with a constant parameter value.
+        ///
+        TrainingParameterSchedule(T value)
+            : m_schedule({ std::make_pair(0, value) }), m_unit(1)
+        {}
+
+        ///
+        /// Create a schedule where the parameter changes its value every 'unit' samples:
+        /// schedule[0] is used for the first 'unit' samples, schedule[1] -- for the second,
+        /// and so on. The last value is then used repeatedly until the end of training.
+        ///
+        TrainingParameterSchedule(const std::vector<T>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            size_t i = 1;
+            for (const auto& value : schedule)
+            {
+                m_schedule[m_unit * i++] = value;
+            }
+        }
+
+        ///
+        /// Create a schedule using the list of key-value pairs, where the key specifies
+        /// the number of 'units' the parameter should maintain the corresponding value.
+        /// The value from the last pair is used repeatedly until the end of training.
+        /// For example, {{1, 0.05}, {2, 0.1}, {1, 0.005}} with unit = 100 corresponds to
+        /// a schedule where the value of '0.05' is used for the first 100 samples, then
+        /// '0.1' is used for the next 200 samples, after which the value switches
+        /// to '0.005'.
+        ///
+        TrainingParameterSchedule(const std::initializer_list<std::pair<const size_t, T>>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            size_t i = 0;
+            for (const auto& it : schedule)
+            {
+                if (it.first == 0)
+                    RuntimeError("TrainingParameterSchedule::constructor : unit count cannot be 0.");
+
+                i += it.first;
+                m_schedule[m_unit * i] = it.second;
+            }
+        }
+
+        ///
+        /// Returns a value corresponding to the absolute sample count from the beginning of training.
+        ///
+        CNTK_API const T& operator[](size_t sampleCount) const;
+
+    private:
+        std::map<size_t, T> m_schedule;
+        size_t m_unit;
+    };
+
+    typedef TrainingParameterSchedule<double> LearningRatesPerSample;
+    typedef TrainingParameterSchedule<double> MomentumsPerSample;
+
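+    // Usage sketch: with unit = 100, LearningRatesPerSample({ { 1, 0.05 }, { 2, 0.1 }, { 1, 0.005 } }, 100)
+    // yields 0.05 for samples [0, 100), 0.1 for samples [100, 300), and 0.005 from sample 300 onwards;
+    // LearningRatesPerSample(0.05) keeps a constant rate for the whole run.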
     ///
     /// Create an instance of the CNTK built-in SGD learner.
     ///
-    /// TODO: add additional SGD parameters here (a collection of learning rate values)
-    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
+    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters,
+                                   const LearningRatesPerSample& learningRates);
 
     ///
     /// Create an instance of the CNTK built-in Momentum SGD learner.
     ///
-    /// TODO: add additional Momentum parameters here (a collection of momentum rate values)
-    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters,
+                                           const LearningRatesPerSample& learningRates,
+                                           const MomentumsPerSample& momentums);
 
     ///
     /// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
     ///
-    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters,
+                                        const LearningRatesPerSample& learningRates,
+                                        const MomentumsPerSample& momentums);
 
     ///
    /// Create an instance of the CNTK built-in AdaGrad learner.
     ///
-    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);
+    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
+                                       bool needAveMultiplier = true);
 
     ///
     /// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
     ///
-    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                         const LearningRatesPerSample& learningRates,
+                                         const MomentumsPerSample& momentums);
 
     ///
     /// Create an instance of the CNTK built-in RMSProp learner.
     ///
     CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
                                        double gamma,
                                        double inc,
                                        double dec,
@@ -1997,7 +2133,7 @@ namespace CNTK
 
     ///
     /// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
-    /// using the specified learners and training data either explicilty supplied as Value objects or from
+    /// using the specified learners and training data either explicitly supplied as Value objects or from
     /// a MinibatchSource object.
     ///
     class Trainer
@@ -2085,7 +2221,7 @@ namespace CNTK
     };
 
     ///
-    /// Abstraction for generating minbatches of samples for training/evaluation.
+    /// Abstraction for generating minibatches of samples for training/evaluation.
     ///
     class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
     {
@@ -2101,7 +2237,7 @@ namespace CNTK
     /// #samples or both. In case the size is specified in terms of both #sequences and #samples, the smaller of the 2 is taken. The actual
     /// returned size of the minibatch is the min across all streams. Also the requested MB size fields in the maps are updated by the
     /// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
-    /// The return value indciates if the MinibatchSource will return any further data in subsequent calls of this function.
+    /// The return value indicates if the MinibatchSource will return any further data in subsequent calls of this function.
     ///
     virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
                                                                            const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
@@ -53,8 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     template <typename ElementType>
     class ComputationNode;
-
-    class File;
 }}}
 
 // TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@@ -139,7 +137,7 @@ namespace CNTK
 #define NOT_IMPLEMENTED \
     { \
         fprintf(stderr, "Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
-        LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
+        CNTK::LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
     }
 #endif
 }
@@ -8,19 +8,18 @@
 #include "Utils.h"
 
 #define UPDATE_FUNCTION \
-    switch (smoothedGradientValue->GetDataType()) \
+    switch (smoothedGradientValue->GetDataType()) \
     { \
     case DataType::Float: \
-        Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
+        Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
         break; \
     case DataType::Double: \
-        Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
+        Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
        break; \
     default: \
         NOT_IMPLEMENTED; \
     }
-
 
 using namespace Microsoft::MSR::CNTK;
 using namespace std;
@@ -141,7 +140,7 @@ namespace CNTK
             // L1 regularizer with proximal gradient descent method
             if (m_additionalOptions.l1RegularizationWeight > 0)
             {
-                auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+                auto learningRate = ElementType(m_learningRates[m_sampleCount]);
                 // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
                 auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
                 parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
@@ -154,48 +153,49 @@ namespace CNTK
         return arrayView->GetWritableTensorView<ElementType>();
     }
 
-    LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters)
+    LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters,
+                             const LearningRatesPerSample& learningRates,
+                             bool allocateSmoothGradients /* = true */)
         : Learner(parameters),
-        m_learningRatePerSample(0.0),
-        m_sampleCount(0)
+        m_learningRates(learningRates),
+        m_sampleCount(0),
+        m_minibatchCount(0)
     {
-        const unordered_set<Parameter>& parameterSet = parameters;
-        for (const auto& parameter : parameterSet)
+        for (const auto& parameter : parameters)
        {
             // TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
             // Should the device be specified on the per-parameter basis?
-            NDArrayViewPtr view;
-            if (parameter.GetDataType() == DataType::Float)
+            if (!allocateSmoothGradients)
             {
-                view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device());
+                continue;
             }
-            else
-            {
-                view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
-            }
 
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
             m_smoothedGradientValues.insert(make_pair(parameter, view));
-            m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
         }
     }
 
-    void LearnerBase::ResetSmoothedGradients()
+    /*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
     {
-        for (const auto& parameter : Parameters())
+        if (parameter.GetDataType() == DataType::Float)
         {
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
-            const auto& data = smoothedGradientValue;
-            switch (data->GetDataType())
-            {
-            case DataType::Float:
-                data->SetValue(0.0f);
-                break;
-            case DataType::Double:
-                data->SetValue(0.0);
-                break;
-            default:
-                LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
-            }
+            return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
         }
+        else
+        {
+            return MakeSharedObject<NDArrayView>(0.0, shape, parameter.Value()->Device());
+        }
     }
 
+    /*static*/ NDShape LearnerBase::GetMatrixShape(const Parameter& parameter)
+    {
+        if (parameter.GetDataType() == DataType::Float)
+        {
+            auto matrix = GetMatrix<float>(parameter.Value());
+            return { matrix->GetNumRows(), matrix->GetNumCols() };
+        }
+        else
+        {
+            auto matrix = GetMatrix<double>(parameter.Value());
+            return { matrix->GetNumRows(), matrix->GetNumCols() };
+        }
+    }
 
@@ -219,17 +219,19 @@ namespace CNTK
 #endif
 
 #if DUMPOUTPUT
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
         LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
-                  m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
+                  learningRate, momentum, trainingSampleCount);
         LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
-                  LearnerType().c_str(), m_GaussianNoiseInjectStd);
+                  LearnerType().c_str(), m_additionalOptions.gaussianNoiseInjectionStdDev);
         Print(gradientValue, "Gradient Update");
         Print(smoothedGradientValue, "Smoothed Gradient Input");
 #endif
         UPDATE_FUNCTION;
 
 #if DUMPOUTPUT
-        Print(parameterValue, "Parameter Update");
+        Print(parameter.Value(), "Parameter Update");
 #endif
 
 #ifdef _DEBUG
@@ -239,6 +241,7 @@ namespace CNTK
 #endif
         }
         m_sampleCount += trainingSampleCount;
+        m_minibatchCount++;
         return false;
     }
 
@@ -265,9 +268,16 @@ namespace CNTK
 
     /*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
     {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        Dictionary checkpoint;
+
+        checkpoint[L"checkpointVersion"] = checkpointVersion;
+        checkpoint[L"sampleCount"] = m_sampleCount;
+        checkpoint[L"minibatchCount"] = m_minibatchCount;
+
+        // TODO: should we also save learning rate schedule into the checkpoint?
+        // If that is the case, need to be able to override this method in subclasses
+        // and save momentum schedule as well.
+
         for (const auto& parameter : Parameters())
         {
             // TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
@@ -277,31 +287,48 @@ namespace CNTK
             {
                 LogicError("Parameter names must be unique");
             }
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
 
-            // Potentially, could store things like dimensions, element size, format, etc., but
-            // that seems to be redundant, since all of that is passed in the constructor.
-            checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            checkpoint[parameter.Name()] = *smoothedGradientValue;
         }
         return checkpoint;
     }
 
     /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
     {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        m_sampleCount = checkpoint[L"sampleCount"].GetValue<size_t>();
+        m_minibatchCount = checkpoint[L"minibatchCount"].GetValue<size_t>();
+
+        size_t version = checkpoint[L"checkpointVersion"].GetValue<size_t>();
+        if (checkpointVersion != version)
+        {
+            // At the moment, we only support one version, so this should never happen.
+            LogicError("Unsupported checkpoint version.");
+        }
+
         for (const auto& parameter : Parameters())
         {
+            if (!checkpoint.Contains(parameter.Name()))
+            {
+                LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
+            }
+
             const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            const NDArrayView& checkpointedValue = checkpoint[parameter.Name()].GetValue<NDArrayView>();
+
+            if (smoothedGradientValue->GetDataType() != checkpointedValue.GetDataType())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient data type for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }
 
-            const DictionaryValue& state = checkpoint[parameter.Name()];
+            if (smoothedGradientValue->Shape() != checkpointedValue.Shape())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient shape for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }
 
-            const auto& data = smoothedGradientValue;
-
-            DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
+            smoothedGradientValue->CopyFrom(checkpointedValue);
         }
     }
 
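+    // Checkpoint round trip in brief: GetCheckpointState() captures the version stamp, the
+    // sample/minibatch counts and one NDArrayView per parameter; RestoreFromCheckpoint()
+    // validates the version, data types and shapes, then copies the smoothed gradients back.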
@@ -313,23 +340,25 @@ namespace CNTK
     template <typename ElementType>
     void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
     {
-        UNUSED(trainingSampleCount);
-
         const auto& parameterValue = parameter.Value();
         const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
 
-        const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
 
         // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
         // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
         smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
-                                           learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
+                                           learningRate, momentum, m_useNesterovAcceleration);
     }
 
-    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
-        : LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
+    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters,
+                                   const LearningRatesPerSample& learningRates,
+                                   bool needAveMultiplier)
+        : LearnerBase(parameters, learningRates),
+          m_needAveMultiplier(needAveMultiplier)
     {
     }
 
@@ -348,15 +377,23 @@ namespace CNTK
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
 
-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
 
         auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
         Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
     }
 
-    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
-        : LearnerMomentumSGD(parameters)
+    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
+                                       const MomentumsPerSample& momentums)
+        : LearnerMomentumSGD(parameters, learningRates, momentums, /*allocateSmoothGradients*/ false)
     {
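+        // NB: the FSAdagrad matrix kernel appears to keep two per-element state values in a
+        // single matrix, hence the view below is allocated with twice the number of columns.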
+        for (const auto& parameter : parameters)
+        {
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
     }
 
     /*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@@ -373,21 +410,33 @@ namespace CNTK
         const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
 
-        //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
-
-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
-
-        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
-                                          learningRate, ElementType(m_momentumPerSample));
-
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
+        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix, learningRate, momentum);
     }
 
-    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
-                                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
-        : LearnerBase(parameters),
-        m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
-        m_needAveMultiplier(needAveMultiplier)
+    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+                                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
+        : LearnerBase(parameters, learningRates, /*allocateSmoothGradients*/ false),
+        m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
+        m_needAveMultiplier(needAveMultiplier)
     {
+        for (const auto& parameter : parameters)
+        {
+            // When needAveMultiplier == true, CPU and GPU implementations of RMSProp require a different number of columns.
+            // TODO: verify that this is correct.
+            size_t factor = 3;
+            if (needAveMultiplier && parameter.Value()->Device().Type() == DeviceKind::GPU)
+            {
+                factor = 4;
+            }
+
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
+
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
     }
 
     /*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@@ -405,12 +454,12 @@ namespace CNTK
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
 
-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
 
         auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
-            ElementType(m_gamma), ElementType(m_inc),
-            ElementType(m_max), ElementType(m_dec),
-            ElementType(m_min), m_needAveMultiplier);
+                                                             ElementType(m_gamma), ElementType(m_inc),
+                                                             ElementType(m_max), ElementType(m_dec),
+                                                             ElementType(m_min), m_needAveMultiplier);
         Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
     }
 
@@ -418,34 +467,35 @@ namespace CNTK
     template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
     template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);
 
-    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
+    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates)
     {
-        return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
+        return MakeSharedObject<LearnerSGD>(parameters, learningRates);
     }
 
-    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
     {
-        return MakeSharedObject<LearnerMomentumSGD>(parameters);
+        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRates, momentums);
     }
 
-    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
     {
-        return MakeSharedObject<LearnerNesterov>(parameters);
+        return MakeSharedObject<LearnerNesterov>(parameters, learningRates, momentums);
     }
 
-    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
+    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, bool needAveMultiplier)
     {
-        return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
+        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRates, needAveMultiplier);
     }
 
-    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
    {
-        return MakeSharedObject<LearnerFSAdaGrad>(parameters);
+        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRates, momentums);
     }
 
-    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
-                              double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
+    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+                              double gamma, double inc, double dec, double max, double min,
+                              bool needAveMultiplier)
     {
-        return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
+        return MakeSharedObject<LearnerRMSProp>(parameters, learningRates, gamma, inc, dec, max, min, needAveMultiplier);
     }
 }
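A minimal construction sketch against the factories above (illustrative; 'parameters' stands for a model's parameter set):

    LearningRatesPerSample learningRates({ 0.05, 0.005 }, 10000); // 0.05 for the first 10000 samples, 0.005 afterwards
    MomentumsPerSample momentums(0.9);                            // constant momentum
    LearnerPtr learner = MomentumSGDLearner(parameters, learningRates, momentums);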
@@ -9,6 +9,7 @@
 
 namespace CNTK
 {
+    // TODO: Move this to Trainer along with Pre-, PostProcess and ClipGradient.
     // A collection of additional options that are applicable for all standard learners
     // (after these options are set, they retain their value for the entire lifespan of a learner).
     struct AdditionalLearningOptions
@@ -18,7 +19,6 @@ namespace CNTK
         double gaussianNoiseInjectionStdDev = 0.0;
         bool gradientClippingWithTruncation = true;
         double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
-        std::unordered_map<Parameter, double> learningRateMultipliers;
     };
 
     // An abstract base class at the root of the standard learners hierarchy
@@ -33,32 +33,16 @@ namespace CNTK
 
         virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;
 
         void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
         {
             m_additionalOptions = additionalOptions;
         }
 
-        // TODO: should this be called ResetMomentum?
-        // needed for BlockMomemtumSGD to reset SGD momentum after aggregation.
-        void ResetSmoothedGradients();
-
-        // TODO: move learning rate and momentum scheduling and adjustment functionality
-        // inside the learner and drop these setters.
-        void SetLearningRate(double value) { m_learningRatePerSample = value; }
-
     protected:
-        LearnerBase(const std::unordered_set<Parameter>& parameters);
+        LearnerBase(const std::unordered_set<Parameter>& parameters,
+                    const LearningRatesPerSample& learningRates,
+                    bool allocateSmoothGradients = true);
 
         virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;
 
-        double ParameterDependentLearningRate(const Parameter& parameter) const
-        {
-            return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
-        }
-
         std::string LearnerType() const;
 
-        double m_learningRatePerSample;
+        LearningRatesPerSample m_learningRates;
 
         AdditionalLearningOptions m_additionalOptions;
 
@@ -91,6 +75,16 @@ namespace CNTK
         template <typename ElementType>
         void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;
 
+        // Returns an NDArrayView with the required shape, with the same data type as parameter value
+        // and allocated on the same device.
+        static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
+
+        // Retrieves the shape of the matrix corresponding to the parameter value.
+        static NDShape GetMatrixShape(const Parameter& parameter);
+
+        size_t m_sampleCount;
+        size_t m_minibatchCount;
+
     private:
         // Templatized update function, it invokes preprocess and postprocess using the provided
         // template parameter and also invokes virtual Update method implemented in one of the subclasses.
@@ -101,18 +95,20 @@ namespace CNTK
         static bool HasNan(const NDArrayViewPtr& value, const char* name);
         static void Print(const NDArrayViewPtr& value, const char* msg);
 
-        size_t m_sampleCount;
+        static const size_t checkpointVersion = 1;
     };
 
     // Vanilla gradient descent optimization algorithm.
     class LearnerSGD : public LearnerBase
     {
     public:
-        LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
-            : LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
-        {
-            SetLearningRate(learningRatePerSample);
-        }
+        LearnerSGD(const std::unordered_set<Parameter>& parameters,
+                   const LearningRatesPerSample& learningRates,
+                   bool allocateSmoothGradients = true)
+            : LearnerBase(parameters, learningRates, allocateSmoothGradients),
+              m_momentums(0.0),
+              m_useNesterovAcceleration(false)
+        { }
 
     protected:
 
@@ -121,7 +117,8 @@ namespace CNTK
         template <typename ElementType>
         void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
 
-        double m_momentumPerSample;
+        // TODO: Move m_momentums to LearnerMomentumSGD as soon as NormalGrad is refactored.
+        MomentumsPerSample m_momentums;
         bool m_useNesterovAcceleration;
     };
 
@@ -129,20 +126,25 @@ namespace CNTK
     class LearnerMomentumSGD : public LearnerSGD
     {
     public:
-        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
-        {}
-
-        void SetMomentum(double value) { m_momentumPerSample = value; }
+        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters,
+                           const LearningRatesPerSample& learningRates,
+                           const MomentumsPerSample& momentums,
+                           bool allocateSmoothGradients = true)
+            : LearnerSGD(parameters, learningRates, allocateSmoothGradients)
+        {
+            m_momentums = momentums;
+        }
     };
 
     // Nesterov's accelerated SGD descent.
-    class LearnerNesterov : public LearnerSGD
+    class LearnerNesterov : public LearnerMomentumSGD
     {
     public:
 
-        LearnerNesterov(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
+        LearnerNesterov(const std::unordered_set<Parameter>& parameters,
+                        const LearningRatesPerSample& learningRates,
+                        const MomentumsPerSample& momentums)
+            : LearnerMomentumSGD(parameters, learningRates, momentums)
         {
             m_useNesterovAcceleration = true;
         }
@@ -152,7 +154,9 @@ namespace CNTK
     {
     public:
 
-        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);
+        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters,
+                       const LearningRatesPerSample& learningRates,
+                       bool needAveMultiplier);
 
     protected:
         bool m_needAveMultiplier;
@@ -167,7 +171,9 @@ namespace CNTK
     {
     public:
 
-        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);
+        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters,
+                         const LearningRatesPerSample& learningRates,
+                         const MomentumsPerSample& momentums);
 
     protected:
 
@@ -182,7 +188,9 @@ namespace CNTK
     public:
 
         LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
-                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);
+                       const LearningRatesPerSample& learningRates,
+                       double gamma, double inc, double dec, double max, double min,
+                       bool needAveMultiplier);
 
     protected:
 
@@ -6,20 +6,100 @@
 #include "stdafx.h"
 #include "CNTKLibrary.h"
 #include "Utils.h"
-#include "File.h"
+#include <istream>
+#include <ostream>
 
 using namespace std;
 
 namespace CNTK
 {
+    // This wrapper redefines operator<< in terms of unformatted (binary) write operation.
+    struct BinaryOStreamWrapper
+    {
+        BinaryOStreamWrapper(ostream& s) : m_stream(s) {}
+
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryOStreamWrapper&>::type
+        operator<<(const T& value)
+        {
+            m_stream.write(reinterpret_cast<const char*>(&value), sizeof(T));
+            return *this;
+        }
+
+        BinaryOStreamWrapper& operator<<(const wstring& str)
+        {
+            *this << str.length();
+            m_stream.write(reinterpret_cast<const char*>(str.c_str()), str.length() * sizeof(wchar_t));
+            return *this;
+        }
+
+        operator ostream& () { return m_stream; }
+
+        ostream& m_stream;
+        BinaryOStreamWrapper(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper(BinaryOStreamWrapper&&) = delete; BinaryOStreamWrapper& operator=(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper& operator=(BinaryOStreamWrapper&&) = delete;
+    };
+
+    // This wrapper redefines operator>> in terms of unformatted (binary) read operation.
+    struct BinaryIStreamWrapper
+    {
+        BinaryIStreamWrapper(istream& s) : m_stream(s) {}
+
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryIStreamWrapper&>::type
+        operator>>(T& value)
+        {
+            static_assert(sizeof(T) <= sizeof(size_t), "size_t is the largest supported type.");
+            m_stream.read(buf, sizeof(T));
+            value = *(reinterpret_cast<T*>(buf));
+            return *this;
+        }
+
+        BinaryIStreamWrapper& operator>>(wstring& str)
+        {
+            size_t length;
+            *this >> length;
+            str.resize(length);
+            for (size_t i = 0; i < length; ++i)
+            {
+                m_stream.read(buf, sizeof(wchar_t));
+                str[i] = *(reinterpret_cast<wchar_t*>(buf));
+            }
+
+            return *this;
+        }
+
+        operator istream& () const { return m_stream; }
+
+        istream& m_stream;
+        char buf[sizeof(size_t)];
+        BinaryIStreamWrapper(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper(BinaryIStreamWrapper&&) = delete; BinaryIStreamWrapper& operator=(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper& operator=(BinaryIStreamWrapper&&) = delete;
+    };
+
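+    // Round-trip sketch (assuming the Dictionary operator<< / operator>> overloads defined
+    // further down in this file):
+    //     std::stringstream s;
+    //     s << dict;           // binary write via BinaryOStreamWrapper
+    //     s >> restoredDict;   // binary read via BinaryIStreamWrapper
+    //     assert(dict == restoredDict);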
+    template <typename T>
+    T* CreateDataPtr(const T& value)
+    {
+        return new T(value);
+    }
+
+    template <>
+    NDArrayView* CreateDataPtr<NDArrayView>(const NDArrayView& value)
+    {
+        // TODO: replace this copy with an alias to value.
+        NDArrayView* viewPtr = new NDArrayView(value.GetDataType(), value.Shape(), DeviceDescriptor::CPUDevice());
+        viewPtr->CopyFrom(value);
+        return viewPtr;
+    }
+
     template <typename T>
     void DictionaryValue::AllocateDataPtr(const T& value)
     {
         static_assert(is_same<T, NDShape>::value ||
                       is_same<T, wstring>::value ||
                       is_same<T, vector<DictionaryValue>>::value ||
-                      is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
-        m_data.m_ptr = new T(value);
+                      is_same<T, Dictionary>::value ||
+                      is_same<T, NDArrayView>::value,
+                      "AllocateDataPtr called with invalid type");
+        m_data.m_ptr = CreateDataPtr<T>(value);
     }
 
     template <typename T>
@@ -31,12 +111,163 @@ namespace CNTK
         m_data.m_ptr = nullptr;
     }
 
-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
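+    // Compares two NDArrayViews element-wise; views that do not already reside on the CPU are
+    // first copied into temporary CPU-side buffers before their contents are compared.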
+    template <typename ElementType>
+    bool AreEqual(NDArrayView& view1, NDArrayView& view2)
+    {
+        if (view1.GetDataType() != view2.GetDataType() ||
+            view1.Shape() != view2.Shape())
+        {
+            return false;
+        }
+
+        ElementType* data1 = nullptr;
+        ElementType* data2 = nullptr;
+        NDArrayViewPtr temp1CpuDataView, temp2CpuDataView;
+        if (view1.Device().Type() == DeviceKind::CPU)
+        {
+            data1 = view1.WritableDataBuffer<ElementType>();
+            data2 = view2.WritableDataBuffer<ElementType>();
+        }
+        else
+        {
+            temp1CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view1.Shape(), DeviceDescriptor::CPUDevice());
+            temp1CpuDataView->CopyFrom(view1);
+            data1 = temp1CpuDataView->WritableDataBuffer<ElementType>();
+
+            temp2CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view2.Shape(), DeviceDescriptor::CPUDevice());
+            temp2CpuDataView->CopyFrom(view2);
+            data2 = temp2CpuDataView->WritableDataBuffer<ElementType>();
+        }
+
+        size_t numElements = view1.Shape().TotalSize();
+
+        for (size_t i = 0; i < numElements; ++i)
+        {
+            if (data1[i] != data2[i])
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool DictionaryValue::operator==(const DictionaryValue& other) const
+    {
+        if (this == &other)
+        {
+            return true;
+        }
+
+        if (m_valueType != other.m_valueType)
+        {
+            return false;
+        }
+
+        switch (m_valueType)
+        {
+        case DictionaryValue::Type::Bool:
+            return (m_data.m_boolean == other.m_data.m_boolean);
+        case DictionaryValue::Type::SizeT:
+            return (m_data.m_sizeT == other.m_data.m_sizeT);
+        case DictionaryValue::Type::Float:
+            return (m_data.m_float == other.m_data.m_float);
+        case DictionaryValue::Type::Double:
+            return (m_data.m_double == other.m_data.m_double);
+        case DictionaryValue::Type::String:
+        {
+            wstring* strPtr1 = reinterpret_cast<wstring*>(m_data.m_ptr);
+            wstring* strPtr2 = reinterpret_cast<wstring*>(other.m_data.m_ptr);
+            return (*strPtr1 == *strPtr2);
+        }
+        case DictionaryValue::Type::NDShape:
+        {
+            NDShape* shapePtr1 = reinterpret_cast<NDShape*>(m_data.m_ptr);
+            NDShape* shapePtr2 = reinterpret_cast<NDShape*>(other.m_data.m_ptr);
+            return (*shapePtr1 == *shapePtr2);
+        }
+        case DictionaryValue::Type::Vector:
+        {
+            vector<DictionaryValue>* vectorPtr1 = reinterpret_cast<vector<DictionaryValue>*>(m_data.m_ptr);
+            vector<DictionaryValue>* vectorPtr2 = reinterpret_cast<vector<DictionaryValue>*>(other.m_data.m_ptr);
+            return (*vectorPtr1 == *vectorPtr2);
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr1 = reinterpret_cast<Dictionary*>(m_data.m_ptr);
+            Dictionary* dictPtr2 = reinterpret_cast<Dictionary*>(other.m_data.m_ptr);
+            return (*dictPtr1 == *dictPtr2);
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr1 = reinterpret_cast<NDArrayView*>(m_data.m_ptr);
+            NDArrayView* viewPtr2 = reinterpret_cast<NDArrayView*>(other.m_data.m_ptr);
+
+            switch (viewPtr1->GetDataType())
+            {
+            case DataType::Float:
+                return AreEqual<float>(*viewPtr1, *viewPtr2);
+            case DataType::Double:
+                return AreEqual<double>(*viewPtr1, *viewPtr2);
+            default:
+                NOT_IMPLEMENTED;
+            }
+        }
+        default:
+            NOT_IMPLEMENTED;
+        }
+    }
+
+    bool DictionaryValue::operator!=(const DictionaryValue& other) const
+    {
+        return !(*this == other);
+    }
+
+    BinaryOStreamWrapper& operator<<(BinaryOStreamWrapper& stream, const NDShape& us)
+    {
+        auto size = us.NumAxes();
+        stream << size;
+        for (auto i = 0; i < size; i++)
+        {
+            stream << us[i];
+        }
+        return stream;
+    }
+
+    template <typename T>
+    void Write(BinaryOStreamWrapper& stream, const NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        auto numElements = view.Shape().TotalSize();
+        const T* buffer = view.DataBuffer<T>();
+        for (auto i = 0; i < numElements; ++i)
+        {
+            stream << buffer[i];
+        }
+    }
+
+    template <typename T>
+    void Read(BinaryIStreamWrapper& stream, NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        auto numElements = view.Shape().TotalSize();
+        T* buffer = view.WritableDataBuffer<T>();
+        for (auto i = 0; i < numElements; ++i)
+        {
+            stream >> buffer[i];
+        }
+    }
+
+    istream& operator>>(istream& stdStream, DictionaryValue& us)
     {
+        BinaryIStreamWrapper stream(stdStream);
+        size_t version;
+        stream >> version;
+
-        stream >> us.m_valueType;
-
+        unsigned int type;
+        stream >> type;
+        us.m_valueType = static_cast<DictionaryValue::Type>(type);
 
         switch (us.ValueType())
         {
@@ -52,28 +283,72 @@ namespace CNTK
         case DictionaryValue::Type::Double:
             stream >> us.m_data.m_double;
             break;
         case DictionaryValue::Type::String:
         {
             wstring* strPtr = new wstring();
             stream >> *strPtr;
             us.m_data.m_ptr = strPtr;
             break;
         }
         case DictionaryValue::Type::NDShape:
         {
             size_t size;
             stream >> size;
-            vector<size_t> dims(size);
+            NDShape* shapePtr = new NDShape(size);
             for (auto i = 0; i < size; i++)
             {
-                stream >> dims[i];
+                stream >> shapePtr->operator[](i);
             }
-            us.AllocateDataPtr(NDShape(dims));
+            us.m_data.m_ptr = shapePtr;
             break;
         }
         case DictionaryValue::Type::Vector:
         {
             size_t size;
             stream >> size;
-            vector<DictionaryValue> values(size);
+            vector<DictionaryValue>* vectorPtr = new vector<DictionaryValue>(size);
             for (auto i = 0; i < size; i++)
             {
-                stream >> values[i];
+                stream >> vectorPtr->at(i);
             }
-            us.AllocateDataPtr(values);
+            us.m_data.m_ptr = vectorPtr;
             break;
         }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = new Dictionary();
+            stream >> *dictPtr;
+            us.m_data.m_ptr = dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            unsigned int type;
+            stream >> type;
+            DataType dtype = static_cast<DataType>(type);
+
+            size_t size;
+            stream >> size;
+            NDShape shape(size);
+            for (auto i = 0; i < size; i++)
+            {
+                stream >> shape[i];
+            }
+
+            NDArrayView* viewPtr = new NDArrayView(dtype, shape, DeviceDescriptor::CPUDevice());
+            switch (dtype)
+            {
+            case DataType::Float:
+                Read<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Read<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(dtype));
+            }
+
+            us.m_data.m_ptr = viewPtr;
+            break;
+        }
         default:
@@ -82,11 +357,13 @@ namespace CNTK
         return stream;
     }
 
-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
+    ostream& operator<<(ostream& stdStream, const DictionaryValue& us)
     {
+        BinaryOStreamWrapper stream(stdStream);
+
+        stream << us.version;
+
-        stream << us.ValueType();
+        stream << static_cast<unsigned int>(us.ValueType());
 
         switch (us.ValueType())
         {
@@ -102,15 +379,16 @@ namespace CNTK
         case DictionaryValue::Type::Double:
             stream << us.m_data.m_double;
             break;
         case DictionaryValue::Type::String:
         {
             wstring* stringPtr = reinterpret_cast<wstring*>(us.m_data.m_ptr);
             stream << *stringPtr;
             break;
         }
         case DictionaryValue::Type::NDShape:
         {
             NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
-            auto size = shapePtr->NumAxes();
-            stream << size;
-            for (auto i = 0; i < size; i++)
-            {
-                stream << shapePtr->operator[](i);
-            }
+            stream << *shapePtr;
             break;
         }
         case DictionaryValue::Type::Vector:
@@ -121,7 +399,31 @@ namespace CNTK
             stream << size;
             for (auto i = 0; i < size; i++)
             {
-                stream << vectorPtr->operator[](i);
+                stream << vectorPtr->at(i);
             }
             break;
         }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = reinterpret_cast<Dictionary*>(us.m_data.m_ptr);
+            stream << *dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr = reinterpret_cast<NDArrayView*>(us.m_data.m_ptr);
+            stream << static_cast<unsigned int>(viewPtr->GetDataType());
+            stream << viewPtr->Shape();
+            switch (viewPtr->GetDataType())
+            {
+            case DataType::Float:
+                Write<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Write<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
+            }
+            break;
+        }
@@ -148,7 +450,7 @@ namespace CNTK
     Dictionary& Dictionary::operator=(const Dictionary& other)
    {
         assert(this != &other);
-        m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
+        m_dictionaryData.reset(new unordered_map<wstring, DictionaryValue>(*(other.m_dictionaryData)));
         return *this;
     }
 
@@ -183,20 +485,51 @@ namespace CNTK
         return (m_dictionaryData->find(key) != m_dictionaryData->end());
     }
 
-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
+    bool Dictionary::operator==(const Dictionary& other) const
     {
+        if (this == &other)
+        {
+            return true;
+        }
+
+        if (m_dictionaryData->size() != other.m_dictionaryData->size())
+        {
+            return false;
+        }
+
+        for (auto& kv : *m_dictionaryData)
+        {
+            auto result = other.m_dictionaryData->find(kv.first);
+            if (result == other.m_dictionaryData->end() || kv.second != result->second)
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    bool Dictionary::operator!=(const Dictionary& other) const
+    {
+        return !(*this == other);
+    }
+
+    ostream& operator<<(ostream& stdStream, const Dictionary& us)
+    {
+        BinaryOStreamWrapper stream(stdStream);
+        stream << us.version;
         stream << us.m_dictionaryData->size();
-        for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
+        for (auto& kv : *(us.m_dictionaryData))
         {
-            stream << it->first;
-            stream << it->second;
+            stream << kv.first;
+            stream << kv.second;
         }
         return stream;
     }
 
-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
+    istream& operator>>(istream& stdStream, Dictionary& us)
     {
+        BinaryIStreamWrapper stream(stdStream);
+        size_t version;
+        stream >> version;
         size_t size;
@ -206,113 +539,36 @@ namespace CNTK
|
|||
{
|
||||
wstring key;
|
||||
stream >> key;
|
||||
DictionaryValue value;
|
||||
stream >> value;
|
||||
us.m_dictionaryData->insert(make_pair(key, value));
|
||||
stream >> us[key];
|
||||
}
|
||||
return stream;
|
||||
}
|
||||
|
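The read loop now streams directly into us[key]. A sketch of the equivalent explicit form, assuming (as the usage implies, though the diff does not show it) that Dictionary::operator[] inserts a default-constructed value for a missing key and returns a reference:

// Sketch only; the helper name is illustrative and not part of the commit.
inline void ReadEntry(CNTK::BinaryIStreamWrapper& stream, CNTK::Dictionary& us)
{
    std::wstring key;
    stream >> key;
    CNTK::DictionaryValue& slot = us[key]; // default-constructs the entry if absent
    stream >> slot;                        // then overwrites it in place
}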
    // Returns the value from the first element whose key is greater than the given sample count,
    // or the last element's value if no such key exists.
    template <typename T>
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    const T& TrainingParameterSchedule<T>::operator[](size_t sampleCount) const
    {
        if (viewPtr->IsSparse())
        assert(m_schedule.size() > 0);
        auto it = m_schedule.upper_bound(sampleCount);
        if (it == m_schedule.end())
        {
            LogicError("Sparse NDArrayView cannot be serialized into a vector.");
            --it;
        }

        auto numElements = viewPtr->Shape().TotalSize();

        vector<DictionaryValue> values(numElements);

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
            cpuDataViewPtr->CopyFrom(*viewPtr);
        }

        const T* buffer = cpuDataViewPtr->DataBuffer<T>();
        for (auto i = 0; i < numElements; ++i)
        {
            T v = buffer[i];
            values[i] = DictionaryValue(v);
        }

        return values;
        return it->second;
    }
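The same lookup in isolation, as a minimal sketch assuming m_schedule is an ordered map from cumulative sample-count upper bounds to parameter values (an assumption inferred from the unit tests below; the function name is illustrative):

#include <cassert>
#include <map>

// upper_bound returns the first entry whose key exceeds the queried sample
// count; past the last key, the last value stays in effect.
double ScheduleValueAt(const std::map<size_t, double>& schedule, size_t sampleCount)
{
    assert(!schedule.empty());
    auto it = schedule.upper_bound(sampleCount);
    if (it == schedule.end())
        --it;                   // clamp to the final entry
    return it->second;
}

// E.g. with {{10, 0.5}, {20, 0.3}}: samples 0..9 map to 0.5, samples 10..19
// map to 0.3, and every sample from 20 onward also maps to 0.3.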
    template <typename T>
    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        if (viewPtr->IsSparse())
        {
            LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
        }

        auto numElements = viewPtr->Shape().TotalSize();

        if (values.size() != numElements)
        {
            LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
                       values.size(), numElements);
        }

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
        }

        T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
        for (auto i = 0; i < numElements; ++i)
        {
            buffer[i] = values[i].GetValue<T>();
        }

        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            viewPtr->CopyFrom(*cpuDataViewPtr);
        }
    }

    // TODO: we store the type info for every element in the vector, which is extremely redundant.
    // Instead, it'd be nice to introduce some sort of DictionaryValueVector.
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            return SerializeToVector<float>(viewPtr);
        case DataType::Double:
            return SerializeToVector<double>(viewPtr);
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }

    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            DeserializeFromVector<float>(viewPtr, values);
            break;
        case DataType::Double:
            DeserializeFromVector<double>(viewPtr, values);
            break;
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }

    template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
    template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
    template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
    template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);
    template void DictionaryValue::AllocateDataPtr<NDArrayView>(const NDArrayView& value);

    template void DictionaryValue::FreePtrAsType<NDShape>();
    template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
    template void DictionaryValue::FreePtrAsType<wstring>();
    template void DictionaryValue::FreePtrAsType<Dictionary>();
    template void DictionaryValue::FreePtrAsType<NDArrayView>();

    template const double& TrainingParameterSchedule<double>::operator[](size_t key) const;
}
@ -167,10 +167,6 @@ namespace CNTK
        return var.IsInput() && var.IsSparse();
    }

    std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);

    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);

    inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
    {
        for (size_t i = 0; i < numIndentationSpaces; ++i)
@ -313,4 +309,9 @@ namespace CNTK

        return{ paddedOutputMapCount, kernelShape };
    }

    inline double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
    {
        return std::pow(momentumPerSample, minibatchSize);
    }
}
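A quick worked example of this conversion, since per-sample momentum compounds once per sample across a minibatch:

// Worked example for MomentumPerMB (values arbitrary): a per-sample momentum
// of 0.99 over a 256-sample minibatch gives
// MomentumPerMB(0.99, 256) == std::pow(0.99, 256), which is approximately 0.076.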
@ -1,9 +1,10 @@
#pragma once

#include <exception>
#include <algorithm>
#include "CNTKLibrary.h"
#include <functional>
#include <fstream>
#include <random>

static const double relativeTolerance = 0.001f;
static const double absoluteTolerance = 0.000001f;

@ -21,6 +22,8 @@ inline void FloatingPointVectorCompare(const std::vector<ElementType>& first, co
    }
}

static std::mt19937_64 rng(0);

#pragma warning(push)
#pragma warning(disable: 4996)

@ -40,6 +43,12 @@ static inline int _wunlink(const wchar_t *p)
{
    return unlink(wtocharpath(p).c_str());
}

static inline FILE *_wfopen(const wchar_t *path, const wchar_t *mode)
{
    return fopen(wtocharpath(path).c_str(), wtocharpath(mode).c_str());
}

#endif

template <typename ElementType>

@ -112,3 +121,30 @@ inline float PrevMinibatchTrainingLossValue(const CNTK::Trainer& trainer)
}

#pragma warning(pop)

inline CNTK::NDShape CreateShape(size_t numAxes, size_t maxDimSize)
{
    CNTK::NDShape shape(numAxes);
    for (size_t i = 0; i < numAxes; ++i)
    {
        shape[i] = (rng() % maxDimSize) + 1;
    }

    return shape;
}

inline void OpenStream(std::fstream& stream, const std::wstring& filename, bool readonly)
{
    if (filename.empty())
        throw std::runtime_error("File: filename is empty");

    std::ios_base::openmode mode = std::ios_base::binary;
    mode = mode | (readonly ? std::ios_base::in : std::ios_base::out);

#ifdef _MSC_VER
    stream.open(filename.c_str(), mode);
#else
    stream.open(wtocharpath(filename.c_str()).c_str(), mode);
#endif
    stream.exceptions(std::ios_base::failbit | std::ios_base::badbit);
}
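The serialization tests below combine these helpers into a write-then-read round trip. A minimal sketch under the same assumptions (the function name and file name here are placeholders, not part of the commit):

#include <cassert>
#include <fstream>
#include "CNTKLibrary.h"
#include "Common.h"

// Write a Dictionary to a temporary file via OpenStream, read it back,
// and check the round trip using the Dictionary::operator== added above.
inline void RoundTripDictionary(const CNTK::Dictionary& dict)
{
    std::fstream out;
    OpenStream(out, L"roundtrip.tmp", false); // open for writing
    out << dict;
    out.close();

    CNTK::Dictionary restored;
    std::fstream in;
    OpenStream(in, L"roundtrip.tmp", true);   // open read-only
    in >> restored;

    assert(dict == restored);
}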
@ -0,0 +1,185 @@
#include "CNTKLibrary.h"
#include "Common.h"
#include <string>
#include <random>
#include <initializer_list>


using namespace CNTK;
using namespace std;

static const size_t maxMinibatchSize = 1000;

static const size_t maxNumAxes = 5;
static const size_t maxDimSize = 10;

template <typename ElementType>
void TestUpdate(LearnerPtr& learner, NDShape& shape, size_t numMinibatches, const DeviceDescriptor& device)
{
    auto seed = (unsigned long) rng();
    unordered_map<Parameter, NDArrayViewPtr> gradientValues;
    for (auto i = 0; i < numMinibatches; i++)
    {
        for (auto& parameter : learner->Parameters())
        {
            gradientValues[parameter] = NDArrayView::RandomUniform<ElementType>(shape, -1.0, 1.0, seed + i, device);
        }

        learner->Update(gradientValues, 1);
    }
}

template <typename ElementType>
unordered_set<Parameter> CreateParameters(const NDShape& shape, size_t numParameters, const DeviceDescriptor& device)
{
    unordered_set<Parameter> parameters;
    for (int i = 0; i < numParameters; i++)
    {
        parameters.insert(
            Parameter(NDArrayView::RandomUniform<ElementType>(shape, -1.0, 1.0, i, device),
                      L"parameter_" + to_wstring(i)));
    }
    return parameters;
}

template <typename ElementType>
void TestSGDLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = SGDLearner(parameters, 0.4);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestMomentumSGDLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    MomentumsPerSample momentums({ { 1, 1.0 }, { 3, 0.1 }, { 10, 0.01 } }, 2);
    auto learner = MomentumSGDLearner(parameters, vector<double>{0.3, 0.2, 0.1}, momentums);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestNesterovLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = NesterovLearner(parameters, LearningRatesPerSample({ { 1, 0.5 }, { 10, 0.25 }, { 20, 0.125 } }, 3), 0.2);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestAdaGradLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = AdaGradLearner(parameters, { vector<double>{0.5, 0.4, 0.3, 0.2, 0.1}, 2 }, true);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestFSAdaGradLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = FSAdaGradLearner(parameters, vector<double>{ 0.5 }, vector<double>{0.05});
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestRMSPropLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = RMSPropLearner(parameters, { { 3, 0.7 }, { 1, 0.2 } }, 0.01, 0.02, 0.03, 0.1, 0.001);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}
void TestTrainingParametersSchedule()
{
    LearningRatesPerSample schedule1 = 0.5;
    assert(schedule1[0] == 0.5);
    assert(schedule1[1] == 0.5);
    assert(schedule1[100] == 0.5);

    LearningRatesPerSample schedule2 = vector<double>{ 0.5 };
    assert(schedule2[0] == 0.5);
    assert(schedule2[10] == 0.5);
    assert(schedule2[100] == 0.5);

    LearningRatesPerSample schedule3 = vector<double>{ 0.5, 0.3 };
    assert(schedule3[0] == 0.5);
    assert(schedule3[1] == 0.3);
    assert(schedule3[100] == 0.3);

    LearningRatesPerSample schedule4 = { vector<double>{ 0.5 }, 10 };
    assert(schedule4[0] == 0.5);
    assert(schedule4[10] == 0.5);
    assert(schedule4[100] == 0.5);

    LearningRatesPerSample schedule5 = { vector<double>{ 0.5, 0.3, 0.2 }, 10 };
    assert(schedule5[0] == 0.5);
    assert(schedule5[9] == 0.5);
    assert(schedule5[10] == 0.3);
    assert(schedule5[19] == 0.3);
    assert(schedule5[20] == 0.2);
    assert(schedule5[100] == 0.2);

    LearningRatesPerSample schedule6 = { {1, 0.5} };
    assert(schedule6[0] == 0.5);
    assert(schedule6[10] == 0.5);
    assert(schedule6[100] == 0.5);

    LearningRatesPerSample schedule7 = { { 1, 0.5 }, { 1, 0.3 }, {1, 0.2} };
    assert(schedule7[0] == 0.5);
    assert(schedule7[1] == 0.3);
    assert(schedule7[2] == 0.2);
    assert(schedule7[100] == 0.2);

    LearningRatesPerSample schedule8({ { 1, 0.5 }, { 1, 0.3 }, { 1, 0.2 } }, 10);
    assert(schedule8[0] == 0.5);
    assert(schedule8[9] == 0.5);
    assert(schedule8[10] == 0.3);
    assert(schedule8[19] == 0.3);
    assert(schedule8[20] == 0.2);
    assert(schedule8[100] == 0.2);

    LearningRatesPerSample schedule9 = { { 3, 0.5 }, { 2, 0.3 }, {1, 0.2} };
    assert(schedule9[0] == 0.5);
    assert(schedule9[2] == 0.5);
    assert(schedule9[3] == 0.3);
    assert(schedule9[4] == 0.3);
    assert(schedule9[5] == 0.2);
    assert(schedule9[100] == 0.2);

    LearningRatesPerSample schedule10({ { 3, 0.5 }, { 2, 0.3 }, { 1, 0.2 } }, 10);
    assert(schedule10[0] == 0.5);
    assert(schedule10[29] == 0.5);
    assert(schedule10[30] == 0.3);
    assert(schedule10[49] == 0.3);
    assert(schedule10[50] == 0.2);
    assert(schedule10[100] == 0.2);
}
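Reading the asserts as a specification (inferred from the tests rather than stated in the diff):

// A schedule { {n1, v1}, {n2, v2}, ... } with unit size u applies v1 to the
// first n1*u samples, v2 to the next n2*u samples, and the last value to every
// sample after the listed spans. schedule10, for instance, expands to the
// cumulative bounds 30 -> 0.5 and 50 -> 0.3, with 0.2 from sample 50 onward.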

void LearnerTests()
{
    TestTrainingParametersSchedule();

    TestSGDLearner<double>(5, 3, DeviceDescriptor::CPUDevice());

#ifndef CPUONLY
    TestMomentumSGDLearner<float>(3, 11, DeviceDescriptor::GPUDevice(0));
    TestNesterovLearner<float>(1, 20, DeviceDescriptor::GPUDevice(0));
#else
    TestMomentumSGDLearner<float>(3, 11, DeviceDescriptor::CPUDevice());
    TestNesterovLearner<float>(1, 20, DeviceDescriptor::CPUDevice());
#endif

    TestAdaGradLearner<double>(2, 10, DeviceDescriptor::CPUDevice());

    TestFSAdaGradLearner<double>(10, 2, DeviceDescriptor::CPUDevice());
    TestRMSPropLearner<float>(3, 3, DeviceDescriptor::CPUDevice());
}
@ -9,6 +9,8 @@ void FeedForwardTests();
void RecurrentFunctionTests();
void TrainerTests();
void TestCifarResnet();
void SerializationTests();
void LearnerTests();

int main()
{

@ -19,6 +21,8 @@ int main()
    RecurrentFunctionTests();

    TrainerTests();
    SerializationTests();
    LearnerTests();

    TestCifarResnet();
@ -0,0 +1,224 @@
#include "CNTKLibrary.h"
#include "Common.h"
#include <string>
#include <random>
#include <vector>


using namespace CNTK;
using namespace std;

using namespace Microsoft::MSR::CNTK;

static const size_t maxNestingDepth = 10;
static const size_t maxNestedDictSize = 10;
static const size_t maxNestedVectorSize = 100;
static const size_t maxNDShapeSize = 100;

static const size_t maxNumAxes = 10;
static const size_t maxDimSize = 15;


static size_t keyCounter = 0;
static uniform_real_distribution<double> double_dist = uniform_real_distribution<double>();
static uniform_real_distribution<float> float_dist = uniform_real_distribution<float>();

static std::wstring tempFilePath = L"serialization.tmp";

DictionaryValue CreateDictionaryValue(DictionaryValue::Type, size_t);

DictionaryValue::Type GetType()
{
    return DictionaryValue::Type(rng() % (unsigned int) DictionaryValue::Type::NDArrayView + 1);
}

void AddKeyValuePair(Dictionary& dict, size_t depth)
{
    auto type = GetType();
    while (depth >= maxNestingDepth &&
           (type == DictionaryValue::Type::Vector ||
            type == DictionaryValue::Type::Dictionary))
    {
        type = GetType();
    }
    dict[L"key" + to_wstring(keyCounter++)] = CreateDictionaryValue(type, depth);
}
Dictionary CreateDictionary(size_t size, size_t depth = 0)
{
    Dictionary dict;
    for (auto i = 0; i < size; ++i)
    {
        AddKeyValuePair(dict, depth);
    }

    return dict;
}

template <typename ElementType>
NDArrayViewPtr CreateNDArrayView(size_t numAxes, const DeviceDescriptor& device)
{
    NDShape viewShape(numAxes);
    for (size_t i = 0; i < numAxes; ++i)
        viewShape[i] = (rng() % maxDimSize) + 1;

    return NDArrayView::RandomUniform<ElementType>(viewShape, ElementType(-4.0), ElementType(19.0), 1, device);
}

NDArrayViewPtr CreateNDArrayView()
{
    auto numAxes = (rng() % maxNumAxes) + 1;
    auto device = DeviceDescriptor::CPUDevice();
#ifndef CPUONLY
    if (rng() % 2 == 0)
    {
        device = DeviceDescriptor::GPUDevice(0);
    }
#endif

    return (rng() % 2 == 0) ?
        CreateNDArrayView<float>(numAxes, device) : CreateNDArrayView<double>(numAxes, device);
}

DictionaryValue CreateDictionaryValue(DictionaryValue::Type type, size_t depth)
{
    switch (type)
    {
    case DictionaryValue::Type::Bool:
        return DictionaryValue(!!(rng() % 2));
    case DictionaryValue::Type::SizeT:
        return DictionaryValue(rng());
    case DictionaryValue::Type::Float:
        return DictionaryValue(float_dist(rng));
    case DictionaryValue::Type::Double:
        return DictionaryValue(double_dist(rng));
    case DictionaryValue::Type::String:
        return DictionaryValue(to_wstring(rng()));
    case DictionaryValue::Type::NDShape:
    {
        size_t size = rng() % maxNDShapeSize + 1;
        NDShape shape(size);
        for (auto i = 0; i < size; i++)
        {
            shape[i] = rng();
        }
        return DictionaryValue(shape);
    }
    case DictionaryValue::Type::Vector:
    {
        auto type = GetType();
        size_t size = rng() % maxNestedVectorSize + 1;
        vector<DictionaryValue> vector(size);
        for (auto i = 0; i < size; i++)
        {
            vector[i] = CreateDictionaryValue(type, depth + 1);
        }
        return DictionaryValue(vector);
    }
    case DictionaryValue::Type::Dictionary:
        return DictionaryValue(CreateDictionary(rng() % maxNestedDictSize + 1, depth + 1));
    case DictionaryValue::Type::NDArrayView:
        return DictionaryValue(*(CreateNDArrayView()));
    default:
        NOT_IMPLEMENTED;
    }
}
void TestDictionarySerialization(size_t dictSize)
{
    if ((_wunlink(tempFilePath.c_str()) != 0) && (errno != ENOENT))
        throw std::runtime_error("Error deleting temporary test file 'serialization.tmp'.");

    Dictionary originalDict = CreateDictionary(dictSize);

    {
        fstream stream;
        OpenStream(stream, tempFilePath, false);
        stream << originalDict;
        stream.flush();
    }

    Dictionary deserializedDict;

    {
        fstream stream;
        OpenStream(stream, tempFilePath, true);
        stream >> deserializedDict;
    }

    if (originalDict != deserializedDict)
        throw std::runtime_error("TestDictionarySerialization: original and deserialized dictionaries are not identical.");
}
template <typename ElementType>
void TestLearnerSerialization(int numParameters, const DeviceDescriptor& device)
{
    if ((_wunlink(tempFilePath.c_str()) != 0) && (errno != ENOENT))
        throw std::runtime_error("Error deleting temporary test file 'serialization.tmp'.");

    NDShape shape = CreateShape(5, maxDimSize);

    unordered_set<Parameter> parameters;
    unordered_map<Parameter, NDArrayViewPtr> gradientValues;
    for (int i = 0; i < numParameters; i++)
    {
        Parameter parameter(NDArrayView::RandomUniform<ElementType>(shape, -0.5, 0.5, i, device), L"parameter_" + to_wstring(i));
        parameters.insert(parameter);
        gradientValues[parameter] = NDArrayView::RandomUniform<ElementType>(shape, -0.5, 0.5, numParameters + i, device);
    }

    auto learner1 = SGDLearner(parameters, 0.05);

    learner1->Update(gradientValues, 1);

    {
        auto checkpoint = learner1->GetCheckpointState();
        fstream stream;
        OpenStream(stream, tempFilePath, false);
        stream << checkpoint;
        stream.flush();
    }

    auto learner2 = SGDLearner(parameters, 0.05);

    {
        Dictionary checkpoint;
        fstream stream;
        OpenStream(stream, tempFilePath, true);
        stream >> checkpoint;
        learner2->RestoreFromCheckpoint(checkpoint);
    }

    int i = 0;
    for (auto parameter : parameters)
    {
        gradientValues[parameter] = NDArrayView::RandomUniform<ElementType>(shape, -0.5, 0.5, 2 * numParameters + i, device);
        i++;
    }

    learner1->Update(gradientValues, 1);
    learner2->Update(gradientValues, 1);

    auto checkpoint1 = learner1->GetCheckpointState();
    auto checkpoint2 = learner2->GetCheckpointState();

    if (checkpoint1 != checkpoint2)
        throw std::runtime_error("TestLearnerSerialization: the original learner and the learner restored from a checkpoint diverge.");
}
void SerializationTests()
{
    TestDictionarySerialization(4);
    TestDictionarySerialization(8);
    TestDictionarySerialization(16);

    TestLearnerSerialization<float>(5, DeviceDescriptor::CPUDevice());
    TestLearnerSerialization<double>(10, DeviceDescriptor::CPUDevice());

#ifndef CPUONLY
    TestLearnerSerialization<float>(5, DeviceDescriptor::GPUDevice(0));
    TestLearnerSerialization<double>(10, DeviceDescriptor::GPUDevice(0));
#endif
}
@ -110,6 +110,8 @@
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="CifarResNet.cpp" />
    <ClCompile Include="LearnerTests.cpp" />
    <ClCompile Include="SerializationTests.cpp" />
    <ClCompile Include="FeedForwardTests.cpp" />
    <ClCompile Include="Main.cpp" />
    <ClCompile Include="NDArrayViewTests.cpp" />

@ -36,6 +36,12 @@
    <ClCompile Include="CifarResNet.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="SerializationTests.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="LearnerTests.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="Common.h">