Merge branch 'master' into jiajia/IRMetricV3

This commit is contained in:
Jian Jiao 2016-10-12 13:53:12 -07:00
Parent bf072cfc59 ed4a0ddac7
Commit cc4eab4627
25 changed files with 1094 additions and 549 deletions

4
.gitattributes vendored
View File

@@ -32,6 +32,7 @@ Readme text
*.pl text
*.ps1 text
*.ps text
*.i text
*.sh text eol=lf
build-and-test text eol=lf
@@ -57,6 +58,7 @@ Makefile text
*.vssettings text
*.csproj text
*.props text
*.asax text
*.h text
*.cpp text
@@ -81,6 +83,8 @@ Makefile text
*.log text
Dockerfile. text
# Speech data
mean.363 text
var.363 text

View File

@@ -289,13 +289,13 @@ void OutputFunctionInfo(FunctionPtr func)
auto inputVariables = func->Arguments();
fprintf(stderr, "Function %S: Input Variables (count=%lu)\n", func->Name().c_str(), inputVariables.size());
for_each(inputVariables.begin(), inputVariables.end(), [](const Variable v) {
fprintf(stderr, " name=%S, kind=%d\n", v.Name().c_str(), v.Kind());
fprintf(stderr, " name=%S, kind=%d\n", v.Name().c_str(), static_cast<int>(v.Kind()));
});
auto outputVariables = func->Outputs();
fprintf(stderr, "Function %S: Output Variables (count=%lu)\n", func->Name().c_str(), outputVariables.size());
for_each(outputVariables.begin(), outputVariables.end(), [](const Variable v) {
fprintf(stderr, " name=%S, kind=%d\n", v.Name().c_str(), v.Kind());
fprintf(stderr, " name=%S, kind=%d\n", v.Name().c_str(), static_cast<int>(v.Kind()));
});
}

View File

@@ -355,6 +355,7 @@ COMPUTATION_NETWORK_LIB_SRC =\
$(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkEditing.cpp \
$(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkBuilder.cpp \
$(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkScripting.cpp \
$(SOURCEDIR)/ComputationNetworkLib/TrainingNodes.cpp \
SEQUENCE_TRAINING_LIB_SRC =\
$(SOURCEDIR)/SequenceTrainingLib/latticeforwardbackward.cpp \

View File

@@ -459,7 +459,7 @@ CNTK2 = [
if axis==0 then new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = _AsNodes (labelSequence : outProbVectorSequence) /*plus the function args*/ ]
else [ tag1 = tag; out = Minus (ReduceLogSum (outProbVectorSequence, axis=axis), ReduceSum (labelSequence .* outProbVectorSequence, axis=axis), tag=tag1) ].out
# Classification error along a specific axis: account only for missed labels, i.e.
# strictly check whether at the one “1” location in labels we find a value equal to the max
# strictly check whether at the one '1' location in labels we find a value equal to the max
ClassificationError(labelSequence, outVectorSequence, topN=1, axis=0, tag='') =
if axis==0 then new ComputationNode [ operation = 'ClassificationError' ; inputs = _AsNodes (if topN == 1 then (labelSequence : outVectorSequence) else (labelSequence : outVectorSequence : Constant (topN))) /*plus the function args*/ ]
else if topN != 1 then Fail ("ClassificationError() along a specific axis does not support topN.")
@@ -485,9 +485,26 @@ CNTK2 = [
NotEqual(_, y, tag='') = new ComputationNode [ operation = 'NotEqual' ; inputs = _AsNodes (_ : y) /*plus the function args*/ ]
LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = _AsNodes (_ : y) /*plus the function args*/ ]
// 13. Others
Pass(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _AsNodes (_) /*plus the function args*/ ]
// 13. Others
Pass(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _AsNodes (_) /*plus the function args*/ ]
Identity = Pass
// The value of GetRandomSample(weights /* vector of length nClasses */, numSamples, sampleWithReplacement) randomly draws numSamples samples using the specified sampling weights.
// The result is a sparse matrix with numSamples one-hot vectors as columns.
GetRandomSample(_ ,numSamples, sampleWithReplacement, tag='') = new ComputationNode [
operation = 'RandomSample' ;
sizeOfSampledSet = numSamples;
allowDuplicates = sampleWithReplacement;
inputs = _ /*plus the function args*/ ]
// The value of GetInclusionFrequency(weights /* vector of length nClasses */, numSamples, sampleWithReplacement) has to be seen in conjunction with GetRandomSample(...).
// While GetRandomSample(...) creates a set of samples, GetInclusionFrequency(...) tells how often each class is expected to occur in the sampled sets.
// For sampling with replacement the relation to the sampling weights is trivial, but not for sampling without replacement.
GetInclusionFrequency(_ ,numSamples, sampleWithReplacement, tag='') = new ComputationNode [
operation = 'RandomSampleInclusionFrequency' ;
sizeOfSampledSet = numSamples;
allowDuplicates = sampleWithReplacement;
inputs = _ /*plus the function args*/ ]
]
# Parameter{} can do several forms of initialization.

View File

@@ -72,6 +72,7 @@ namespace CNTK
{
// The CNTK reader implementation requires that, for each deserializer, both the module and the deserializer type be specified.
// This is redundant; V2 API users will just specify the type, from which the module is automatically inferred.
// TODO: This should be done in the same manner for CNTK exe as well.
Dictionary augmentedConfiguration = configuration;
auto& deserializerConfigurations = augmentedConfiguration[L"deserializers"].Value<std::vector<DictionaryValue>>();
for (auto& deserializerConfig : deserializerConfigurations)
@@ -129,11 +130,14 @@ namespace CNTK
typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
m_compositeDataReader.reset(createReaderProc(&config));
std::shared_ptr<Microsoft::MSR::CNTK::Reader> compositeDataReader(createReaderProc(&config));
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
for (auto streamDesc : compositeDataReaderStreamDescs)
m_compositeDataReaderStreamDescs = compositeDataReader->GetStreamDescriptions();
for (auto streamDesc : m_compositeDataReaderStreamDescs)
m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
m_shim = std::shared_ptr<ReaderShim<float>>(new ReaderShim<float>(compositeDataReader), [](ReaderShim<float>* x) { x->Destroy(); });
m_shim->Init(config);
}
/*virtual*/ const std::unordered_map<StreamInformation, MinibatchData>&
@@ -155,59 +159,70 @@ namespace CNTK
{
// TODO: Add support for distributed reading
EpochConfiguration epochConfig = { 1, 0, minibatchSizeInSamples, m_epochSize, 0, 0 };
m_matrices.clear();
std::map<std::wstring, int> requiredStreams;
std::unordered_set<InputStreamDescription> inputs;
for (const auto& s : m_streamInfos)
// Allocating all on CPU for now.
requiredStreams[s.m_name] = CPUDEVICE;
{
assert(s.m_storageFormat == StorageFormat::Dense || s.m_storageFormat == StorageFormat::SparseCSC);
auto inputStreamDescription = InputStreamDescription(
s.m_name,
AsCNTKImplDeviceId(device),
s.m_storageFormat == StorageFormat::Dense ? MatrixType::DENSE : MatrixType::SPARSE,
s.m_storageFormat == StorageFormat::Dense ? MatrixFormat::matrixFormatDense : MatrixFormat::matrixFormatSparseCSC);
inputs.insert(inputStreamDescription);
m_compositeDataReader->StartEpoch(epochConfig, requiredStreams);
if (s.m_elementType == DataType::Float)
{
auto iter = std::find_if(m_compositeDataReaderStreamDescs.begin(), m_compositeDataReaderStreamDescs.end(), [s](StreamDescriptionPtr& streamInfo) {
return streamInfo->m_id == s.m_id;
});
assert(iter != m_compositeDataReaderStreamDescs.end());
m_matrices.AddInput(
s.m_name,
std::make_shared<Matrix<float>>(0, 0, inputStreamDescription.GetDeviceId(), inputStreamDescription.GetMatrixType(), inputStreamDescription.GetMatrixFormat()),
std::make_shared<MBLayout>(),
*(*iter)->m_sampleLayout);
}
else
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
}
m_shim->StartEpoch(epochConfig, inputs);
m_prevMinibatchSize = minibatchSizeInSamples;
}
if (minibatchSizeInSamples != m_prevMinibatchSize)
LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");
auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
m_epochEndReached = compositeReaderMinibatchData.m_endOfEpoch;
auto compositeReaderMinibatchDataEmpty = m_shim->GetMinibatch(m_matrices);
m_epochEndReached = m_shim->IsEndOfEpoch();
auto& streamInfos = StreamInfos();
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
size_t numStreams = compositeDataReaderStreamDescs.size();
for (size_t i = 0; i < numStreams; ++i)
for (const auto& s: m_streamInfos)
{
auto currentStreamDesc = compositeDataReaderStreamDescs[i];
auto iter = std::find_if(streamInfos.begin(), streamInfos.end(), [currentStreamDesc](const StreamInformation& streamInfo) {
return streamInfo.m_id == currentStreamDesc->m_id;
});
if (iter == streamInfos.end())
continue;
auto& currentStreamInfo = *iter;
auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));
auto input = m_matrices.GetInput(s.m_name);
auto& currentStreamInfo = s;
ValuePtr minibatchValuePtr;
if (compositeReaderMinibatchData.m_data.empty())
if (!compositeReaderMinibatchDataEmpty)
{
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, s.m_sampleLayout.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
continue;
}
auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
if (currentStreamDesc->m_elementType == ElementType::tfloat)
if (s.m_elementType == DataType::Float)
{
auto CNTKMatrixType = (currentStreamDesc->m_storageType == StorageType::dense) ? DENSE : SPARSE;
auto CNTKMatrixFormat = (currentStreamDesc->m_storageType == StorageType::dense) ? matrixFormatDense : matrixFormatSparseCSC;
auto dataMatrix = std::make_shared<Matrix<float>>(0, 0, CPUDEVICE, CNTKMatrixType, CNTKMatrixFormat);
size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();
auto matrixType = (s.m_storageFormat == StorageFormat::Dense) ? DENSE : SPARSE;
auto matrixFormat = (s.m_storageFormat == StorageFormat::Dense) ? matrixFormatDense : matrixFormatSparseCSC;
// Can we reuse this, not allocating it each time?
auto dataMatrix = std::make_shared<Matrix<float>>(0, 0, input.GetMatrix<float>().GetDeviceId(), matrixType, matrixFormat);
// TODO: Eliminate the unnecessary CPU to CPU copy
ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData, nullptr);
minibatchValuePtr = MakeSharedObject<PackedValue>(sampleShape, dataMatrix, currentStreamMinibatchData->m_layout, /*readOnly =*/ false);
std::swap(*dataMatrix, input.GetMatrix<float>());
minibatchValuePtr = MakeSharedObject<PackedValue>(s.m_sampleLayout, dataMatrix, input.pMBLayout, /*readOnly =*/ false);
size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();
size_t numSamples = input.pMBLayout->GetActualNumSamples();
size_t numSequences = input.pMBLayout->GetNumSequences();
m_minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
}
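The std::swap(*dataMatrix, input.GetMatrix<float>()) call above hands the prefetched buffer over to the minibatch value without copying its elements. A minimal sketch of the same handoff, assuming a hypothetical DenseBuffer type in place of CNTK's Matrix<float>:

#include <utility>
#include <vector>

// Hypothetical stand-in for Matrix<float>; only what the handoff needs.
struct DenseBuffer
{
    std::vector<float> data;
};

int main()
{
    DenseBuffer prefetched;             // filled by the reader shim
    prefetched.data.assign(1024, 1.0f);

    DenseBuffer minibatch;              // empty target owned by the minibatch value

    // std::swap exchanges the underlying storage via moves instead of copying
    // the 1024 floats; afterwards 'minibatch' owns the prefetched data and
    // 'prefetched' is empty, ready to receive the next read.
    std::swap(prefetched, minibatch);
    return minibatch.data.size() == 1024 ? 0 : 1;
}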

View File

@@ -9,6 +9,7 @@
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Reader.h"
#include "ReaderShim.h"
namespace CNTK
{
@@ -25,10 +26,17 @@ namespace CNTK
private:
std::unordered_set<StreamInformation> m_streamInfos;
std::shared_ptr<Microsoft::MSR::CNTK::Reader> m_compositeDataReader;
bool m_epochEndReached;
size_t m_prevMinibatchSize;
size_t m_epochSize;
std::unordered_map<StreamInformation, MinibatchData> m_minibatchData;
std::vector<Microsoft::MSR::CNTK::StreamDescriptionPtr> m_compositeDataReaderStreamDescs;
// For now we reuse the shim to allow prefetching.
// Please use only the subset of the shim interface consisting of
// Init()/StartEpoch()/GetMinibatch()/IsEndOfEpoch().
// The shim will be removed in future versions.
std::shared_ptr<Microsoft::MSR::CNTK::ReaderShim<float>> m_shim;
Microsoft::MSR::CNTK::StreamMinibatchInputs m_matrices;
};
}

View File

@@ -9,6 +9,7 @@
#include "Platform.h"
#include "ExceptionWithCallStack.h"
#include <cmath>
#include <string>
#include <vector>
#include <assert.h>
@@ -25,7 +26,7 @@
#define TWO_PI 6.283185307f // TODO: find the official standards-confirming definition of this and use it instead
#define EPSILON 1e-5
#define ISCLOSE(a, b, threshold) (abs(a - b) < threshold) ? true : false
#define ISCLOSE(a, b, threshold) (std::abs(a - b) < threshold) ? true : false
#define DLCLOSE_SUCCESS 0
#define UNUSED(x) (void)(x) // for variables that are, e.g., only used in _DEBUG builds
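The switch to std::abs above matters because the C library's ::abs takes an int: with a plain abs(a - b) on floating-point arguments, the difference can be silently truncated to an integer, depending on which headers and using-declarations are in scope. A minimal sketch of the fixed macro in use:

#include <cmath>
#include <cstdio>

// Same shape as the macro above; std::abs always selects the floating-point overload.
#define ISCLOSE(a, b, threshold) (std::abs((a) - (b)) < (threshold))

int main()
{
    // 0.1 + 0.2 != 0.3 exactly in binary floating point, but it is "close".
    std::printf("%d\n", ISCLOSE(0.1 + 0.2, 0.3, 1e-5) ? 1 : 0); // prints 1
    return 0;
}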

View File

@@ -524,7 +524,7 @@ void ComputationNetwork::CollectInputAndLearnableParametersRec(const Computation
}
template <class ElemType>
/*static*/ void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase)
/*static*/ void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate)
{
list<ComputationNodeBasePtr> dropoutNodes = net->GetNodesWithType(OperationNameOf(DropoutNode), criterionNode);
if (dropoutRate != prevDropoutRate)
@@ -535,21 +535,35 @@ template <class ElemType>
fprintf(stderr, "WARNING: Attempting to set dropout rate, but there is no dropout node in the network.\n");
}
// Each dropout node gets a distinct seed. The actual seed for each dropout node is computed as follows:
// seed = (((parallelWorkerIdx * maxEpochs) + currentEpochNum) /*i.e. randSeedBase*/ * dropoutNodes.size()) + dropoutNodeIdx
size_t randSeed = randSeedBase * dropoutNodes.size();
for (auto& nodeIter : dropoutNodes)
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(nodeIter);
if (dropoutRate != prevDropoutRate)
node->SetDropoutRate(dropoutRate);
node->SetRandomSeed(randSeed);
randSeed++;
}
prevDropoutRate = dropoutRate;
}
template <class ElemType>
/* static */ void ComputationNetwork::SetIRngUserSeed(ComputationNetworkPtr net, const ComputationNodeBasePtr& node, size_t randSeedBase)
{
// Predicate checking if the node is derived from IRngUser
function<bool(const ComputationNodeBasePtr&)> nodeIsIRngUser = [](const ComputationNodeBasePtr& p) { return dynamic_pointer_cast<IRngUser>(p) != nullptr; };
list<ComputationNodeBasePtr> rngUserNodes = net->GetNodesWhere(nodeIsIRngUser, node);
// Each IRngUser gets a distinct seed. This seed is computed as follows:
// seed = (((parallelWorkerIdx * maxEpochs) + currentEpochNum) /*i.e. randSeedBase*/ * rngUserNodes.size()) + rngUserNodeIdx.
size_t randSeed = randSeedBase * rngUserNodes.size();
for (auto& nodeIter : rngUserNodes)
{
auto rngUser = dynamic_pointer_cast<IRngUser>(nodeIter);
rngUser->SetRandomSeed(randSeed);
randSeed++;
}
}
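The seed layout above guarantees that no two RNG-using nodes, epochs, or parallel workers share a seed, provided each worker has a distinct parallelWorkerIdx. A small sketch of the arithmetic, with made-up values for illustration:

#include <cstddef>
#include <cstdio>

int main()
{
    // Hypothetical values: worker 1 in a job with 10 epochs, currently in epoch 3,
    // training a network that contains 2 RNG-using nodes.
    const size_t parallelWorkerIdx = 1, maxEpochs = 10, currentEpochNum = 3, numRngNodes = 2;

    const size_t randSeedBase = (parallelWorkerIdx * maxEpochs) + currentEpochNum; // 13
    size_t randSeed = randSeedBase * numRngNodes;                                  // 26

    // Each node gets the next consecutive seed: 26, 27.
    for (size_t nodeIdx = 0; nodeIdx < numRngNodes; nodeIdx++)
        std::printf("node %zu -> seed %zu\n", nodeIdx, randSeed++);
    return 0;
}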
template <class ElemType>
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
@@ -1490,7 +1504,8 @@ template void ComputationNetwork::InitLearnableParametersWithBilinearFill<float>
template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate);
template /*static*/ void ComputationNetwork::SetIRngUserSeed<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
@@ -1500,7 +1515,8 @@ template void ComputationNetwork::InitLearnableParametersWithBilinearFill<double
template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate);
template /*static*/ void ComputationNetwork::SetIRngUserSeed<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);

View File

@@ -446,8 +446,11 @@ public:
// TODO: Why are all these static, but then take a network as the first argument? --> make them class members
template <class ElemType>
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate);
template <class ElemType>
static void SetIRngUserSeed(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, size_t randSeedBase);
template <class ElemType>
static void SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
@@ -652,18 +655,19 @@ public:
return std::vector<ComputationNodeBasePtr>(outputNodes.begin(), outputNodes.end());
}
std::list<ComputationNodeBasePtr> GetNodesWithType(const wstring typeName, const ComputationNodeBasePtr& rootNode = nullptr)
std::list<ComputationNodeBasePtr> GetNodesWhere(std::function<bool(const ComputationNodeBasePtr&)>& predicate, const ComputationNodeBasePtr& rootNode = nullptr) const
{
std::list<ComputationNodeBasePtr> nodesWithType;
std::list<ComputationNodeBasePtr> filteredNodes;
// find nodes from all available nodes
// TODO: This distinction should not be necessary anymore. Calling GetEvalOrder(nullptr) will have the same effect.
if (rootNode == nullptr)
{
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
{
ComputationNodeBasePtr node = nodeIter->second;
if (node->OperationName() == typeName)
nodesWithType.push_back(node);
if (predicate(node))
filteredNodes.push_back(node);
}
}
else
@@ -671,12 +675,18 @@ public:
// for calculating a specific node
for (const auto& node : GetEvalOrder(rootNode)) // TODO: verify that no use of this requires the actual eval order, then change to GetAllNodesForRoot()
{
if (node->OperationName() == typeName)
nodesWithType.push_back(node);
if (predicate(node))
filteredNodes.push_back(node);
}
}
return nodesWithType;
return filteredNodes;
}
std::list<ComputationNodeBasePtr> GetNodesWithType(const wstring typeName, const ComputationNodeBasePtr& rootNode = nullptr) const
{
std::function<bool(const ComputationNodeBasePtr&)> predicate = [typeName](const ComputationNodeBasePtr& node) { return node->OperationName() == typeName; };
return GetNodesWhere(predicate, rootNode);
}
public:

View File

@@ -91,6 +91,8 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleNode)) return New<RandomSampleNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleInclusionFrequencyNode)) return New<RandomSampleInclusionFrequencyNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReconcileDynamicAxisNode)) return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReciprocalNode)) return New<ReciprocalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
@@ -812,6 +814,18 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowSt
return net.AddNodeToNetAndAttachInputs(New<RowStackNode<ElemType>>(net.GetDeviceId(), nodeName), { inputs });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RandomSample(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RandomSampleNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RandomSampleInclusionFrequency(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RandomSampleInclusionFrequencyNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
#ifdef COMING_SOON
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::GMMLogLikelihood(const ComputationNodePtr unnormedPrior,

View File

@@ -157,6 +157,8 @@ public:
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
ComputationNodePtr Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Reciprocal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr RandomSample(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr RandomSampleInclusionFrequency(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr RectifiedLinear(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const TensorShape& imageLayout, const std::wstring nodeName = L"");
ComputationNodePtr RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName = L"");

View File

@@ -127,6 +127,7 @@
<ClCompile Include="RNNNodes.cpp" />
<ClCompile Include="SpecialPurposeNodes.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="TrainingNodes.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />

View File

@@ -46,6 +46,9 @@
<ClCompile Include="RecurrentNodes.cpp">
<Filter>Nodes</Filter>
</ClCompile>
<ClCompile Include="TrainingNodes.cpp">
<Filter>Nodes</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">

View File

@@ -0,0 +1,225 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "TrainingNodes.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void RandomSampleNodeBase<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<RandomSampleNodeBase<ElemType>>(nodeP);
node->m_allowDuplicates = m_allowDuplicates;
node->m_sizeOfSampledSet = m_sizeOfSampledSet;
node->m_randomSeed = m_randomSeed;
}
}
template<class ElemType>
void RandomSampleNodeBase<ElemType>::Save(File& fstream) const
{
Base::Save(fstream);
fstream << m_allowDuplicates;
fstream << m_sizeOfSampledSet;
}
template<class ElemType>
void RandomSampleNodeBase<ElemType>::Load(File& fstream, size_t modelVersion)
{
Base::Load(fstream, modelVersion);
fstream >> m_allowDuplicates;
fstream >> m_sizeOfSampledSet;
}
template<class ElemType>
void RandomSampleNodeBase<ElemType>::UpdateWeightsPrefixSum()
{
const Matrix<ElemType>& samplingWeights = Input(0)->ValueAsMatrix();
m_samplingWeightsPrefixSum.clear();
double runningWeightsSum = 0;
for (int iClass = 0; iClass < samplingWeights.GetNumRows(); iClass++)
{
ElemType currentWeight = samplingWeights.GetValue(iClass, 0);
runningWeightsSum += currentWeight;
m_samplingWeightsPrefixSum.push_back(runningWeightsSum);
}
}
// Runs the sampling, returning a vector with the ids of the samples. The parameter nTries is used to return the number of draws that were needed
// to get the expected number of samples.
template<class ElemType>
const std::vector<size_t> RandomSampleNodeBase<ElemType>::RunSampling(long& nTries)
{
std::uniform_real_distribution<double> r(0, m_samplingWeightsPrefixSum.back());
std::unordered_set<int> alreadySampled;
std::vector<size_t> samples;
CPURNGHandle* cpuRNGHandle = dynamic_cast<CPURNGHandle*>(&GetRNGHandle(CPUDEVICE));
// find random samples using the specified weight
if (m_allowDuplicates)
nTries = m_sizeOfSampledSet;
else
nTries = 0; // just initialize and count how many tries we need.
while (samples.size() < m_sizeOfSampledSet)
{
double randomValue = r(cpuRNGHandle->Generator());
// Find the first index where value[idx] >= randomValue.
auto lower = std::lower_bound(m_samplingWeightsPrefixSum.begin(), m_samplingWeightsPrefixSum.end(), randomValue);
int idx = (int)(lower - m_samplingWeightsPrefixSum.begin());
if (m_allowDuplicates)
samples.push_back(idx);
else
{
// Sampling without replacement: each value can be sampled at most once.
// The implementation below using rejection sampling is problematic.
// E.g if first class has probability p = 0.999 we typically will have to sample 1000 times or more to hit another class.
// BUGBUG: Alternative implementations, e.g.:
// * Weighted Random Sampling with Reservoir: http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf
// * Binary tree with classes as leaves and branch probabilities on non-leaves.
// * As in numpy: https://github.com/numpy/numpy/blob/master/numpy/random/mtrand/mtrand.pyx#L1440
nTries++;
if (alreadySampled.find(idx) != alreadySampled.end()) continue;
else
{
samples.push_back(idx);
alreadySampled.insert(idx);
}
}
}
return samples;
}
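RunSampling above inverts the prefix sum of the weights to draw from the categorical distribution: a uniform value in [0, totalWeight) is mapped to a class index via std::lower_bound, and sampling without replacement simply rejects repeated indices. A self-contained sketch of the same technique, independent of the CNTK node classes:

#include <algorithm>
#include <cstdio>
#include <random>
#include <unordered_set>
#include <vector>

// Draws 'count' class indices with probability proportional to 'weights'.
// With allowDuplicates == false, repeated draws are rejected until 'count'
// distinct classes are collected, mirroring the rejection scheme above
// (so 'count' must not exceed the number of classes in that mode).
std::vector<size_t> WeightedSample(const std::vector<double>& weights, size_t count,
                                   bool allowDuplicates, std::mt19937& rng)
{
    std::vector<double> prefixSum; // prefixSum[i] = w_0 + ... + w_i
    double running = 0;
    for (double w : weights)
        prefixSum.push_back(running += w);

    std::uniform_real_distribution<double> r(0, prefixSum.back());
    std::unordered_set<size_t> alreadySampled;
    std::vector<size_t> samples;
    while (samples.size() < count)
    {
        // Find the first index whose prefix sum is >= the random value.
        auto lower = std::lower_bound(prefixSum.begin(), prefixSum.end(), r(rng));
        size_t idx = static_cast<size_t>(lower - prefixSum.begin());
        if (allowDuplicates || alreadySampled.insert(idx).second)
            samples.push_back(idx);
    }
    return samples;
}

int main()
{
    std::mt19937 rng(42);
    for (size_t idx : WeightedSample({ 0.5, 0.3, 0.2 }, 2, /*allowDuplicates =*/ false, rng))
        std::printf("%zu\n", idx);
    return 0;
}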
template<class ElemType>
void RandomSampleNode<ElemType>::ForwardPropNonLooping()
{
Base::UpdateWeightsPrefixSum();
Matrix<ElemType>& valueMatrix = ValueAsMatrix();
valueMatrix.TransferToDeviceIfNotThere(CPUDEVICE, /*ismoved =*/ true/*means: BOTH state not ok */, /*emptyTransfer =*/ true, /*updatePreferredDevice =*/ false);
valueMatrix.SetDevice(CPUDEVICE);
//BUGBUG: matrix type should be configured during validation
valueMatrix.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC, false);
valueMatrix.Reset();
// Get vector with indices of randomly sampled classes
const std::vector<size_t> samples = GetWeightedSamples();
// Set columns of (sparse) result matrix as indicator vectors
for (size_t i = 0; i < Base::m_sizeOfSampledSet; i++)
{
int sample = samples[i];
valueMatrix.SetValue(sample, i, 1);
}
}
template<class ElemType>
const std::vector<size_t> RandomSampleNode<ElemType>::GetWeightedSamples()
{
long dummy;
// Here we are not interested in the number of sampling tries needed, which is returned in the parameter.
return Base::RunSampling(dummy);
}
template<class ElemType>
void RandomSampleNode<ElemType>::Validate(bool isFinalValidationPass)
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr;
let& shape = Input(0)->GetSampleLayout();
let dims = shape.GetDims();
size_t nClasses = dims[0];
// Output: a (sparse) matrix containing m_sizeOfSampledSet columns of 1-hot vectors specifying the sampled classes.
SetDims(TensorShape(nClasses, Base::m_sizeOfSampledSet), false);
}
template<class ElemType>
bool RandomSampleNode<ElemType>::IsOutOfDateWrtInputs() const
{
// If we are in the mode to generate random samples (i.e. m_estimateInSampleFrequency == false)
// we need to recompute the result for each mini-batch even if the weight vector didn't change.
return true;
}
template<class ElemType>
double RandomSampleInclusionFrequencyNode<ElemType>::EstimateNumberOfTries()
{
// We estimate the average number of tries by repeating a fixed number of experiments.
const size_t numExperiments = 10; // We choose 10 without any deep justification.
long totalTries = 0;
for (int iExperiment = 0; iExperiment < numExperiments; iExperiment++)
{
long nTries;
Base::RunSampling(nTries);
totalTries += nTries;
}
return totalTries / (double)numExperiments;
}
// Estimates the expected number of occurrences of each class in the sampled set.
// For sampling without replacement we estimate using the average number of tries (inspired by TensorFlow).
// BUGBUG: Consider reimplementing using a less biased estimate as proposed by Nikos.
template<class ElemType>
double RandomSampleInclusionFrequencyNode<ElemType>::EstimateInSampleFrequency(double p, double estimatedNumTries) const
{
if (Base::m_allowDuplicates)
{
return p * Base::m_sizeOfSampledSet;
}
else /* No duplicates allowed. The estimated count equals the probability of inclusion. */
{
return -expm1(estimatedNumTries * log1p(-p));
}
}
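The -expm1(estimatedNumTries * log1p(-p)) expression above is a numerically stable form of the inclusion probability. A short derivation, under the same simplifying assumption the node makes, namely that each of the n = estimatedNumTries draws independently picks the class with probability p:

$$
P(\text{class in sampled set}) = 1 - (1-p)^{n} = 1 - e^{\,n \ln(1-p)} = -\operatorname{expm1}\bigl(n \cdot \operatorname{log1p}(-p)\bigr).
$$

For small p, log1p(-p) avoids the cancellation in forming 1 - p, and expm1 avoids the cancellation in 1 - e^x when the exponent is near zero.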
template<class ElemType>
void RandomSampleInclusionFrequencyNode<ElemType>::ForwardPropNonLooping()
{
Base::UpdateWeightsPrefixSum();
Matrix<ElemType>& valueMatrix = ValueAsMatrix();
valueMatrix.TransferToDeviceIfNotThere(CPUDEVICE, /*ismoved =*/ true/*means: BOTH state not ok */, /*emptyTransfer =*/ true, /*updatePreferredDevice =*/ false);
valueMatrix.SetDevice(CPUDEVICE);
//BUGBUG: matrix type should be configured during validation
valueMatrix.SwitchToMatrixType(DENSE, matrixFormatDense, false);
double sumOfWeights = Base::m_samplingWeightsPrefixSum.back();
const Matrix<ElemType>& samplingWeights = Input(0)->ValueAsMatrix();
double estimatedNumTries = EstimateNumberOfTries();
for (int i = 0; i < Base::m_samplingWeightsPrefixSum.size(); i++)
{
// Get the sampling probability from the weights for the i-th class.
double samplingProb = samplingWeights.GetValue(i, 0) / sumOfWeights;
double estimatedCount = EstimateInSampleFrequency(samplingProb, estimatedNumTries);
valueMatrix.SetValue(i, 0, (ElemType)estimatedCount);
}
}
template<class ElemType>
void RandomSampleInclusionFrequencyNode<ElemType>::Validate(bool isFinalValidationPass)
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr;
let& shape = Input(0)->GetSampleLayout();
let dims = shape.GetDims();
size_t nClasses = dims[0];
// Output: one vector containing the estimated in-sample frequency for each class.
SetDims(TensorShape(nClasses, 1), false);
}
template class RandomSampleNode<float>;
template class RandomSampleNode<double>;
template class RandomSampleInclusionFrequencyNode<float>;
template class RandomSampleInclusionFrequencyNode<double>;
}}}

View File

@@ -8,6 +8,8 @@
#include "ComputationNode.h"
#include "BatchNormalizationEngine.h"
#include "RNGHandle.h"
#include "CPURNGHandle.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
@@ -17,6 +19,7 @@
#include <stdexcept>
#include <list>
#include <memory>
#include <random>
namespace Microsoft { namespace MSR { namespace CNTK {
@@ -1141,6 +1144,179 @@ private:
template class NoiseContrastiveEstimationNode<float>;
template class NoiseContrastiveEstimationNode<double>;
// Nodes using random number generators should derive from this interface.
// One purpose of this interface is to provide a common way of setting the seeds when setting up a network.
class IRngUser
{
public:
virtual RNGHandle& GetRNGHandle(DEVICEID_TYPE deviceId) = 0;
virtual void SetRandomSeed(const unsigned long val) = 0;
};
// This implements IRngUser using RNGHandle.
class RngUser : public IRngUser
{
public:
RNGHandle& GetRNGHandle(DEVICEID_TYPE deviceId) override
{
if (!m_RNGHandle)
m_RNGHandle = RNGHandle::Create(deviceId, m_randomSeed);
return *m_RNGHandle;
}
// E.g. called from ComputationNetwork to make sure that CNTK instances running on different nodes will have different seeds.
void SetRandomSeed(const unsigned long val) override
{
m_randomSeed = (unsigned long)val;
m_RNGHandle.reset(); // Reset handle. New handle will be generated with next call of GetRNGHandle(...).
}
protected:
unsigned long m_randomSeed = 0;
std::shared_ptr<RNGHandle> m_RNGHandle;
};
// ------------------------------------------------------------------------------------------------------------------------------------------------
// RandomSampleNodeBase(samplingWeights, sizeOfSampledSet, allowDuplicates):
// Base class for RandomSampleNode and RandomSampleInclusionFrequencyNode.
// Provides random sampling functionality.
//
// Parameters:
// * Input(0) Sampling weight vector: Matrix of shape (nClasses x 1) providing sampling weights >= 0.
// * sizeOfSampledSet: Size of the sampled set.
// * allowDuplicates: controls if sampled set is allowed to contain duplicates.
// --------------------------------------------------------------------------------------------------------------------------------------------------
template <class ElemType>
class RandomSampleNodeBase : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>, public RngUser
{
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName(){return L"RandomSampleNodeBase";}
public:
RandomSampleNodeBase(DEVICEID_TYPE deviceId, const wstring& name, int sizeOfSampledSet = 0, bool allowDuplicates = false)
: Base(deviceId, name), m_sizeOfSampledSet(sizeOfSampledSet), m_allowDuplicates(allowDuplicates)
{
SetRandomSeed((unsigned long)CreateUniqId());
}
RandomSampleNodeBase(const ScriptableObjects::IConfigRecordPtr configp)
: RandomSampleNodeBase(CPUDEVICE, L"<placeholder>", configp->Get(L"sizeOfSampledSet"), configp->Get(L"allowDuplicates"))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override;
void Save(File& fstream) const;
virtual void Load(File& fstream, size_t modelVersion) override;
protected:
void UpdateWeightsPrefixSum();
// Runs the sampling, returning a vector with the ids of the samples. The parameter nTries is used to return the number of draws that were needed
// to get the expected number of samples.
const std::vector<size_t> RunSampling(long& nTries);
public:
virtual void /*ComputationNode::*/ BackpropToNonLooping(size_t inputIndex) override {
// This node does not propagate gradients.
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override{}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false;}
virtual void /*ComputationNode::*/ ForwardPropNonLooping() override{}
protected:
bool m_allowDuplicates; // The node can create samples allowing for duplicates (sampling with replacement) or not (sampling without replacement).
int m_sizeOfSampledSet; // Requested size of sample in case of run-mode = CREATE_SAMPLES.
std::vector<double> m_samplingWeightsPrefixSum;
};
// ------------------------------------------------------------------------------------------------------------------------------------------------
// RandomSampleNode(samplingWeights, sizeOfSampledSet, allowDuplicates):
// The node's value is a set of sizeOfSampledSet random samples represented as a (sparse) matrix of shape [nClasses x sizeOfSampledSet] where nClasses is the number of classes (categories) to choose from.
// The output has no dynamic axis.
// The samples are drawn according to the weight vector p(w_i) = w_i / sum_k(w_k)
// We get one set of samples per minibatch.
// Intended uses are e.g. sampled softmax, noise contrastive estimation, etc.
//
// Parameters:
// * Input(0): Sampling weight vector. Matrix of shape (nClasses x 1) providing sampling weights >= 0.
// * sizeOfSampledSet: Size of the sampled set.
// * allowDuplicates: controls if sampled set is allowed to contain duplicates.
// --------------------------------------------------------------------------------------------------------------------------------------------------
template<class ElemType>
class RandomSampleNode : public RandomSampleNodeBase<ElemType>
{
typedef RandomSampleNodeBase<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName(){ return L"RandomSample"; }
public:
RandomSampleNode(DEVICEID_TYPE deviceId, const wstring& name, int sizeOfSampledSet = 0, bool allowDuplicates = false)
: Base(deviceId, name, sizeOfSampledSet, allowDuplicates)
{}
RandomSampleNode(const ScriptableObjects::IConfigRecordPtr configp)
: RandomSampleNode(CPUDEVICE, L"<placeholder>", configp->Get(L"sizeOfSampledSet"), configp->Get(L"allowDuplicates"))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
virtual void /*ComputationNode::*/ ForwardPropNonLooping() override;
const std::vector<size_t> GetWeightedSamples();
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override;
virtual bool IsOutOfDateWrtInputs() const override;
};
// ------------------------------------------------------------------------------------------------------------------------------------------------
// RandomSampleInclusionFrequencyNode(samplingWeights, sizeOfSampledSet, allowDuplicates):
// Intended uses are e.g. sampled softmax, noise contrastive estimation, etc., where it is used together with RandomSampleNode.
// This node estimates how often each class will occur, on average, in a set sampled with RandomSampleNode(...).
// If the sampling mode 'allowDuplicates = true' is chosen, this is trivial and exact.
// For allowDuplicates = false we get some estimate. The value is updated only when the input weights change.
//
// Parameters:
// * Input(0): Sampling weight vector. Matrix of shape (nClasses x 1) providing sampling weights >= 0.
// * sizeOfSampledSet: Size of the sampled set.
// * allowDuplicates: controls if sampled set is allowed to contain duplicates.
// --------------------------------------------------------------------------------------------------------------------------------------------------
template<class ElemType>
class RandomSampleInclusionFrequencyNode : public RandomSampleNodeBase<ElemType>
{
typedef RandomSampleNodeBase<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName(){ return L"RandomSampleInclusionFrequency"; }
public:
RandomSampleInclusionFrequencyNode(DEVICEID_TYPE deviceId, const wstring& name, int sizeOfSampledSet = 0, bool allowDuplicates = false)
: Base(deviceId, name, sizeOfSampledSet, allowDuplicates)
{}
RandomSampleInclusionFrequencyNode(const ScriptableObjects::IConfigRecordPtr configp)
: RandomSampleInclusionFrequencyNode(CPUDEVICE, L"<placeholder>", configp->Get(L"sizeOfSampledSet"), configp->Get(L"allowDuplicates"))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
virtual void /*ComputationNode::*/ ForwardPropNonLooping() override;
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override;
private:
// Approximates the expected number of occurrences of a class in the sampled set.
// Assuming (falsely) that the number of tries to get a sampled set with the requested number of distinct values is always estimatedNumTries,
// the probability that a specific class is in the sampled set is (1 - (1-p)^estimatedNumTries), where p is the probability of picking the class in one draw.
// The estimate can be quite a bit off but should be better than nothing. Better alternatives?
double EstimateInSampleFrequency(double p, double estimatedNumTries) const;
double EstimateNumberOfTries();
};
// -----------------------------------------------------------------------
// ClassBasedCrossEntropyWithSoftmaxNode (labeldata(.,t), inputdata(.,t), embeddingMatrix, clsProbBeforeSoftmaxData(.,t))
// - Input(0) [4 x T] label in dense matrix in
@@ -1152,7 +1328,6 @@ template class NoiseContrastiveEstimationNode<double>;
// - Input(2) [hdsize x vocab_size] weight matrix in, for speed-up, as per word matrix can be simply obtained as column slice
// - Input(3) [nbr_cls x T] clsprob in dense matrix in. This input, if applied softmax on, is the posterior probabilty of class given observations
// -----------------------------------------------------------------------
// calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history
// need to provide class probabilty from external node
template <class ElemType>
@@ -1888,7 +2063,7 @@ template class LogisticNode<double>;
// -----------------------------------------------------------------------
template <class ElemType>
class DropoutNode : public ComputationNode<ElemType>, public NumInputs<1>
class DropoutNode : public ComputationNode<ElemType>, public NumInputs<1>, public RngUser
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
@@ -1903,7 +2078,7 @@ public:
: Base(deviceId, name),
m_dropoutRate(0)
{
m_randomSeed = (unsigned long) CreateUniqId();
SetRandomSeed((unsigned long)CreateUniqId());
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
@@ -1960,21 +2135,9 @@ public:
m_dropoutRate = val;
}
void SetRandomSeed(const unsigned long val)
{
m_randomSeed = (unsigned long) val;
// Upon change of the seed, reset RNGHandle to force the creation of a new RNGHandle
// during forward propagation
m_RNGHandle = nullptr;
}
RNGHandle& GetRNGHandle()
{
if (m_RNGHandle == nullptr)
m_RNGHandle = RNGHandle::Create(ValuePtr()->GetDeviceId(), m_randomSeed);
return *m_RNGHandle;
return RngUser::GetRNGHandle(ValuePtr()->GetDeviceId());
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@@ -1984,7 +2147,7 @@ public:
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(nodeP);
node->m_dropoutRate = m_dropoutRate;
node->m_randomSeed = m_randomSeed;
node->SetRandomSeed(m_randomSeed);
node->m_maskOfDropout = m_maskOfDropout;
}
}
@@ -2006,9 +2169,6 @@ public:
private:
double m_dropoutRate;
unsigned long m_randomSeed;
std::shared_ptr<RNGHandle> m_RNGHandle;
shared_ptr<Matrix<ElemType>> m_maskOfDropout;
};

View File

@@ -1247,7 +1247,7 @@ void Matrix<ElemType>::AssignValuesOf(const Matrix<ElemType>& deepCopyFrom)
deepCopyFrom.m_CPUSparseMatrix->AssignColumnSliceToDense(tempCPUDenseMatrix, 0, deepCopyFrom.GetNumCols());
m_GPUMatrix->SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), this->GetDeviceId(), tempCPUDenseMatrix.Data());
},//{ m_GPUMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix); },
{ LogicError("AssignValuesOf: Assigning a GPUSparseMatrix to a GPUMatrix is not yet implemented."); });//{ m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix); });
{ deepCopyFrom.m_GPUSparseMatrix->AssignColumnSliceToDense(*m_GPUMatrix, 0, deepCopyFrom.GetNumCols()); });
},
{
// Set CPUSparseMatrix from:
@@ -4471,12 +4471,25 @@ void Matrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const Matrix<ElemT
{
if (a.GetMatrixType() == MatrixType::SPARSE) // CPU, SPARSE * ANY -> ANY
{
if (b.GetMatrixType() == MatrixType::DENSE && c.GetMatrixType() == MatrixType::DENSE) // CPU, SPARSE * DENSE -> DENSE
if (b.GetMatrixType() == MatrixType::DENSE && c.GetMatrixType() == MatrixType::DENSE) // CPU, SPARSE * DENSE -> DENSE
{
CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_CPUSparseMatrix, transposeA, *b.m_CPUMatrix, transposeB, beta, *c.m_CPUMatrix);
c.SetDataLocation(CPU, DENSE);
}
else{
else if (b.GetMatrixType() == MatrixType::SPARSE && c.GetMatrixType() == MatrixType::DENSE) // CPU, SPARSE * SPARSE -> DENSE
{
NOT_IMPLEMENTED;
}
else if (b.GetMatrixType() == MatrixType::DENSE && c.GetMatrixType() == MatrixType::SPARSE)// CPU, SPARSE * DENSE -> SPARSE
{
NOT_IMPLEMENTED;
}
else if (b.GetMatrixType() == MatrixType::SPARSE && c.GetMatrixType() == MatrixType::SPARSE)// CPU, SPARSE * SPARSE -> SPARSE
{
NOT_IMPLEMENTED;
}
else
{
NOT_IMPLEMENTED;
}
}

View File

@@ -19,7 +19,7 @@ inline size_t GetSizeByType(ElementType type)
case ElementType::tdouble:
return sizeof(double);
default:
RuntimeError("Unsupported type '%d'", type);
RuntimeError("Unsupported type '%d'", static_cast<int>(type));
}
}

View File

@@ -23,7 +23,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
ReaderShim<ElemType>::ReaderShim(ReaderFactory factory)
: m_factory(factory), m_deviceId(CPUDEVICE), m_dataTransferers(2, DataTransfererPtr()), m_currentDataTransferIndex(0)
: m_factory(factory), m_deviceId(CPUDEVICE), m_dataTransferers(2, DataTransfererPtr()), m_currentDataTransferIndex(0), m_endOfEpoch(false)
{
}
template <class ElemType>
ReaderShim<ElemType>::ReaderShim(ReaderPtr reader)
: m_deviceId(CPUDEVICE), m_dataTransferers(2, DataTransfererPtr()), m_currentDataTransferIndex(0), m_reader(reader), m_factory(nullptr), m_endOfEpoch(false)
{
}
@@ -40,7 +46,9 @@ void ReaderShim<ElemType>::Init(const ConfigParameters& config)
m_numParallelSequences = numberOfuttsPerMinibatchForAllEpochs[0];
m_reader = m_factory(config);
if (!m_reader)
m_reader = m_factory(config);
m_streams = m_reader->GetStreamDescriptions();
for (auto i : m_streams)
{
@@ -63,12 +71,6 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
const std::unordered_set<InputStreamDescription>& inputs,
size_t requestedEpochSamples /*= requestDataSize*/)
{
// For adaptive minibatch, make sure there are no outstanding reads.
if (m_prefetchTask.valid())
{
m_prefetchTask.wait();
}
EpochConfiguration config;
config.m_workerRank = subsetNum;
config.m_numberOfWorkers = numSubsets;
@@ -76,6 +78,18 @@
config.m_totalEpochSizeInSamples = requestedEpochSamples;
config.m_epochIndex = epoch;
StartEpoch(config, inputs);
}
template <class ElemType>
void ReaderShim<ElemType>::StartEpoch(const EpochConfiguration& config, const std::unordered_set<InputStreamDescription>& inputs)
{
// For adaptive minibatch, make sure there are no outstanding reads.
if (m_prefetchTask.valid())
{
m_prefetchTask.wait();
}
// Let's check that there are no outstanding copies.
// Wait on all events if there are any pending copy operations in flight.
if (m_dataTransferers[m_currentDataTransferIndex])
@@ -114,7 +128,7 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
m_prefetchBuffers[i.GetStreamName()] = StreamPrefetchBuffer
{
std::make_shared<Matrix<ElemType>>(0, 0, i.GetDeviceId(), i.GetMatrixType(), i.GetMatrixFormat()),
nullptr
std::make_shared<MBLayout>()
};
}
@@ -263,6 +277,10 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
template <class ElemType>
typename ReaderShim<ElemType>::PrefetchResult ReaderShim<ElemType>::PrefetchMinibatch(size_t currentDataTransferIndex)
{
// Resetting layouts.
for (auto& mx : m_prefetchBuffers)
mx.second.m_mbLayout = std::make_shared<MBLayout>();
Minibatch minibatch = m_reader->ReadMinibatch();
// If there is no data we can simply return.

View File

@@ -29,6 +29,8 @@ class ReaderShim : public IDataReader
friend class ::CNTK::CompositeMinibatchSource;
public:
explicit ReaderShim(ReaderFactory factory);
explicit ReaderShim(ReaderPtr reader);
virtual ~ReaderShim() { }
virtual void Init(const ScriptableObjects::IConfigRecord& /*config*/) override
@@ -54,6 +56,8 @@ public:
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, const std::unordered_set<InputStreamDescription>& inputs, size_t requestedEpochSamples = requestDataSize) override;
virtual void StartDistributedMinibatchLoop(size_t requestedMBSize, size_t epoch, size_t subsetNum, size_t numSubsets, const std::unordered_set<InputStreamDescription>& inputs, size_t requestedEpochSamples) override;
void StartEpoch(const EpochConfiguration& epoch, const std::unordered_set<InputStreamDescription>& inputs);
virtual void StartMinibatchLoop(size_t, size_t, size_t) override
{
LogicError("Legacy StartMinibatchLoop is not implemented.");
@@ -84,6 +88,11 @@ public:
virtual size_t GetCurrentSamplePosition() override;
bool IsEndOfEpoch() const
{
return m_endOfEpoch;
}
private:
struct PrefetchResult
{

View File

@@ -6,6 +6,7 @@
#define _CRT_SECURE_NO_WARNINGS
#define _SCL_SECURE_NO_WARNINGS
#include <cmath>
#include <deque>
#include "TruncatedBpttPacker.h"
#include "ElementTypeUtils.h"
@@ -148,7 +149,7 @@ void TruncatedBPTTPacker::StartEpoch(const EpochConfiguration& config, const std
}
// Estimating the number of parallel sequences to pack (slots) from the minibatch size and truncation size.
m_numParallelSequences = max(1, (int)floor(m_minibatchSize / m_truncationSize));
m_numParallelSequences = max(1, static_cast<int>(std::floor(m_minibatchSize / m_truncationSize)));
if (config.m_numberOfWorkers > m_numParallelSequences)
{

View File

@@ -417,8 +417,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// set dropout rate for this epoch
// We use the same seed across workers until parallel training kicks in to ensure that the workers have identical models
size_t parallelWorkerIdx = ((m_mpi == nullptr) || !UsingParallelTrain(i)) ? 0 : m_mpi->CurrentNodeRank();
size_t dropoutRandSeedBase = (parallelWorkerIdx * m_maxEpochs) + i;
ComputationNetwork::SetDropoutRate<ElemType>(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropoutRandSeedBase);
size_t randSeedBase = (parallelWorkerIdx * m_maxEpochs) + i;
ComputationNetwork::SetDropoutRate<ElemType>(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate);
ComputationNetwork::SetIRngUserSeed<ElemType>(net, criterionNodes[0], randSeedBase);
ComputationNetwork::SetBatchNormalizationTimeConstants<ElemType>(net, criterionNodes[0],
m_batchNormalizationTimeConstant[i], prevNormalizationTimeConstant,
m_batchNormalizationBlendTimeConstant[i], prevNormalizationBlendTimeConstant);

File diff suppressed because it is too large. Load Diff

View File

@@ -807,6 +807,23 @@ BOOST_AUTO_TEST_CASE(CompositeCNTKTextFormatReader_5x5_and_5x10_jagged_minibatch
false);
};
BOOST_AUTO_TEST_CASE(CNTKTextFormatReaderNoFirstMinibatchData)
{
HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKTextFormatReader/dense.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/NonExistent.txt",
testDataPath() + "/Control/CNTKTextFormatReader/CNTKTextFormatReaderNoFirstMinibatchData_Output.txt",
"1x2",
"reader",
10, // epoch size
1, // mb size
10, // num epochs
1,
0,
1,
2);
};
BOOST_AUTO_TEST_SUITE_END()
} } } }

View File

@@ -113,7 +113,7 @@ declare -a mklFiles=("libmkl_cntk_p.so" "libiomp5.so")
declare -a opencvFiles=("libopencv_core.so.3.1" "libopencv_imgproc.so.3.1" "libopencv_imgproc.so.3.1" "libopencv_imgcodecs.so.3.1")
# libzip
declare -a libzipFiles=("libzip.so")
declare -a libzipFiles=("libzip.so.4")
# CUDA
declare -a cudaFiles=("libcudart.so.7.5" "libcublas.so.7.5" "libcurand.so.7.5" "libcusparse.so.7.5")

0
bindings/python/swig_install.sh Executable file → Normal file
View File