Add TopK error evaluation

2015-09-10 15:24:26 -07:00 · 2015-09-10 15:24:26 -07:00 · fa4383e7f5
--- a/DataReader/ImageReader/ImageReader.cpp
+++ b/DataReader/ImageReader/ImageReader.cpp
@ -37,16 +37,31 @@ public:
 class CropTransform : public ITransform
 {
 public:
-    CropTransform(unsigned int seed) : m_rng(seed), m_rndUniInt(0, INT_MAX)
+    CropTransform(unsigned int seed) : m_rng(seed)
    {
    }

    void Init(const ConfigParameters& config)
    {
        m_cropType = ParseCropType(config("cropType", ""));
-        m_cropRatio = std::stof(config("cropRatio", "1"));
-        if (!(0 < m_cropRatio && m_cropRatio <= 1.0))
-            RuntimeError("Invalid cropRatio value: %f.", m_cropRatio);
+
+        std::stringstream ss{ config("cropRatio", "1") };
+        std::string token{ "" };
+        if (std::getline(ss, token, ':'))
+        {
+            m_cropRatioMin = std::stof(token);
+            m_cropRatioMax = std::getline(ss, token, ':') ? std::stof(token) : m_cropRatioMin;
+        }
+
+        if (!(0 < m_cropRatioMin && m_cropRatioMin <= 1.0) || 
+            !(0 < m_cropRatioMax && m_cropRatioMax <= 1.0) ||
+            m_cropRatioMin > m_cropRatioMax)
+        {
+            RuntimeError("Invalid cropRatio value, must be > 0 and <= 1. cropMin must <= cropMax");
+        }
+
+        m_jitterType = ParseJitterType(config("jitterType", ""));
+
        if (!config.ExistsCurrent("hflip"))
            m_hFlip = m_cropType == CropType::Random;
        else
@ -55,21 +70,44 @@ public:

    void Apply(cv::Mat& mat)
    {
-        mat = mat(GetCropRect(m_cropType, mat.rows, mat.cols, m_cropRatio));
-        if (m_hFlip && (m_rndUniInt(m_rng) % 2) != 0)
+        // Crops mat in place using a ratio chosen according to m_jitterType and then,
+        // when m_hFlip is set, flips the crop on a fair coin toss.
+        double ratio = 1;
+        switch (m_jitterType)
+        {
+        case RatioJitterType::None:
+            // No jitter: always use the configured minimum ratio.
+            ratio = m_cropRatioMin;
+            break;
+        case RatioJitterType::UniRatio:
+            ratio = UniRealT(m_cropRatioMin, m_cropRatioMax)(m_rng);
+            // Init() accepts min == max (a single-valued cropRatio); the sampled ratio then
+            // equals the upper bound, so the upper bound has to be checked inclusively.
+            assert(m_cropRatioMin <= ratio && ratio <= m_cropRatioMax);
+            break;
+        default:
+            // UniLength and UniArea are parsed but have no implementation yet.
+            RuntimeError("Jitter type currently not implemented.");
+        }
+        mat = mat(GetCropRect(m_cropType, mat.rows, mat.cols, ratio));
+        // A default-constructed bernoulli_distribution yields true with probability 0.5.
+        if (m_hFlip && std::bernoulli_distribution()(m_rng))
            cv::flip(mat, mat, 1);
    }

 private:
+    using UniRealT = std::uniform_real_distribution<double>;
+    using UniIntT = std::uniform_int_distribution<int>;
+
    enum class CropType { Center = 0, Random = 1 };
+    enum class RatioJitterType // crop-ratio jitter modes, selected through the "jitterType" config value
+    { 
+        None = 0,      // no jitter: Apply() uses m_cropRatioMin unchanged
+        UniRatio = 1,  // Apply() draws the ratio from a uniform distribution over m_cropRatioMin..m_cropRatioMax
+        UniLength = 2, // accepted by ParseJitterType, but Apply() reports it as not implemented
+        UniArea = 3    // accepted by ParseJitterType, but Apply() reports it as not implemented
+    };
+
+    // Case-insensitive equality of two whole strings.
+    // The lengths are compared first: std::equal only walks s1's range, so without that
+    // check any prefix of s2 would compare equal and a longer s1 would read past s2's end.
+    bool AreEqual(const std::string& s1, const std::string& s2)
+    {
+        return s1.size() == s2.size() &&
+            std::equal(s1.begin(), s1.end(), s2.begin(), [](const char& a, const char& b)
+            {
+                // std::tolower needs a value representable as unsigned char, so cast before calling it.
+                return std::tolower(static_cast<unsigned char>(a)) == std::tolower(static_cast<unsigned char>(b));
+            });
+    }

    CropType ParseCropType(const std::string& src)
    {
-        auto AreEqual = [](const std::string& s1, const std::string& s2) -> bool
-        {
-            return std::equal(s1.begin(), s1.end(), s2.begin(), [](const char& a, const char& b) { return std::tolower(a) == std::tolower(b); });
-        };
-
        if (src.empty() || AreEqual(src, "center"))
            return CropType::Center;
        if (AreEqual(src, "random"))
@ -78,6 +116,20 @@ private:
        RuntimeError("Invalid crop type: %s.", src.c_str());
    }

+    // Translates the "jitterType" config string into a RatioJitterType.
+    // An empty string selects RatioJitterType::None; otherwise the known names are tried
+    // in table order using AreEqual. Anything unrecognized is reported via RuntimeError.
+    RatioJitterType ParseJitterType(const std::string& src)
+    {
+        if (src.empty())
+            return RatioJitterType::None;
+
+        struct NamedJitter
+        {
+            const char* name;
+            RatioJitterType type;
+        };
+        const NamedJitter knownTypes[] =
+        {
+            { "none", RatioJitterType::None },
+            { "uniratio", RatioJitterType::UniRatio },
+            { "unilength", RatioJitterType::UniLength },
+            { "uniarea", RatioJitterType::UniArea }
+        };
+
+        for (const auto& candidate : knownTypes)
+        {
+            if (AreEqual(src, candidate.name))
+                return candidate.type;
+        }
+
+        RuntimeError("Invalid jitter type: %s.", src.c_str());
+    }
+
    cv::Rect GetCropRect(CropType type, int crow, int ccol, double cropRatio)
    {
        assert(crow > 0);
@ -87,7 +139,6 @@ private:
        int cropSize = static_cast<int>(std::min(crow, ccol) * cropRatio);
        int xOff = -1;
        int yOff = -1;
-
        switch (type)
        {
        case CropType::Center:
@ -95,8 +146,8 @@ private:
            yOff = (crow - cropSize) / 2;
            break;
        case CropType::Random:
-            xOff = m_rndUniInt(m_rng) % std::max(ccol - cropSize, 1);
-            yOff = m_rndUniInt(m_rng) % std::max(crow - cropSize, 1);
+            xOff = UniIntT(0, ccol - cropSize)(m_rng);
+            yOff = UniIntT(0, crow - cropSize)(m_rng);
            break;
        default:
            assert(false);
@ -108,18 +159,20 @@ private:
    }

 private:
-    std::default_random_engine m_rng;
-    std::uniform_int_distribution<int> m_rndUniInt;
+    // REVIEW alexeyk: currently not thread safe. Engines are expensive to create.
+    std::mt19937 m_rng;

    CropType m_cropType;
-    double m_cropRatio;
+    double m_cropRatioMin;
+    double m_cropRatioMax;
+    RatioJitterType m_jitterType;
    bool m_hFlip;
 };

 class ScaleTransform : public ITransform
 {
 public:
-    ScaleTransform(int dataType, unsigned int seed) : m_dataType(dataType), m_rng(seed), m_rndUniInt(0, INT_MAX)
+    ScaleTransform(int dataType, unsigned int seed) : m_dataType(dataType), m_rng(seed)
    {
        assert(m_dataType == CV_32F || m_dataType == CV_64F);

@ -160,12 +213,13 @@ public:

        assert(m_interp.size() > 0);
        cv::resize(mat, mat, cv::Size(static_cast<int>(m_imgWidth), static_cast<int>(m_imgHeight)), 0, 0, 
-            m_interp[m_rndUniInt(m_rng) % m_interp.size()]);
+            m_interp[UniIntT(0, static_cast<int>(m_interp.size()) - 1)(m_rng)]);
    }

 private:
-    std::default_random_engine m_rng;
-    std::uniform_int_distribution<int> m_rndUniInt;
+    using UniIntT = std::uniform_int_distribution<int>;
+    // REVIEW alexeyk: currently not thread safe. Engines are expensive to create.
+    std::mt19937 m_rng;

    int m_dataType;

@ -229,7 +283,7 @@ private:
 // ImageReader

 template<class ElemType>
-ImageReader<ElemType>::ImageReader() : m_seed(0), m_rng(m_seed), m_rndUniInt(0, INT_MAX)
+ImageReader<ElemType>::ImageReader() : m_seed(0), m_rng(m_seed)
 {
    m_transforms.push_back(std::make_unique<CropTransform>(m_seed));
    m_transforms.push_back(std::make_unique<ScaleTransform>(sizeof(ElemType) == 4 ? CV_32F : CV_64F, m_seed));
@ -287,8 +341,6 @@ void ImageReader<ElemType>::Init(const ConfigParameters& config)
        files.push_back({ imgPath, std::stoi(clsId) });
    }

-    std::shuffle(files.begin(), files.end(), m_rng);
-
    m_epochStart = 0;
    m_mbStart = 0;
 }
@ -304,6 +356,8 @@ void ImageReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size
    assert(mbSize > 0);
    assert(requestedEpochSamples > 0);

+    std::shuffle(files.begin(), files.end(), m_rng);
+
    m_epochSize = (requestedEpochSamples == requestDataSize ? files.size() : requestedEpochSamples);
    m_mbSize = mbSize;
    // REVIEW alexeyk: if user provides epoch size explicitly then we assume epoch size is a multiple of mbsize, is this ok?
@ -339,7 +393,7 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>

    std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
    
-#pragma omp parallel for ordered schedule(dynamic)
+//#pragma omp parallel for ordered schedule(dynamic)
    for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
    {
        const auto& p = files[i + m_mbStart];
--- a/DataReader/ImageReader/ImageReader.h
+++ b/DataReader/ImageReader/ImageReader.h
@ -39,8 +39,7 @@ public:

 private:
    unsigned int m_seed;
-    std::default_random_engine m_rng;
-    std::uniform_int_distribution<int> m_rndUniInt;
+    std::mt19937 m_rng;

    std::vector<std::unique_ptr<ITransform>> m_transforms;

--- a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h
@ -1,141 +1,150 @@
-//
-// <copyright file="EvaluationCriterionNodes.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-#include <stdexcept>
-#include <list>
-#include <memory>
-#include "ComputationNode.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-    //note: to save computation the gradient may be scaled by an constant. 
-
-    // -----------------------------------------------------------------------
-    // ErrorPredictionNode (label, prediction)    --TODO: is that correct?
-    // -----------------------------------------------------------------------
-
-    template<class ElemType>
-    class ErrorPredictionNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>, public NumInputs<2>
-    {
-        typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
-        static const std::wstring TypeName() { return L"ErrorPrediction"; }
-    public:
-        ErrorPredictionNode(DEVICEID_TYPE deviceId, const wstring & name) :
-            Base(deviceId, name),
+//
+// <copyright file="EvaluationCriterionNodes.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <list>
+#include <memory>
+#include "ComputationNode.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+    //note: to save computation the gradient may be scaled by a constant. 
+
+    // -----------------------------------------------------------------------
+    // ErrorPredictionNode (label, prediction)    --TODO: is that correct?
+    // -----------------------------------------------------------------------
+
+    template<class ElemType>
+    class ErrorPredictionNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>
+    {
+        typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+        static const std::wstring TypeName() { return L"ErrorPrediction"; }
+    public:
+        ErrorPredictionNode(DEVICEID_TYPE deviceId, const wstring & name) :
+            Base(deviceId, name),
            m_maxIndexes0(deviceId), m_maxIndexes1(deviceId), m_maxValues(deviceId)
-        { }
-
-        void Reset()        // TODO: what is this??
-        {
-        }
-
-        virtual void ComputeInputPartial(const size_t /*inputIndex*/)  //scaled by 2*number of elements in the Matrix<ElemType>
-        {
-            LogicError("ErrorPrediction is used for evaluation only.");
-        }
-
-        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
-        {
-            EvaluateThisNodeS(m_functionValues, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_maxIndexes0, m_maxIndexes1, m_maxValues, shared_from_this());
-        }
-
-        void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues0, const Matrix<ElemType>& inputFunctionValues1, Matrix<ElemType>& maxIndexes0, Matrix<ElemType>& maxIndexes1, Matrix<ElemType>& maxValues, ComputationNodePtr curNode)
-        {
-            inputFunctionValues0.VectorMax(maxIndexes0, maxValues, true);
-            inputFunctionValues1.VectorMax(maxIndexes1, maxValues, true);
-            curNode->MaskMissingColumnsToZero(maxIndexes0, Inputs(0)->GetMBLayout());   // we are fine since it will only be called with full minibatch
-            curNode->MaskMissingColumnsToZero(maxIndexes1, Inputs(1)->GetMBLayout());
-            functionValues.AssignNumOfDiff(maxIndexes0, maxIndexes1);
-        #if NANCHECK
-            functionValues.HasNan("ErrorPrediction");
-        #endif
-#if DUMPOUTPUT
-            functionValues.Print("ErrorPredictionNode");
-#endif
-        }
-
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-
-            size_t index = 0;
-            {
-                size_t rows = Inputs(index)->GetNumRows() == 0? Inputs(1-index)->GetNumRows() : Inputs(index)->GetNumRows();
-                size_t cols = Inputs(index)->GetNumCols() == 0? Inputs(1-index)->GetNumCols() : Inputs(index)->GetNumCols();
-                ValidateInferChildDims(index, rows, cols);
-            }
-
-            index = 1;
-            {
-                size_t rows = Inputs(index)->GetNumRows() == 0? Inputs(1-index)->GetNumRows() : Inputs(index)->GetNumRows();
-                size_t cols = Inputs(index)->GetNumCols() == 0? Inputs(1-index)->GetNumCols() : Inputs(index)->GetNumCols();
-                ValidateInferChildDims(index, rows, cols);
-                m_maxIndexes0.Resize(1,cols);
-                m_maxIndexes1.Resize(1,cols);
-                m_maxValues.Resize(1,cols);
-            }
-
-            //if (Inputs(0)->GetNumRows() == 0 || Inputs(1)->GetNumRows() == 0)
-            //    LogicError("ErrorPrediction operation: one of the operands has 0 elements.");
-
-            if (isFinalValidationPass)
-                if (!(Inputs(0)->GetNumRows() == Inputs(1)->GetNumRows() && Inputs(0)->GetNumCols() == Inputs(1)->GetNumCols()))
-                {
-                    LogicError("The Matrix dimension in the ErrorPrediction operation does not match.");
-                }       
-
-            Resize(1,1);
-            m_pMBLayout = nullptr;    // this node does not hold mini-batch data
-            InferImageDimsFromInputs(); 
-
-            // resize the temporaries to their proper size
-            size_t cols = Inputs(0)->GetNumCols();
-            m_maxIndexes0.Resize(1,cols);
-            m_maxIndexes1.Resize(1,cols);
-            m_maxValues.Resize(1,cols);
-        }
-
-        virtual void InferImageDimsFromInputs()
-        {
-            InferImageDimsFromInput(0, false);
-
-            m_outputImageLayout = ImageLayout();
-        }
-
-        virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId)
-        {
-            Base::MoveMatricesToDevice(deviceId);
-            m_maxIndexes0.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true);
-            m_maxIndexes1.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true);
-            m_maxValues.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true);
-        }
-
-        virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
-        {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = dynamic_pointer_cast<ErrorPredictionNode<ElemType>>(nodeP);
-                node->m_maxIndexes0 = m_maxIndexes0;
-                node->m_maxIndexes1 = m_maxIndexes1;
-                node->m_maxValues = m_maxValues;
-            }
+        { }
+
+        void Reset()        // TODO: what is this??
+        {   // intentionally empty: no member is modified here
        }
-protected:
-        virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
-
-    private:
-        Matrix<ElemType> m_maxIndexes0, m_maxIndexes1;
-        Matrix<ElemType> m_maxValues;
-    };
-
-    template class ErrorPredictionNode<float>; 
-    template class ErrorPredictionNode<double>;
-
-}}}
+
+        virtual void ComputeInputPartial(const size_t /*inputIndex*/)  // computes no gradient: every call ends in the LogicError below
+        {
+            LogicError("ErrorPrediction is used for evaluation only.");
+        }
+
+        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
+        {
+            EvaluateThisNodeS(m_functionValues, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_maxIndexes0, m_maxIndexes1, m_maxValues, m_topK, shared_from_this()); // forwards the output matrix, both inputs' values, the three scratch matrices, m_topK and this node
+        }
+
+        // Counts prediction errors over all columns and stores the count in functionValues.
+        //   inputFunctionValues0 - values of input 0; its per-column arg-max index is taken
+        //                          (the class header comment names input 0 "label", marked there as unconfirmed).
+        //   inputFunctionValues1 - values of input 1; the indexes of its topK largest entries per column are taken.
+        //   maxIndexes0/1        - scratch outputs of the two VectorMax calls.
+        //   maxValues            - scratch; written by the first VectorMax call and overwritten by the second.
+        //   topK                 - number of entries per column taken from input 1. With topK > 1,
+        //                          AssignNumOfDiff runs in its search-in-column mode instead of element-wise comparison.
+        //   curNode              - node whose MaskMissingColumnsToZero is applied to both index matrices.
+        // topK is an explicit parameter because the body reads it and the caller passes m_topK in this position.
+        void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues0, const Matrix<ElemType>& inputFunctionValues1, Matrix<ElemType>& maxIndexes0, Matrix<ElemType>& maxIndexes1, Matrix<ElemType>& maxValues, int topK, ComputationNodePtr curNode)
+        {
+            inputFunctionValues0.VectorMax(maxIndexes0, maxValues, true);
+            inputFunctionValues1.VectorMax(maxIndexes1, maxValues, true, topK);
+            curNode->MaskMissingColumnsToZero(maxIndexes0, Inputs(0)->GetMBLayout());   // we are fine since it will only be called with full minibatch
+            curNode->MaskMissingColumnsToZero(maxIndexes1, Inputs(1)->GetMBLayout());
+            functionValues.AssignNumOfDiff(maxIndexes0, maxIndexes1, topK > 1);
+        #if NANCHECK
+            functionValues.HasNan("ErrorPrediction");
+        #endif
+#if DUMPOUTPUT
+            functionValues.Print("ErrorPredictionNode");
+#endif
+        }
+
+        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
+        {   // infers input dimensions, reads the optional top-K input, then sizes the 1x1 output and the scratch matrices
+            Base::Validate(isFinalValidationPass);
+
+            size_t index = 0; // input 0: a dimension reported as 0 is taken from input 1 instead
+            {
+                size_t rows = Inputs(index)->GetNumRows() == 0? Inputs(1-index)->GetNumRows() : Inputs(index)->GetNumRows();
+                size_t cols = Inputs(index)->GetNumCols() == 0? Inputs(1-index)->GetNumCols() : Inputs(index)->GetNumCols();
+                ValidateInferChildDims(index, rows, cols);
+            }
+
+            index = 1; // input 1: a dimension reported as 0 is taken from input 0 instead
+            {
+                size_t rows = Inputs(index)->GetNumRows() == 0? Inputs(1-index)->GetNumRows() : Inputs(index)->GetNumRows();
+                size_t cols = Inputs(index)->GetNumCols() == 0? Inputs(1-index)->GetNumCols() : Inputs(index)->GetNumCols();
+                ValidateInferChildDims(index, rows, cols);
+                m_maxIndexes0.Resize(1,cols); // provisional one-row size; resized again to m_topK rows at the end of this function
+                m_maxIndexes1.Resize(1,cols);
+                m_maxValues.Resize(1,cols);
+            }
+
+            m_topK = 1; // default used when the node has no third input
+            if (m_children.size() == 3) // an optional third input supplies K as a 1x1 matrix
+            {
+                if (Inputs(2)->FunctionValues().GetNumRows() != 1 || Inputs(2)->FunctionValues().GetNumCols() != 1)
+                    throw std::logic_error("TopK in ErrorPredictionNode must be a scalar value.");
+                m_topK = static_cast<int>(Inputs(2)->FunctionValues().Get00Element()); // NOTE(review): the value is not range-checked here (e.g. K >= 1) - confirm whether callers guarantee it
+            }
+
+            //if (Inputs(0)->GetNumRows() == 0 || Inputs(1)->GetNumRows() == 0)
+            //    LogicError("ErrorPrediction operation: one of the operands has 0 elements.");
+
+            if (isFinalValidationPass) // equal dimensions of inputs 0 and 1 are enforced only on the final pass
+                if (!(Inputs(0)->GetNumRows() == Inputs(1)->GetNumRows() && Inputs(0)->GetNumCols() == Inputs(1)->GetNumCols()))
+                {
+                    LogicError("The Matrix dimension in the ErrorPrediction operation does not match.");
+                }       
+
+            Resize(1,1); // the node's own value is a single element
+            m_pMBLayout = nullptr;    // this node does not hold mini-batch data
+            InferImageDimsFromInputs(); 
+
+            // resize the temporaries to their proper size
+            size_t cols = Inputs(0)->GetNumCols();
+            m_maxIndexes0.Resize(m_topK,cols);
+            m_maxIndexes1.Resize(m_topK,cols);
+            m_maxValues.Resize(m_topK,cols);
+        }
+
+        virtual void InferImageDimsFromInputs()
+        {
+            InferImageDimsFromInput(0, false); // delegates to the helper for input 0; the meaning of the 'false' flag is not visible here
+
+            m_outputImageLayout = ImageLayout(); // the output layout is then replaced by a default-constructed ImageLayout
+        }
+
+        virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId)
+        {
+            Base::MoveMatricesToDevice(deviceId); // base class handles its own matrices first
+            m_maxIndexes0.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true); // then each of this node's three scratch matrices
+            m_maxIndexes1.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true);
+            m_maxValues.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true);
+        }
+
+        // Copies this node into nodeP through the base class; when copyNodeValue is requested,
+        // the three scratch matrices are duplicated into the target as well.
+        virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
+        {
+            Base::CopyTo(nodeP, newName, flags);
+
+            // Nothing more to do unless node values were asked for.
+            if (!(flags & CopyNodeFlags::copyNodeValue))
+                return;
+
+            auto target = dynamic_pointer_cast<ErrorPredictionNode<ElemType>>(nodeP);
+            target->m_maxIndexes0 = m_maxIndexes0;
+            target->m_maxIndexes1 = m_maxIndexes1;
+            target->m_maxValues = m_maxValues;
+        }
+protected:
+        virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } // always true; EvaluateThisNodeS calls MaskMissingColumnsToZero on both index matrices itself
+
+    private:
+        Matrix<ElemType> m_maxIndexes0, m_maxIndexes1;
+        Matrix<ElemType> m_maxValues;
+        int m_topK;
+    };
+
+    template class ErrorPredictionNode<float>; 
+    template class ErrorPredictionNode<double>;
+
+}}}
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@ -20,6 +20,7 @@
 #include <exception>
 #include <thread>
 #include<iostream>
+#include <algorithm>
 #ifdef     _WIN32
 #include <Windows.h>
 #else
@ -3301,7 +3302,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    }
    //I decided to use CPUMatrix<ElemType>& maxIndexes instead of integer vector because the result may be used to do additional calculation
    template<class ElemType>
-    void CPUMatrix<ElemType>::VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise) const
+    void CPUMatrix<ElemType>::VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const
    {
        if (IsEmpty())
            LogicError("VectorMax: Matrix is empty.");
@ -3309,33 +3310,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        auto& us=*this;
        const int m = (int)GetNumRows();
        const int n = (int)GetNumCols();
+        assert(topK <= m);

        assert (m>0 && n>0); //converting from size_t to int may cause overflow

        if (isColWise)  //col-wise
        {
-            maxValues.Resize(1, n);
-            maxIndexes.Resize(1, n);
+            maxValues.Resize(topK, n);
+            maxIndexes.Resize(topK, n);

-#pragma omp parallel for
-            for (int j=0; j<n; j++)
+            if (topK == 1)
            {
-                ElemType v = us(0, j);
-                size_t index = 0;
-                foreach_row(i,us)
+#pragma omp parallel for
+                for (int j = 0; j < n; j++)
                {
-                    if (v < us(i,j))
+                    ElemType v = us(0, j);
+                    size_t index = 0;
+                    foreach_row(i, us)
                    {
-                        index = i;
-                        v = us(i,j);
+                        if (v < us(i, j))
+                        {
+                            index = i;
+                            v = us(i, j);
+                        }
+                    }
+                    maxValues(0, j) = v;
+                    maxIndexes(0, j) = (ElemType)index;
+                }
+            }
+            else
+            {
+                std::vector<int> indices(m);
+                int i = 0;
+                std::generate(indices.begin(), indices.end(), [&i] { return i++; });
+
+                const ElemType* curVal = m_pArray;
+                ElemType* curIdx = maxIndexes.m_pArray;
+                ElemType* curMax = maxValues.m_pArray;
+                for (int icol = 0; icol < n; icol++, curVal += m, curIdx += topK, curMax += topK)
+                {
+                    // Partial sort, descending order.
+                    std::nth_element(indices.begin(), indices.begin() + topK, indices.end(),
+                        [curVal](const int& a, const int& b) { return curVal[a] > curVal[b]; });
+                    // REVIEW alexeyk: the following produces warning (see SCL_SECURE_NO_WARNINGS) so use loop instead.
+                    //std::transform(indices.begin(), indices.begin() + topK, curIdx, [](const int& a) { return static_cast<ElemType>(a); });
+                    for (int i = 0; i < topK; i++)
+                    {
+                        curIdx[i] = static_cast<ElemType>(indices[i]);
+                        curMax[i] = curVal[indices[i]];
                    }
                }
-                maxValues(0,j) = v;                
-                maxIndexes(0,j) = (ElemType)index;                
            }
        }
        else
        {
+            if (topK > 1)
+                RuntimeError("Row-wise TopK max is not supported.");
+
            maxValues.Resize(m,1);
            maxIndexes.Resize(m, 1);

@ -3420,19 +3451,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    }

    template<class ElemType>
-    CPUMatrix<ElemType>&  CPUMatrix<ElemType>::AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
+    CPUMatrix<ElemType>&  CPUMatrix<ElemType>::AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, bool searchInCol)
    {   // writes a mismatch count between a and b into *this (resized to 1x1) and returns *this
-        if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
-            InvalidArgument("AssignNumOfDiff: a and b must have same dimension.");
-        
+        if (a.GetNumCols() != b.GetNumCols()) // column counts must agree in both modes
+            throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of columns.");
+        if (!searchInCol && a.GetNumRows() != b.GetNumRows()) // row counts must agree only in element-wise mode
+            throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of rows.");
+
        ElemType n = 0;
-        foreach_coord(i,j,a)
+        if (!searchInCol) // element-wise mode: count the coordinates at which a and b differ
        {
-            n += (a(i,j) != b(i,j));
+            foreach_coord(i, j, a)
+            {
+                n += (a(i, j) != b(i, j));
+            }
+        }
+        else // search mode: count the columns in which a(0,col) occurs nowhere in b's column
+        {
+            size_t crow = b.GetNumRows();
+            const ElemType* curCol = b.m_pArray; // advanced by crow elements per column in the loop header
+            for (size_t icol = 0; icol < a.GetNumCols(); icol++, curCol += crow)
+            {
+                auto res = std::find(curCol, curCol + crow, a(0, icol)); // only row 0 of a is consulted
+                if (res == curCol + crow)
+                    n++;
+            }
        }

-        Resize(1,1); //result should be one element
-        (*this)(0,0) = n;
+        Resize(1, 1); //result should be one element
+        (*this)(0, 0) = n;

        return *this;
    }
--- a/Math/Math/CPUMatrix.h
+++ b/Math/Math/CPUMatrix.h
@ -252,10 +252,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        CPUMatrix<ElemType>&  AssignPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
        CPUMatrix<ElemType>&  AddFoldedPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
        
-        void VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise) const;
+        void VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise, int topK = 1) const;
        void VectorMin(CPUMatrix<ElemType>& mainndexes, CPUMatrix<ElemType>& minValues, const bool isColWise) const;

-        CPUMatrix<ElemType>&  AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b); 
+        CPUMatrix<ElemType>&  AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, bool searchInCol = false);

        void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
        void Print(const char* matrixName = nullptr) const; //print whole matrix. can be expensive
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -2457,7 +2457,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    }

    template<class ElemType>
-    Matrix<ElemType>& Matrix<ElemType>::AssignNumOfDiff (const Matrix<ElemType>& a, const Matrix<ElemType>& b)
+    Matrix<ElemType>& Matrix<ElemType>::AssignNumOfDiff (const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol)
    {
        DecideAndMoveToRightDevice(a, b, *this);        
        //WARNING: a and b must have same type
@ -2468,7 +2468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        DISPATCH_MATRIX_ON_FLAG(this,
            this,
-            this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix), 
+            this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol), 
            this->m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix), 
            NOT_IMPLEMENTED, 
            NOT_IMPLEMENTED
@ -3380,7 +3380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    //I decided to use Matrix<ElemType>& maxIndexes instead of integer vector because the result may be used to do additional calculation
    template<class ElemType>
-    void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const
+    void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
    {
        if (IsEmpty())
            LogicError("VectorMax: Matrix is empty.");
@ -3391,7 +3391,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        DISPATCH_MATRIX_ON_FLAG(this,
            &maxValues,
-            this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix,*maxValues.m_CPUMatrix,isColWise); maxIndexes.SetDataLocation(CPU, DENSE), 
+            this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix,*maxValues.m_CPUMatrix,isColWise,topK); maxIndexes.SetDataLocation(CPU, DENSE), 
            this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix,*maxValues.m_GPUMatrix,isColWise); maxIndexes.SetDataLocation(GPU, DENSE), 
            NOT_IMPLEMENTED, 
            NOT_IMPLEMENTED
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@ -367,10 +367,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        ElemType MatrixNorm0() const; //number of non-zero elemets
        Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
        Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
-        void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
+        void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK = 1) const;
        void VectorMin(Matrix<ElemType>& mainndexes, Matrix<ElemType>& minValues, const bool isColWise) const;

-        Matrix<ElemType>&  AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b); 
+        Matrix<ElemType>&  AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false); 

        Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first