Merge remote-tracking branch 'origin/master' into linux-gcc

Conflicts: Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx MachineLearning/CNTK/ComputationNetwork.h MachineLearning/CNTK/LinearAlgebraNodes.h MachineLearning/CNTK/NetworkDescriptionLanguage.cpp Makefile_kaldi2.cpu Makefile_kaldi2.gpu
2015-07-05 22:28:21 -07:00 · 2015-07-05 22:28:21 -07:00 · f7f902107f
--- a/Documentation/CNTK-TechReport/lyx/#CNTKBook_CNTK_Programmer_Chapter.lyx#
+++ b/Documentation/CNTK-TechReport/lyx/#CNTKBook_CNTK_Programmer_Chapter.lyx#
--- a/Documentation/CNTK-TechReport/lyx/CNTKBook-20150702.pdf
+++ b/Documentation/CNTK-TechReport/lyx/CNTKBook-20150702.pdf
--- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx
+++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx
@ -2290,6 +2290,50 @@ m1, m2 - input matrices.
 The m1.cols must equal m2.rows.
 \end_layout

+\begin_layout Subsubsection
+TransposeTimes
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+TransposeTimes
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Calculate the product 
+\begin_inset Formula $m1^{T}m2$
+\end_inset
+
+.
+ The resulting matrix has a size of m1.cols by m2.cols.
+ The syntax is
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+TransposeTimes(m1, m2)
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Itemize
+m1, m2 - input matrices.
+ The m1.rows must equal m2.rows.
+\end_layout
+
 \begin_layout Subsubsection
 DiagTimes
 \begin_inset Index idx
--- a/MachineLearning/CNTK/ComputationNetwork.h
+++ b/MachineLearning/CNTK/ComputationNetwork.h
@ -235,7 +235,7 @@ public:
        std::vector<ComputationNodePtr> DelayNodes;
        for (auto n : allnodes)
        {
-            if (n->OperationName() == L"Delay")
+            if (n->OperationName() == DelayNode<ElemType>::TypeName())
            {
                DelayNodes.push_back(n);
            }
@ -245,7 +245,7 @@ public:
        std::vector<ComputationNodePtr> learnableParameters;
        for (auto n : allnodes)
        {
-            if (n->OperationName() == L"LearnableParameter")
+            if (n->OperationName() == LearnableParameter<ElemType>::TypeName())
            {
                learnableParameters.push_back(n);
            }
@ -348,7 +348,7 @@ public:
            std::wstring srcname = src->GetName();
            std::wstring desname = des->GetName();

-            if (des->OperationName() == L"Delay")
+            if (des->OperationName() == DelayNode<ElemType>::TypeName())
            {
                // special treament for arc with Delay node as the children
                // create a dummy node
@ -1181,6 +1181,10 @@ public:
        {
            newNode = new TimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
        }
+        else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
+        {
+            newNode = new TransposeTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
+        }
        else if (nodeType == ElementTimesNode<ElemType>::TypeName())
        {
            newNode = new ElementTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
@ -1481,6 +1485,10 @@ public:
        {
            newNode = new TimesNode<ElemType>(m_deviceId, nodeName);
        }
+        else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
+        {
+            newNode = new TransposeTimesNode<ElemType>(m_deviceId, nodeName);
+        }
        else if (nodeType == ElementTimesNode<ElemType>::TypeName())
        {
            newNode = new ElementTimesNode<ElemType>(m_deviceId, nodeName);
@ -1953,6 +1961,16 @@ public:
        return newNode;
    }

+    // Builds a TransposeTimesNode (constructed with m_deviceId and nodeName), attaches a and b
+    // as its first and second inputs, adds it to this network via AddNodeToNet and returns it.
+    ComputationNodePtr TransposeTimes(const ComputationNodePtr a,
+                                      const ComputationNodePtr b,
+                                      const std::wstring nodeName = L"")
+    {
+        ComputationNodePtr transposeTimesNode(new TransposeTimesNode<ElemType>(m_deviceId, nodeName));
+
+        transposeTimesNode->AttachInputs(a, b);
+        AddNodeToNet(transposeTimesNode);
+
+        return transposeTimesNode;
+    }
+
+
    ComputationNodePtr ElementTimes(const ComputationNodePtr a,
                                    const ComputationNodePtr b,
                                    const std::wstring nodeName = L"")
@ -2217,7 +2235,7 @@ public:
    {
        for (auto ptr = recurrentNodes.begin(); ptr != recurrentNodes.end(); ptr++)
        {
-            if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != L"Delay") {
+            if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != DelayNode<ElemType>::TypeName()) {
                return true;
            }
        }
@ -3359,7 +3377,7 @@ protected:
            visited.insert(cur);
            recStack.insert(cur);

-            if (cur->OperationName() != L"Delay")
+            if (cur->OperationName() != DelayNode<ElemType>::TypeName())
            {
                for (size_t i = 0; i < cur->ChildrenSize(); i++)
                {
@ -3442,7 +3460,7 @@ protected:
                    ComputationNodePtr nodeRecIter = (*iter).m_recurrentNodes[j];
                    for (size_t i = 0; i < nodeRecIter->ChildrenSize(); i++)
                    {
-                        if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != L"Delay"))
+                        if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != DelayNode<ElemType>::TypeName()))
                        {
                            nodeRecIter->Inputs(i)->SetIndexInLoop(nodeRecIter->Inputs(i)->GetIndexInLoop() + 1);
                        }
--- a/MachineLearning/CNTK/ComputationNode.h
+++ b/MachineLearning/CNTK/ComputationNode.h
@ -51,7 +51,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    };

 #pragma region base computation class
-
    template<class ElemType>
    class ComputationNode //Abstract Class that cannot be instantiated
    {
--- a/MachineLearning/CNTK/LinearAlgebraNodes.h
+++ b/MachineLearning/CNTK/LinearAlgebraNodes.h
@ -887,6 +887,193 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template class TimesNode<float>; 
    template class TimesNode<double>;

+    // TransposeTimesNode: computation node for the product Inputs(0)^T * Inputs(1).
+    // With Inputs(0) of size rows0 x cols0 and Inputs(1) of size rows1 x cols1, the two inputs
+    // must agree on their row count (the contracted dimension) and the result is cols0 x cols1.
+    // The structure mirrors TimesNode; only the transpose flag on the left operand, the gradient
+    // formulas and the dimension checks differ.
+    template<class ElemType>
+    class TransposeTimesNode : public ComputationNode<ElemType>
+    {
+        UsingComputationNodeMembers;
+    public:
+        // Creates an empty node on the given device; an empty name is replaced by a generated unique name.
+        TransposeTimesNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
+        {
+            m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
+            m_deviceId = deviceId;
+            MoveMatricesToDevice(deviceId);
+            InitRecurrentNode();
+        }
+
+        // Creates a node and restores its state from a model file via LoadFromFile.
+        TransposeTimesNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
+        {
+            m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
+            LoadFromFile(fstream, modelVersion, deviceId);
+        }
+
+        // copy constructor
+        TransposeTimesNode(const TransposeTimesNode<ElemType>* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode<ElemType>(node->m_deviceId)
+        {
+            node->CopyTo(this, newName, flags);
+        }
+
+        // Returns a newly allocated copy of this node; an empty newName keeps the current node name.
+        virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const
+        {
+            const std::wstring& name = (newName == L"") ? NodeName() : newName;
+
+            ComputationNodePtr node = new TransposeTimesNode<ElemType>(this, name, flags);
+            return node;
+        }
+
+        virtual const std::wstring OperationName() const { return TypeName(); }
+        static const std::wstring TypeName() { return L"TransposeTimes"; }
+
+        // Back-propagates this node's gradient into the input selected by inputIndex
+        // (0 = left/transposed operand, 1 = right operand) over the whole minibatch.
+        virtual void ComputeInputPartial(const size_t inputIndex)
+        {
+            if (inputIndex > 1)
+                throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");
+
+            if (inputIndex == 0)  //left derivative
+            {
+                ComputeInputPartialLeft(Inputs(1)->FunctionValues(), Inputs(0)->GradientValues(), GradientValues());
+            }
+            else  //right derivative
+            {
+                ComputeInputPartialRight(Inputs(0)->FunctionValues(), Inputs(1)->GradientValues(), GradientValues());
+            }
+        }
+
+        // Same as above for one time step: the output gradient and everything belonging to
+        // Inputs(1) are restricted to the column range of step timeIdxInSeq, while Inputs(0)
+        // is always used as a whole.
+        virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq)
+        {
+            if (inputIndex > 1)
+                throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");
+
+            if (inputIndex == 0)  //left derivative
+            {
+                Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+
+                ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
+            }
+            else  //right derivative
+            {
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+                Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+
+                ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
+            }
+        }
+
+        // Gradient w.r.t. the left operand. inputFunctionValues is the value of the right operand.
+        // For C = A^T * B the gradient of A is B * dC^T, which is what the call below requests
+        // (NOTE(review): assumes MultiplyAndAdd accumulates op(a) * op(b) into its last argument - confirm).
+        static void WINAPI ComputeInputPartialLeft(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
+        {
+#if DUMPOUTPUT
+            gradientValues.Print("Gradient-in");
+            inputGradientValues.Print("child Gradient-in/out");
+            inputFunctionValues.Print("child Function values");
+#endif
+            //currently we only support one combination when the input is sparse.
+            if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
+                inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
+
+            Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues);
+
+
+#if DUMPOUTPUT
+            inputGradientValues.Print("child Gradient-out");
+#endif
+        }
+
+        // Gradient w.r.t. the right operand. inputFunctionValues is the value of the left operand.
+        // For C = A^T * B the gradient of B is A * dC.
+        static void WINAPI ComputeInputPartialRight(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
+        {
+#if DUMPOUTPUT
+            gradientValues.Print("Gradient-in");
+            inputGradientValues.Print("child Gradient-in/out");
+            inputFunctionValues.Print("child Function values");
+#endif
+            Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, false, inputGradientValues);
+
+#if DUMPOUTPUT
+            inputGradientValues.Print("child Gradient-out");
+#endif
+        }
+
+
+        // Forward pass over the whole minibatch.
+        virtual void EvaluateThisNode()
+        {
+            EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues());
+        }
+
+        // Forward pass for one time step: only the columns of Inputs(1) and of the output that
+        // belong to step timeIdxInSeq are used; Inputs(0) is used as a whole.
+        virtual void EvaluateThisNode(const size_t timeIdxInSeq)
+        {
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+            Matrix<ElemType> sliceOutputValue = m_functionValues.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+
+            EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
+        }
+
+        // Writes the product of input0 (transpose flag set) and input1 (no transpose) into functionValues.
+        static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
+        {
+#if DUMPOUTPUT
+            input0.Print("TransposeTimesNode - Input0");
+#endif
+            functionValues.AssignProductOf(input0, true, input1, false);
+#if NANCHECK
+            functionValues.HasNan("TransposeTimes");
+#endif
+#if DUMPOUTPUT
+            functionValues.Print("TransposeTimes");
+#endif
+        }
+
+        // Checks the number and dimensions of the inputs and sizes the output to cols0 x cols1.
+        // Because the left operand is transposed, the contracted dimension is the ROW count of
+        // both inputs: a missing row count of a LearnableParameter input is taken from the other
+        // input, while the column counts (the output dimensions) cannot be inferred.
+        // Throws std::logic_error on a wrong input count or on dimensions that are missing or
+        // do not match (dimension checks are skipped for nodes inside a loop, LoopId() >= 0).
+        virtual void Validate()
+        {
+            PrintSelfBeforeValidation();
+
+            if (m_children.size() != 2)
+                throw std::logic_error("TransposeTimes operation requires two inputs.");
+
+            //support automatic dimension inference for learnable parameters
+            size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols();
+            size_t rows1 = Inputs(1)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
+
+            //the output is cols0 x cols1, so neither column count can be derived from the other input
+            if ((cols0 == 0 || cols1 == 0) && this->LoopId() < 0)
+                throw std::logic_error("TransposeTimes operation: Inputs(0)->FunctionValues().GetNumCols() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred");
+
+            //rows are the shared dimension: fill in a missing row count from the other input
+            if ((Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && rows0 == 0 && rows1 != 0) && this->LoopId() < 0)
+                Inputs(0)->FunctionValues().Resize(rows1, cols0);
+
+            if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && rows0 != 0 && rows1 == 0)
+                Inputs(1)->FunctionValues().Resize(rows0, cols1);
+
+            if ((Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) && this->LoopId() < 0)
+                throw std::logic_error("TransposeTimes operation: One of the operants has 0 elements.");
+
+            //rows0 and rows1 may have been changed so don't use them in the following check
+            if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumRows()) && this->LoopId() < 0)
+            {
+                throw std::logic_error("The Matrix dimension in the TransposeTimes operation does not match.");
+            }
+            FunctionValues().Resize(cols0, cols1);
+            CopyImageSizeFromInputs();
+        }
+
+        // Derives the image layout of the output: a single column of height cols0, i.e. the
+        // number of rows of Inputs(0)^T * Inputs(1).
+        virtual void CopyImageSizeFromInputs()
+        {
+            CopyImageSizeFromInput(1, false); //the second one is the input since it's column wise
+
+            //after multiplication the structure is lost
+            m_outputWidth = 1;
+            m_outputHeight = Inputs(0)->FunctionValues().GetNumCols(); //rows of the transposed left operand
+            m_outputChannels = 1;
+        }
+
+
+        // Sets the left (transposed) and right operands of the product.
+        virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode)
+        {
+            m_children.resize(2);
+            m_children[0] = leftNode;
+            m_children[1] = rightNode;
+        }
+    };
+
+    template class TransposeTimesNode<float>;
+    template class TransposeTimesNode<double>;
+
    template<class ElemType>
    class ElementTimesNode : public ComputationNode<ElemType>
    {
--- a/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp
+++ b/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp
@ -158,6 +158,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
        ret = true;
    else if (EqualInsensitive(nodeType, TimesNode<ElemType>::TypeName()))
        ret = true;
+    else if (EqualInsensitive(nodeType, TransposeTimesNode<ElemType>::TypeName()))
+        ret = true;
    else if (EqualInsensitive(nodeType, ElementTimesNode<ElemType>::TypeName()))
        ret = true;
    else if (EqualInsensitive(nodeType, DiagTimesNode<ElemType>::TypeName()))
--- a/MachineLearning/CNTK/TrainingCriterionNodes.h
+++ b/MachineLearning/CNTK/TrainingCriterionNodes.h
@ -982,13 +982,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            {
                // evaluation uses softmax
                m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
-                /*
-#pragma omp parallel for
-                for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
-                for (int j = 0; j < Inputs(3)->FunctionValues().GetNumRows(); j++)
-                    m_logSoftmax(i, j) += Inputs(3)->FunctionValues()(j, 0);
-                */
-                m_logSoftmax += Inputs(3)->FunctionValues().Transpose();
+                m_logSoftmax += Inputs(3)->FunctionValues();
                m_logSoftmax.InplaceLogSoftmax(false);
                FunctionValues().Resize(1, 1);
                FunctionValues().SetValue(0);
--- a/5
+++ b/5
@ -37,6 +37,7 @@ CUDA_PATH = /usr/local/cuda-7.0

 # This is a suggested/default location for NVML
 NVML_INCLUDE = /usr/include/nvidia/gdk
+NVML_LIB = /usr/src/gdk/nvml/lib
 #######

 BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
@ -50,7 +51,7 @@ ifeq ($(BUILDTYPE),debug)
 	GPU_BUILDTYPE_OPT = -O0 -G -lineinfo
 else
 	BUILDTYPE_OPT = -O3 -flto
-	GPU_BUILDTYPE_OPT = -O3 -use_fast_math
+	GPU_BUILDTYPE_OPT = -O3 -use_fast_math -lineinfo
 endif

 # Set up math library defines and libraries
@ -66,7 +67,7 @@ endif

 # Set up CUDA includes and libraries
 CUDA_INCLUDE = $(CUDA_PATH)/include
-CUDA_LIB = -L$(CUDA_PATH)/lib64 -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
+CUDA_LIB = -L$(CUDA_PATH)/lib64 -L$(NVML_LIB) -lcublas -lcudart -lcurand -lcusparse -lnvidia-ml

 # Set up final list of libs to use
 ifeq ($(DEVICE),gpu)
--- a/Makefile_kaldi.cpu
+++ b/Makefile_kaldi.cpu
@ -78,7 +78,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
 MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
 CN_SRC =  MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
          MachineLearning/CNTK/ModelEditLanguage.cpp \
-          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+          MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
+		  MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
--- a/Makefile_kaldi.gpu
+++ b/Makefile_kaldi.gpu
@ -88,7 +88,8 @@ MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKe
 		   Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
 CN_SRC =  MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
          MachineLearning/CNTK/ModelEditLanguage.cpp \
-          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp  MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+          MachineLearning/CNTK/SimpleNetworkBuilder.cpp  \
+		  MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
--- a/Makefile_kaldi2.cpu
+++ b/Makefile_kaldi2.cpu
@ -88,7 +88,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
 MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
 CN_SRC =  MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
          MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
-          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+          MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
+          MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
--- a/Makefile_kaldi2.gpu
+++ b/Makefile_kaldi2.gpu
@ -48,8 +48,8 @@ DEVICE = gpu
 #BUILDTYPE = debug
 BUILDTYPE = release
 # comment following and uncomment the next one to enable MKL library
-#MATHLIB = acml
-MATHLIB = mkl
+MATHLIB = acml
+#MATHLIB = mkl
 # modify relevant path below for your system
 MKL_PATH = /usr/users/yzhang87/tools/composer_xe_2015.2.164
 ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64
@ -99,8 +99,9 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
 MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKernels.cu Math/Math/GPUSparseMatrix.cu Math/Math/GPUWatcher.cu \
 		   Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
 CN_SRC =  MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
-          MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
-          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+          MachineLearning/CNTK/ModelEditLanguage.cpp \
+          MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
+          MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
--- a/Math/CNTKMathTest/CPUSparseMatrixUnitTests.cpp
+++ b/Math/CNTKMathTest/CPUSparseMatrixUnitTests.cpp
@ -38,7 +38,7 @@ namespace CNTKMathTest
            }

            DenseMatrix DM1 = DM0.ColumnSlice(start, numCols);
-            DenseMatrix DM2 = SM0.ColumnSlice(start, numCols);
+            DenseMatrix DM2 = SM0.ColumnSliceToDense(start, numCols);

            Assert::IsTrue(DM1.IsEqualTo(DM2, 0.0001));
        }
--- a/Math/CNTKMathTest/GPUSparseMatrixUnitTests.cpp
+++ b/Math/CNTKMathTest/GPUSparseMatrixUnitTests.cpp
@ -410,11 +410,11 @@ namespace CNTKMathTest
            SM0.SetValue(M0);

            GPUMatrix<float> M1 = M0.ColumnSlice(0, 2);
-            GPUMatrix<float> SM1 = SM0.ColumnSlice(0, 2);
+            GPUMatrix<float> SM1 = SM0.ColumnSliceToDense(0, 2);
            Assert::IsTrue(M1.IsEqualTo(SM1, 0.0001f));

            GPUMatrix<float> M2 = M0.ColumnSlice(1, 2);
-            GPUMatrix<float> SM2 = SM0.ColumnSlice(1, 2);
+            GPUMatrix<float> SM2 = SM0.ColumnSliceToDense(1, 2);
            Assert::IsTrue(M2.IsEqualTo(SM2, 0.0001f));

            Assert::IsFalse(M1.IsEqualTo(SM2, 0.0001f));
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@ -3918,7 +3918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            for (int instance_id = 0; instance_id < batch_size; instance_id++)
            for (int sample_id = 0; sample_id < sample_size; sample_id++)
            {
-                int sample =(int) (*this)(2 * sample_id, instance_id);
+                int sample = (int)(*this)(2 * sample_id, instance_id);
                for (int dim = 0; dim < b.GetNumRows(); dim++)
                    c(dim, instance_id) -= b(dim, sample)* tmp(sample_id, instance_id);
            }
@ -3961,16 +3961,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // a:   hidden
        // b:   embedding
        // tmp:  softmax
-        //  c: loglikelihood
+        // c: loglikelihood
    {
-        /*z
-        for (int i = 0; i < (*this).GetNumRows(); i++)
-        {
-            for (int j = 0; j < (*this).GetNumCols(); j++)
-                std::cerr << (*this)(i, j) << " ";
-            std::cerr << endl;
-        }
-        */
        double log_likelihood = 0.0;
        size_t sample_size = this->GetNumRows() / 2;
        size_t batch_size = this->GetNumCols();
@ -3980,7 +3972,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        for (int instance_id = 0; instance_id < batch_size; instance_id++)
        for (int sample_id = 0; sample_id < sample_size; sample_id++)
        {
-            int sample =(int) (*this)(2 * sample_id, instance_id);
+            int sample = (int)(*this)(2 * sample_id, instance_id);
            double score = bias(0, sample);
            for (int dim = 0; dim < b.GetNumRows(); dim++)
                score += a(dim, instance_id)* b(dim, sample);
@ -3994,9 +3986,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
            if (sample_id == 0)
                tmp(sample_id, instance_id) += 1;
-            log_likelihood += sample_id == 0 ? logprob : logprob_noise; 
+            log_likelihood += sample_id == 0 ? logprob : logprob_noise;
        }
-
        c(0, 0) = (ElemType)-log_likelihood;
    }

--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@ -1895,15 +1895,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        UNCONST(ElemType, a, my_a);
        UNCONST(ElemType, b, my_b);
        UNCONST(ElemType, bias, my_bias);
-
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        //a: dim * minibatch
        //b: dim * |vocab|
        int p = 512;
        int width = a.GetNumRows(); //dimension of hidden vector
-        //int width = a.GetNumCols(); original setup, considering column-major
-        //
+        
        while (p / 2 > width) p = p / 2;

        _computeNceOutput<ElemType> << <this->GetNumElements() / 2, p >> >(
@ -1915,28 +1913,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            my_b.GetArray(),//b
            my_bias.GetArray(),
            tmp.GetArray());//tmp
-
+     
        p = 512;
        while (p / 2 > this->GetNumElements() / 2) p = p / 2;
-
        // summing up objective must be done in one block
        _assignNoiseContrastiveEstimation<ElemType> << <1, p >> >(
            this->GetArray(),
            sampleCount,
            m_numRows / 2,
-             my_a.GetArray(),
+            my_a.GetArray(),
            a.GetNumCols(),
            my_b.GetArray(),
            tmp.GetArray(),
            c.GetArray());
-
-        _computeNceError<ElemType> << <1, p >> >(
-            this->GetArray(),
-            m_numRows / 2,
-            tmp.GetNumCols(),
-            tmp.GetArray());
-
-        cerr << "log-likelihood:" << Get00Element() << endl;
+      
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1953,8 +1943,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int p = 512;
        int width = a.GetNumRows();
        while (p / 2 > width) p = p / 2;
-
-        _assignNceDerivative<ElemType> << <m_nz, p >> >(
+        _assignNceDerivative<ElemType> << <this->GetNumElements() / 2, p >> >(
            GetArray(),
            tmp.GetNumCols(),
            m_numRows / 2,
@ -1964,7 +1953,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            tmp.GetArray(),
            c.GetArray(),
            inputIndex);
-
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@ -2928,80 +2928,10 @@ __global__ void _computeNceOutput(
        }

        if (threadIdx.x == 0)
-            res[i] = partials[0];
+            res[i] = partials[0] + bias[wid];
    }
 }

-template<class ElemType>
-__global__ void _computeNceOutput(
-    const ElemType* val,
-    const int* col,
-    int numRows,
-    int sampleCount,
-    const ElemType* a,
-    int numCols_a,
-    const ElemType* b,
-    ElemType* res)
-{
-    // val and col are in CSR format
-    // val is an array contains log_Pn(w). To differentiate positive and negative samples, 
-    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
-    // col is an array contains index of the word samples
-    // a is a matrix in column major format contains output from hidden layer
-    // b is the weight matrix for output layer
-    // res is the buffer to store computed output (sparse)
-
-    // follow the convention, this kernel must be run on 512 threads per block
-    __shared__ ElemType partials[512];
-    partials[threadIdx.x] = 0;
-
-    //threadIdx.x range from[0 ~ 512)
-    //blockIdx.x range from[0 ~ nnz)
-    //blockDim.x equal to 512
-    //gridDim.x equal to nnz
-
-    // determine the elements to be handled by this block
-    int total = numRows * sampleCount;
-    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
-
-    int start = loadPerBlock * blockIdx.x;
-    int end = min(total, loadPerBlock * (blockIdx.x + 1));
-
-    for (int i = start; i < end; i++)
-    {
-        int colIndex = col[i];
-        int rowIndex = i / sampleCount;
-
-        int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
-        int tstart = loadPerThread * threadIdx.x;
-        int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));
-
-        for (int j = tstart; j < tend; j++)
-            partials[threadIdx.x] = a[IDX2C(rowIndex, j, numRows)] * b[IDX2C(j, colIndex, numCols_a)];
-
-        __syncthreads();
-
-        // sum up
-        int nTotalThreads = blockDim.x;
-
-        while (nTotalThreads >1)
-        {
-            int halfPoint = (nTotalThreads >> 1);
-
-            if (threadIdx.x < halfPoint)
-                partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
-
-            __syncthreads();
-
-            nTotalThreads = (nTotalThreads >> 1);
-        }
-
-        if (threadIdx.x == 0)
-            res[i] = partials[0];
-    }
-}
-
-
 template<class ElemType>
 __global__ void _assignNoiseContrastiveEstimation(
    const ElemType* val,
@ -3010,7 +2940,7 @@ __global__ void _assignNoiseContrastiveEstimation(
    const ElemType* a,
    int width, // number of columns in a
    const ElemType* b,
-    const ElemType* tmp,
+    ElemType* tmp,
    ElemType* c) // run on 512 threads per block
 {
    // val and col are in CSR format
@ -3026,32 +2956,30 @@ __global__ void _assignNoiseContrastiveEstimation(
    partials[threadIdx.x] = 0;

    int total = numRows * sampleCount;
-    int loadPerThread = (total + 511) / 512;
+    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;

    // find out the items this thread is responsible for
    int start = loadPerThread * threadIdx.x;
    int end = min(total, loadPerThread * (threadIdx.x + 1));

+    ElemType log_num_noise_samples = log((ElemType)(sampleCount - 1));
    for (int i = start; i < end; i++)
    {
-        //int colIndex = col[i];
-        //int rowIndex = i / sampleCount;
-
-        // add to objective
-        ElemType log_pnw = val[2 * i + 1];
-        if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
-        {
-            ElemType den = tmp[i];
-            logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw);
-            partials[threadIdx.x] += (tmp[i] - den);
-        }
-        else // negative sample: log(k * pnw / (pw + k * pnw))
-        {
-            ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
-            ElemType den = nom;
-            logadd(den, tmp[i]);
-            partials[threadIdx.x] += (nom - den);
-        }
+        ElemType prob = -val[2 * i + 1];
+        bool positive = (prob > 0);
+        if (positive)
+            prob = -prob;
+        ElemType score_noise = log_num_noise_samples + prob;
+        ElemType z = logadd(tmp[i], score_noise);
+        ElemType logprob = tmp[i] - z;
+        ElemType logprob_noise = score_noise - z;
+        tmp[i] = -exp(logprob);
+        if (positive)
+            tmp[i] += 1;
+        if (positive)
+            partials[threadIdx.x] += logprob;
+        else
+            partials[threadIdx.x] += logprob_noise;
    }

    __syncthreads();
@ -3075,211 +3003,6 @@ __global__ void _assignNoiseContrastiveEstimation(
        c[0] = -partials[0];
 }

-template<class ElemType>
-__global__ void _assignNoiseContrastiveEstimation(
-    const ElemType* val,
-    const int* col, // only mentioned in the commented-out lines inside the loop below
-    int numRows,
-    int sampleCount,
-    const ElemType* a, // not read by this body
-    int width, // number of columns in a; not read by this body
-    const ElemType* b, // not read by this body
-    const ElemType* tmp,
-    ElemType* c) // run on 512 threads per block; only c[0] is written
-{
-    // val and col are in CSR format
-    // val is an array containing log_Pn(w). To differentiate positive and negative samples,
-    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
-    // col is an array containing the indices of the word samples
-    // a is a matrix in column-major format containing the output of the hidden layer
-    // b is the weight matrix for the output layer
-    // tmp is the buffer that stores the NCE output calculated by _computeNceOutput
-    // c is the matrix that receives the objective (this kernel stores a single value in c[0])
-
-    __shared__ ElemType partials[512]; // one accumulator slot per thread, so blockDim.x must not exceed 512
-    partials[threadIdx.x] = 0;
-
-    int total = numRows * sampleCount;
-    int loadPerThread = (total + 511) / 512; // ceil(total / 512); hard-codes the 512-thread launch instead of reading blockDim.x
-
-    // find out the half-open range [start, end) of items this thread is responsible for
-    int start = loadPerThread * threadIdx.x;
-    int end = min(total, loadPerThread * (threadIdx.x + 1));
-
-    for (int i = start; i < end; i++)
-    {
-        //int colIndex = col[i];
-        //int rowIndex = i / sampleCount;
-
-        // add this item's contribution to the thread's partial objective
-        ElemType log_pnw = val[i]; // the sign of this entry selects the branch below
-        if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
-        {
-            ElemType den = tmp[i];
-            logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw); // return value unused; presumably accumulates into den in place -- confirm against logadd's definition
-            partials[threadIdx.x] += (tmp[i] - den);
-        }
-        else // negative sample: log(k * pnw / (pw + k * pnw))
-        {
-            ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
-            ElemType den = nom;
-            logadd(den, tmp[i]); // same in-place assumption as in the positive branch
-            partials[threadIdx.x] += (nom - den);
-        }
-    }
-
-    __syncthreads(); // every thread's partial sum must be stored before the reduction reads it
-
-    // now sum up the objective function across the block
-    int nTotalThreads = blockDim.x;
-
-    while (nTotalThreads >1) // each pass folds the upper half onto the lower half; this covers every slot only if blockDim.x is a power of two
-    {
-        int halfPoint = (nTotalThreads >> 1);
-
-        if (threadIdx.x < halfPoint)
-            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
-
-        __syncthreads(); // finish this pass before the next one reads the updated slots
-
-        nTotalThreads = (nTotalThreads >> 1);
-    }
-
-    if (threadIdx.x == 0)
-        c[0] = -partials[0]; // thread 0 stores the negated block-wide sum
-}
-
-template<class ElemType>
-__global__ void _computeNceError(
-    const ElemType* val, // read at offset 2 * i + 1 for item i (see the loop below)
-
-    int numRows,
-    int sampleCount,
-    ElemType* tmp) // run on one block; each tmp[i] is read and then overwritten in place
-{
-    int total = numRows * sampleCount;
-    int loadPerThread = (total + blockDim.x - 1) / blockDim.x; // ceil(total / blockDim.x)
-
-    // find out the half-open range [start, end) of items this thread is responsible for
-    int start = loadPerThread * threadIdx.x; // depends on threadIdx.x only, so every block would cover the same items
-    int end = min(total, loadPerThread * (threadIdx.x + 1));
-
-    for (int i = start; i < end; i++)
-    {
-        ElemType ac = tmp[i]; // precalculated NCE output
-        ElemType log_pnw = val[2 * i + 1]; // the sign of this entry selects the branch below
-
-        ElemType er = 0;
-        if (log_pnw < 0) // positive sample: k * pnw / (pw + k * pnw)
-        {
-            ElemType nom = log((ElemType)(sampleCount - 1)) + log_pnw;
-            logadd(ac, nom); // return value unused; presumably accumulates into ac in place -- confirm against logadd's definition
-            er = -1 * exp(nom - ac);
-
-        }
-        else // negative sample: pw / (pw + k * pnw);
-        {
-            logadd(ac, log((ElemType)(sampleCount - 1)) - log_pnw); // same in-place assumption as in the positive branch
-            er = exp(tmp[i] - ac); // tmp[i] still holds the precalculated output at this point
-        }
-
-        tmp[i] = er; // replace the precalculated output with the error term
-    }
-}
-
-template<class ElemType>
-__global__ void _computeNceError(
-    const ElemType* val, // read at offset i for item i (see the loop below)
-    const int* col, // not read by this body
-    int numRows,
-    int sampleCount,
-    ElemType* tmp) // run on one block; each tmp[i] is read and then overwritten in place
-{
-    int total = numRows * sampleCount;
-    int loadPerThread = (total + blockDim.x - 1) / blockDim.x; // ceil(total / blockDim.x)
-
-    // find out the half-open range [start, end) of items this thread is responsible for
-    int start = loadPerThread * threadIdx.x; // depends on threadIdx.x only, so every block would cover the same items
-    int end = min(total, loadPerThread * (threadIdx.x + 1));
-
-    for (int i = start; i < end; i++)
-    {
-        ElemType ac = tmp[i]; // precalculated NCE output
-        ElemType log_pnw = val[i]; // the sign of this entry selects the branch below
-
-        ElemType er = 0;
-        if (log_pnw < 0) // positive sample: k * pnw / (pw + k * pnw)
-        {
-            ElemType nom = log((ElemType)(sampleCount - 1)) + log_pnw;
-            logadd(ac, nom); // return value unused; presumably accumulates into ac in place -- confirm against logadd's definition
-            er = -1 * exp(nom - ac);
-
-        }
-        else // negative sample: pw / (pw + k * pnw);
-        {
-            logadd(ac, log((ElemType)(sampleCount - 1)) - log_pnw); // same in-place assumption as in the positive branch
-            er = exp(tmp[i] - ac); // tmp[i] still holds the precalculated output at this point
-        }
-
-        tmp[i] = er; // replace the precalculated output with the error term
-    }
-}
-
-template<class ElemType>
-__global__ void _assignNceDerivativeInJbor(
-    const ElemType* val,
-    const int* col,
-    const int* colndx,
-    int numRows,
-    int sampleCount,
-    const ElemType* a,
-    int width, // number of columns in a
-    const ElemType* b,
-    const ElemType* tmp,
-    unsigned char* c,
-    size_t jborBlockSize)
-{ // the only statements in this body sit inside the block comment below, so the kernel performs no work and uses none of its parameters
-    // val and col are a CSR format sparse matrix for the label
-    // val is an array containing log_Pn(w). To differentiate positive and negative samples
-    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
-    // col is an array containing the indices of the word samples
-    // a is a matrix in column-major format containing the output of the hidden layer
-    // b is the weight matrix for the output layer
-    // tmp is a matrix of precalculated errors
-    // c is the output array to store intermediate results
-
-
-    /*
-    int total = numRows * sampleCount;
-    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
-
-    // find out the items this block is responsible for
-    int start = loadPerBlock * blockIdx.x;
-    int end = min(total, loadPerBlock * (blockIdx.x + 1));
-
-    for (int i = start; i < end; i++)
-    {
-    int colIndex = col[i];
-    int rowIndex = i / sampleCount;
-
-    ElemType er = tmp[i]; // precalculated error for this output node
-    ElemType log_pnw = val[i];
-
-    // calculate gradients
-    int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
-    int tstart = loadPerThread * threadIdx.x;
-    int tend = min(width, loadPerThread*(threadIdx.x + 1));
-    for (int j = tstart; j < tend; j++)
-    {
-    ElemType val = er * a[IDX2C(rowIndex, j, numRows)];
-
-    unsigned char* pj = c + (j*jborBlockSize + colndx[i]) * sizeof(JborRecord<ElemType>);
-    atomicExch((int*)pj, colIndex);
-    atomicAdd((ElemType*)(pj + sizeof(int)), val);
-    }
-    }*/
-}
-
 template<class ElemType>
 __global__ void _assignNceDerivative(
    const ElemType* val,
@ -3310,12 +3033,11 @@ __global__ void _assignNceDerivative(

    for (int i = start; i < end; i++)
    {
-        int colIndex = (int)val[2 * i];
-        int rowIndex = i / sampleCount;
+        int wid = (int)val[2 * i];
+        int batchId = i / sampleCount;

        ElemType er = tmp[i]; // precalculated error for this output node
-        //ElemType log_pnw = val[2 * i + 1];
-
+      
        // calculate gradients
        int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
@ -3325,23 +3047,24 @@ __global__ void _assignNceDerivative(
        {
            for (int j = tstart; j < tend; j++)
            {
-                ElemType val = er * b[IDX2C(j, colIndex, width)];
-                atomicAdd(c + IDX2C(j, rowIndex, width), val);
-                //c[IDX2C(rowIndex, j, numRows)] += val;
+                ElemType val = -er * b[IDX2C(j, wid, width)];
+                atomicAdd(&c[IDX2C(j, batchId, width)], val);
+                //c[IDX2C(j, batchId, width)] += val;
+                //c[IDX2C(batchId, j, numRows)] += val;
            }
        }
        else if (inputIndex == 2) // weight
        {
            for (int j = tstart; j < tend; j++)
            {
-                ElemType val = er * a[IDX2C(j, rowIndex, width)];
-                atomicAdd(c + IDX2C(j, colIndex, width), val);
-                //c[IDX2C(j, colIndex, width)] += val;
+                ElemType val = -er * a[IDX2C(j, batchId, width)];
+                atomicAdd(&c[IDX2C(j, wid, width)], val);
+                //c[IDX2C(j, wid, width)] += val;
            }
        }
        else //bias vector
        {
-            c[colIndex] += er;
+            c[wid] -= er;
        }
    }
 }
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -11,6 +11,7 @@
 #include <assert.h>
 #include <math.h>
 #include "GPUWatcher.h"     // bring in this class as well so that it gets exported from this DLL
+#include <iostream>

 #ifndef CPUONLY
 #pragma comment (lib, "CNTKMathCUDA.lib")   // built by CNTKMathCUDA project
@ -3687,8 +3688,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            throw std::logic_error("AssignNoiseContrastiveEstimation: one of the input matrices is empty.");

        if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId())
+        {
+            std::cerr << a.GetDeviceId() << " " << b.GetDeviceId() << " " << c.GetDeviceId() << " " << this->GetDeviceId() << std::endl;
            NOT_IMPLEMENTED;
-
+        }
+        
        this->Resize(1, 1);

        if (this->GetDeviceId() < 0)
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@ -232,6 +232,12 @@ namespace Microsoft {

            template<class ElemType> void GPUSparseMatrix<ElemType>::InplaceTranspose() { }

+            template<class ElemType>
+            GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSliceToDense(size_t startColumn, size_t numCols) const // neither parameter is used by this definition
+            {
+                return ElemType(0); // NOTE(review): the declared return type is GPUMatrix<ElemType>, so this relies on an implicit conversion from ElemType -- confirm that is intended
+            }
+
            template<class ElemType> ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
            {
                return ElemType(0);
@ -1053,7 +1059,7 @@ namespace Microsoft {

            template<class ElemType>
            void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemType>& a,
-                const GPUMatrix<ElemType>& b, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
+        const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& bias, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
            { // empty body: this definition performs no work and leaves every argument untouched
            }