Conflicts:
	DataReader/HTKMLFReader/HTKMLFReader.cpp
	DataReader/HTKMLFReader/HTKMLFReader.h
This commit is contained in:
Mike Seltzer 2015-02-06 16:14:47 -08:00
Родитель 26d9e66b87 f3dfe81034
Коммит b4f465b8c4
34 изменённых файлов: 21951 добавлений и 21411 удалений

6
.gitignore поставляемый
Просмотреть файл

@ -160,3 +160,9 @@ $RECYCLE.BIN/
*.lyx~
*.bak
*.lyx#
# =========================
# prebuild file
# =========================
MachineLearning/cn/buildinfo.h

Просмотреть файл

@ -5,8 +5,9 @@
//
// ConfigFile.cpp : Defines the configuration file loader.
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "File.h"
#include "commandArgUtil.h"

Просмотреть файл

@ -4,7 +4,9 @@
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "basetypes.h"
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines

Просмотреть файл

@ -0,0 +1,13 @@
#pragma once
// Milliseconds per second -- used by callers to convert MilliSecondElapsed() deltas into seconds.
#define MS_PER_SEC 1000
namespace Microsoft{namespace MSR {namespace CNTK {
// Minimal wall-clock timer utility.
// Usage: take two MilliSecondElapsed() samples and subtract to measure elapsed wall time.
class Timer
{
public:
Timer(){};
~Timer(){};
// Milliseconds elapsed since the Unix epoch (wall clock via system time, not monotonic --
// see TimerUtility.cpp: GetSystemTimeAsFileTime on Windows, CLOCK_REALTIME on Linux).
static unsigned long long MilliSecondElapsed();
};
}}}

39
Common/TimerUtility.cpp Normal file
Просмотреть файл

@ -0,0 +1,39 @@
#include "TimerUtility.h"
#ifdef WIN32
#include <Windows.h>
#else
#include <time.h>
#endif
namespace Microsoft{
namespace MSR {
namespace CNTK {
// Returns the number of milliseconds of wall-clock time since the Unix epoch (1970-01-01).
// Windows: converts FILETIME (100 ns ticks since 1601) to the Unix epoch so values agree
// with the Linux implementation. Not monotonic -- subject to system clock adjustments;
// intended for coarse elapsed-time measurement (pair with MS_PER_SEC to get seconds).
unsigned long long Timer::MilliSecondElapsed()
{
#ifdef WIN32
    FILETIME ft;
    LARGE_INTEGER li;
    GetSystemTimeAsFileTime(&ft); // ideally GetSystemTimePreciseAsFileTime, but that is only available on Win8+ and Win Server 2012+
    li.LowPart = ft.dwLowDateTime;
    li.HighPart = ft.dwHighDateTime;
    unsigned long long ret = li.QuadPart;
    ret -= 116444736000000000LL; // epoch difference 1601 -> 1970 in 100 ns units, to match Linux
    ret /= 10000;                // from 100 nanoseconds (10^-7) to 1 millisecond (10^-3)
    return ret;
#else
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
    // BUGFIX: use a portable 64-bit type -- UINT64 is a Windows typedef and is undefined in
    // this (non-WIN32) branch. Also widen tv_sec BEFORE multiplying so the arithmetic cannot
    // overflow on platforms where time_t is 32 bits.
    unsigned long long ret = static_cast<unsigned long long>(ts.tv_sec) * 1000ULL + ts.tv_nsec / 1000000;
    return ret;
#endif
}
}
}
}

Просмотреть файл

@ -4,7 +4,10 @@
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux

Просмотреть файл

@ -49,8 +49,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_truncated = readerConfig("Truncated", "false");
m_convertLabelsToTargets = false;
m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
{
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
if (m_numberOfuttsPerMinibatch < 1)
{
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
@ -60,6 +64,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
}
}
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get the read method, defaults to "blockRandomize" other option is "rollingWindow"
std::string readMethod(readerConfig("readMethod","blockRandomize"));
if (readMethod == "blockRandomize" && randomize == randomizeNone)
{
fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
randomize = randomizeAuto;
}
// see if they want to use readAhead
m_readAhead = readerConfig("readAhead", "false");
@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
m_frameSource->setverbosity(verbosity);
//m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
}
@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_mbSize = mbSize;
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
if (m_trainOrTest)
{
StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);

Просмотреть файл

@ -1,3 +1,4 @@
<<<<<<< HEAD
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
@ -111,4 +112,117 @@ public:
void SetSentenceEnd(int /*actualMbSize*/){};
};
=======
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
#include "DataReader.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// HTKMLFReader: IDataReader implementation for HTK feature files and MLF label files.
// Supports multiple parallel utterances per minibatch and multiple input/output
// streams ("MultiIO"), for both training/testing and output-writing modes.
template<class ElemType>
class HTKMLFReader : public IDataReader<ElemType>
{
private:
// Minibatch iteration state and underlying data/lattice sources.
msra::dbn::minibatchiterator* m_mbiter;
msra::dbn::minibatchsource* m_frameSource;
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
msra::dbn::FileEvalSource* m_fileEvalSource;
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
vector<size_t> m_processedFrame;
// Parallel-utterance counts; may be specified per epoch in the config
// (re-applied each StartMinibatchLoop from m_numberOfuttsPerMinibatchForAllEpochs[epoch]).
intargvector m_numberOfuttsPerMinibatchForAllEpochs;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
bool m_noData;
bool m_trainOrTest; // if false, in file writing mode
std::map<LabelIdType, LabelType> m_idToLabelMap;
bool m_partialMinibatch; // allow partial minibatches?
// Per-utterance feature/label staging buffers (one entry per parallel utterance).
std::vector<ElemType*> m_featuresBufferMultiUtt;
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
std::vector<ElemType*> m_labelsBufferMultiUtt;
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
std::vector<size_t> m_featuresStartIndexMultiUtt;
std::vector<size_t> m_labelsStartIndexMultiUtt;
// Per-stream buffers for networks with multiple inputs/outputs.
std::vector<ElemType*> m_featuresBufferMultiIO;
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
std::vector<ElemType*> m_labelsBufferMultiIO;
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
// Stream-name lookup tables (name -> id / type / dimension).
std::map<std::wstring,size_t> m_featureNameToIdMap;
std::map<std::wstring,size_t> m_labelNameToIdMap;
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
std::vector <bool> m_convertLabelsToTargetsMultiIO;
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
size_t m_inputFileIndex;
std::vector<size_t> m_featDims;
std::vector<size_t> m_labelDims;
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
// Mode-specific initialization helpers, selected by Init() from the config.
void PrepareForTrainingOrTesting(const ConfigParameters& config);
void PrepareForWriting(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool ReNewBufferForMultiIO(size_t i);
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
// Stream value type: dense real-valued features vs. categorical (one-hot) labels.
enum InputOutputTypes
{
real,
category,
};
public:
// IDataReader interface implementation.
virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
};
>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d
}}}

Просмотреть файл

@ -768,6 +768,7 @@ private:
if (chunkdata.isinram())
return false;
if (verbosity)
fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
@ -858,6 +859,7 @@ public:
transcripts.clear();
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
@ -922,6 +924,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
const size_t windowend = randomizedchunks[lastchunk].windowend;
if (verbosity > 0)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside

Просмотреть файл

@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource
bool isinram() const { return !frames.empty(); }
// page in data for this chunk
// We pass in the feature info variables by ref which will be filled lazily upon first read
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
{
if (numutterances() == 0)
throw std::logic_error ("requiredata: cannot page in virgin block");
@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource
latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
}
//fprintf (stderr, "\n");
if (verbosity)
fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
}
catch (...)
@ -568,6 +569,7 @@ private:
return sweep;
currentsweep = sweep;
if (verbosity>0)
fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
@ -919,10 +921,11 @@ private:
{
auto & chunk = randomizedchunks[m][chunkindex];
auto & chunkdata = chunk.getchunkdata();
if (verbosity)
fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
});
}
chunksinram++;
@ -1029,6 +1032,7 @@ public:
}
}
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
@ -1107,6 +1111,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
const size_t windowend = randomizedchunks[0][lastchunk].windowend;
if (verbosity)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside
@ -1230,3 +1235,4 @@ public:
};
};};

Просмотреть файл

@ -15,7 +15,7 @@
using namespace std;
#define MAXSTRING 2048
#define MAXSTRING 500000
// UCI label location types
enum LabelMode
{

Просмотреть файл

@ -62,10 +62,7 @@ template<class ElemType>
// not yet found, add to the map
if (found == labelInfo.mapLabelToId.end())
{
labelInfo.mapLabelToId[labelValue] = labelInfo.idMax;
labelInfo.mapIdToLabel[labelInfo.idMax] = labelValue;
found = labelInfo.mapLabelToId.find(labelValue);
labelInfo.idMax++;
RuntimeError("%s not in vocabulary", labelValue.c_str());
}
return found->second;
}

Просмотреть файл

@ -60,7 +60,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@ -79,7 +79,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
@ -107,6 +107,7 @@
<ClInclude Include="..\..\Common\Include\Eval.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
@ -127,6 +128,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="..\cn\ComputationNode.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>

Просмотреть файл

@ -19,6 +19,9 @@
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />
@ -38,6 +41,9 @@
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

Просмотреть файл

@ -3111,6 +3111,9 @@ protected: \
inputGradientValues.Print("child Gradient-in/out");
inputFunctionValues.Print("child Function values");
#endif
//currently we only support one combination when the input is sparse.
if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol);
Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
#if DUMPOUTPUT

Просмотреть файл

@ -257,7 +257,12 @@ public:
bool optParam = param->GetType() == ndlTypeOptionalParameter;
if (optParam && !_stricmp(param->GetName().c_str(), name.c_str()))
{
return param->GetValue();
auto paramValue = param->GetValue();
auto resolveParamNode = m_parent->ParseVariable(paramValue, false);
if (resolveParamNode != nullptr)
return resolveParamNode->GetScalar();
else
return paramValue;
}
}
return ConfigValue(deflt);

Просмотреть файл

@ -17,6 +17,7 @@
#include "commandArgUtil.h"
#include <chrono>
#include <random>
#include "TimerUtility.h"
#ifdef MPI_SUPPORT
#include "mpi.h"
@ -121,6 +122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SGD(const ConfigParameters& configSGD)
{
ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
m_needToNormalizeLRByParallUtterance = false;
floatargvector learningRatesPerMB = learningRatesPerMBStr;
ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
@ -206,13 +208,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true");
bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation,
clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor,
reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates,
loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult,
maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight,
trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading,
rpi, learnRateAdjustInterval);
rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode);
}
void setMomentum(float momentum)
@ -234,7 +238,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None,
const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"",
const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true,
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1)
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed=true)
{
numPrevLearnRates;
m_mbSize=mbSize;
@ -272,6 +276,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_trainCriterionNodeName = trainCriterionNodeName;
m_evalCriterionNodeName = evalCriterionNodeName;
m_useAllDataForPreComputedNode = UsingAllDataForPreComputed;
for (size_t i=0; i<m_mbSize.size(); i++)
if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
@ -298,6 +303,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_learningRatesPerSample[i] = learningRatesPerMB[i]/m_mbSize[i];
}
m_needToNormalizeLRByParallUtterance = true;
}
m_momentumPerMB = 0.9f;
if (momentumPerMB.size() >0)
@ -521,6 +527,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (0 == myRank) // only needs to be done by one process
net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
// first, we need to normalize the effect of nbruttsineachrecurrentiter
if (trainSetDataReader->NumberSlicesInEachRecurrentIter()>1 && m_needToNormalizeLRByParallUtterance)
{
for (auto & x : m_learningRatesPerSample)
{
x /= trainSetDataReader->NumberSlicesInEachRecurrentIter();
}
}
bool learnRateInitialized = false;
if (startEpoch > 0)
{
@ -557,7 +571,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
{
auto t_start_epoch = clock();
auto t_start_epoch = Timer::MilliSecondElapsed();
// set other information to inputMatrices that can contrain information
// used for class-based LM for clustring information
@ -600,12 +614,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef MPI_SUPPORT
INT32 mySamples = (INT32)
#endif
fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", (int)startEpoch, learnRatePerSample, m_momentumPerMB);
TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes,
criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients,
epochCriterion, epochEvalErrors, totalSamplesSeen);
auto t_end_epoch = clock();
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
auto t_end_epoch = Timer::MilliSecondElapsed();
ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC);
fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
if (epochEvalErrors.size() == 1)
@ -800,7 +815,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//compute
//trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize);
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
// trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
// [1/12/2015 erw] to support large datasets, we usually partition the whole dataset into several epochs, so we need to use all the data to do precomputing
if (m_useAllDataForPreComputedNode)
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); // using all the data
else
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // using only one epoch of data
while (trainSetDataReader->GetMinibatch(inputMatrices))
{
@ -983,8 +1003,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<ElemType> epochEvalErrorsLastMBs(epochEvalErrors.size(),0);
PTaskGraphBuilder<ElemType>* ptaskGraphBuilder = NULL;
clock_t startReadMBTime = 0, startComputeMBTime=0;
clock_t endReadMBTime=0, endComputeMBTime=0;
unsigned long long startReadMBTime = 0, startComputeMBTime=0;
unsigned long long endReadMBTime = 0, endComputeMBTime = 0;
//initialize statistics
size_t totalEpochSamples = 0;
@ -1027,14 +1047,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
startReadMBTime=clock();
startReadMBTime=Timer::MilliSecondElapsed();
while (trainSetDataReader->GetMinibatch(inputMatrices))
{
#ifdef MPI_SUPPORT
DecimateMinibatch(inputMatrices);
#endif
endReadMBTime=clock();
startComputeMBTime=clock();
endReadMBTime=Timer::MilliSecondElapsed();
startComputeMBTime=Timer::MilliSecondElapsed();
UpdateEvalTimeStamps(FeatureNodes);
UpdateEvalTimeStamps(labelNodes);
@ -1113,12 +1133,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
endComputeMBTime=clock();
endComputeMBTime=Timer::MilliSecondElapsed();
numMBsRun ++;
if (m_traceLevel > 0)
{
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(CLOCKS_PER_SEC);
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/CLOCKS_PER_SEC;
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(MS_PER_SEC);
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/MS_PER_SEC;
readTimeInMBs += MBReadTime;
ComputeTimeInMBs += MBComputeTime;
@ -1149,7 +1169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
}
}
startReadMBTime=clock();
startReadMBTime=Timer::MilliSecondElapsed();
totalEpochSamples += actualMBSize;
totalSamplesSeen += actualMBSize;
@ -1506,6 +1526,7 @@ protected:
protected:
floatargvector m_learningRatesPerSample; /// learning rate per sample provided outside
bool m_needToNormalizeLRByParallUtterance; // only true when the user specify LearningRatePerMB and the number of parallel utterances in Reader > 1
intargvector m_mbSize;
size_t m_epochSize;
size_t m_maxEpochs;
@ -1557,6 +1578,8 @@ protected:
ElemType m_gradientCheckSigDigit;
bool m_validateAfterModelReloading;
bool m_useAllDataForPreComputedNode;
};
template class SGD<float>;
template class SGD<double>;

Просмотреть файл

@ -102,7 +102,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
evalResultsLastMBs.push_back((ElemType)0);
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
dataReader.SetNbrSlicesEachRecurrentIter(1);
for (int i=0; i<evalNodes.size(); i++)
{

Просмотреть файл

@ -228,7 +228,7 @@ public:
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = false;
}
else if (pass == ndlPassFinal)
else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->FunctionValues().SetValue(val);

Просмотреть файл

@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_children.size() != 3)
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
if (Inputs(0)->OperationName() != L"SparseInputValue"
&& Inputs(0)->OperationName() != L"InputValue")
if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
&& Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed

Просмотреть файл

@ -28,6 +28,7 @@
#include "io.h"
#endif
#include "hostname.h"
#include "buildinfo.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
@ -619,8 +620,26 @@ int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv)
}
#endif
// Emit a banner to stderr describing how this binary was built.
// The _BUILD* macros come from buildinfo.h, generated by prebuild.bat.
void PrintBuiltInfo()
{
    static const char separator[] = "-------------------------------------------------------------------\n";
    fputs(separator, stderr);
    fputs("Build info: \n\n", stderr);
    fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
    fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
    fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
    fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#ifdef _GIT_EXIST
    // Only emitted when prebuild.bat found a git checkout.
    fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
    fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
    fputs(separator, stderr);
}
int wmain(int argc, wchar_t* argv[])
{
try
{
#ifdef MPI_SUPPORT
@ -647,6 +666,8 @@ int wmain(int argc, wchar_t* argv[])
// get the command param set they want
wstring logpath = config("stderr", L"");
// [1/26/2015 erw, add done file so that it can be used on HPC]
wstring DoneFile = config("DoneFile", L"");
ConfigArray command = config("command", "train");
if (logpath != L"")
@ -663,9 +684,14 @@ int wmain(int argc, wchar_t* argv[])
oss << myRank;
logpath += L"rank" + oss.str();
}
RedirectStdErr(logpath);
}
PrintBuiltInfo();
std::string timestamp = TimeDateStamp();
if (myRank == 0) // main process
@ -714,8 +740,15 @@ int wmain(int argc, wchar_t* argv[])
DoCommand<double>(config);
else
RuntimeError("invalid precision specified: %s", type.c_str());
// still here , write a DoneFile if necessary
if (!DoneFile.empty()){
FILE* fp = fopenOrDie(DoneFile.c_str(), L"w");
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(),GetHostName().c_str());
fcloseOrDie(fp);
}
catch(const std::exception &err)
}
catch (const std::exception &err)
{
fprintf(stderr, "EXCEPTION occurred: %s", err.what());
#ifdef _DEBUG

Просмотреть файл

@ -139,6 +139,9 @@
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>Copy content files to target directory</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
@ -199,6 +202,9 @@
<Message>
</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<Text Include="DefaultMacros.txt" />
@ -216,6 +222,7 @@
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\hostname.h" />
<ClInclude Include="..\..\Common\Include\nvml.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="CompositeComputationNode.h" />
<ClInclude Include="ComputationNetwork.h" />
<ClInclude Include="ComputationNetworkHelper.h" />
@ -249,6 +256,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="cn.cpp" />
<ClCompile Include="ComputationNode.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />

Просмотреть файл

@ -43,6 +43,9 @@
<ClCompile Include="NetworkDescriptionLanguage.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h">
@ -138,6 +141,9 @@
<ClInclude Include="..\..\Common\Include\hostname.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">

Просмотреть файл

@ -0,0 +1,30 @@
@echo off
:: Generates buildinfo.h (build machine, builder, path, and -- if git is available --
:: branch and commit SHA1) for inclusion into the binary's PrintBuiltInfo banner.
setlocal enabledelayedexpansion
echo #ifndef _BUILDINFO_H > buildinfo.h
echo #define _BUILDINFO_H >> buildinfo.h
FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i
:: assuming hostname always exists
:: Probe for git, suppressing all output.
:: BUGFIX: was "git --version 2 > nul", which passes a literal "2" to git and
:: redirects stdout only; ">nul 2>&1" silences both streams.
git --version >nul 2>&1
:: BUGFIX: cmd reports "command not found" with errorlevel 9009 (was mistyped as
:: 9909, so _GIT_EXIST was emitted even when git was missing).
if not %ERRORLEVEL% == 9009 (
echo #define _GIT_EXIST >> buildinfo.h
FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO SET BRANCH=%%i
FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO SET COMMIT=%%i
:: BUGFIX: use delayed expansion -- %BRANCH%/%COMMIT% are expanded when this
:: parenthesized block is parsed, i.e. BEFORE the SETs above have run.
echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h
echo #define _BUILDSHA1_ "!COMMIT!" >> buildinfo.h
)
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h
set a=%~dp0
set buildpath="%a:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h
echo #endif >> buildinfo.h
endlocal

Просмотреть файл

@ -78,6 +78,53 @@ namespace CNTKMathTest
Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision
}
// Verifies that dense * sparse(CSC) multiplication matches the dense * dense reference result.
TEST_METHOD(CPUMatrixDenseTimesSparse)
{
Matrix<float> Ad(CPUDEVICE);
// Truncating uniform noise at 0 zeroes out most entries, making Ad effectively sparse.
// (RandomUniform args presumably (rows, cols, low, high, seed) -- TODO confirm.)
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024, 2048, -3, 0.1, 0), 0);
Matrix<float> As(Ad); // same values, converted to sparse CSC storage
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C1(C); // identical starting accumulator for the sparse path
float alpha = 0.3, beta = 0;
bool transposeA = false, transposeB = false;
// C  = alpha*B*Ad + beta*C  (dense reference); C1 = alpha*B*As + beta*C1 (path under test)
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
Assert::IsTrue(C1.IsEqualTo(C, 0.0001));
// Repeat with nonzero beta to exercise accumulation into existing values.
alpha = 3.3, beta = 1.3;
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
// TODO IsEqualTo NYI
// Assert::IsTrue(C1.IsEqualTo(C, 0.00005));
}
// Verifies dense * sparse(CSC)^T producing a sparse (block-column) result
// against the same product accumulated into a dense matrix.
TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse)
{
    // Dense operand with many zeros, plus a CSC sparse copy of it.
    Matrix<float> denseOp(CPUDEVICE);
    denseOp.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048, 1024, -3, 0.1, 0), 0);
    Matrix<float> sparseOp(denseOp);
    sparseOp.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);

    Matrix<float> lhs = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
    Matrix<float> denseResult = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
    Matrix<float> blockColResult(CPUDEVICE);
    blockColResult.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol);

    float alpha = 0.3f;
    float beta = 0.0f;
    bool noTransA = false;
    bool transB = true;

    // Same product, once into a sparse block-column target, once into dense.
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, noTransA, sparseOp, transB, beta, blockColResult);
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, noTransA, sparseOp, transB, beta, denseResult);

    // TODO IsEqualTo NYI
    //  Assert::IsTrue(blockColResult.IsEqualTo(denseResult, 0.0001));
}
TEST_METHOD(MatrixSparseTimesSparse)
{
Matrix<float> Ad;

Просмотреть файл

@ -111,7 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
m_blockSize = 0;
m_blockVal = NULL;
m_pArray = NULL;
m_blockIds = NULL;
}
}
@ -125,6 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format");
}
m_format = format;
m_default = defaultElem();
ZeroInit();
}
@ -160,8 +161,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
if(m_blockVal != NULL)
delete[] m_blockVal;
if (m_pArray != NULL)
delete[] m_pArray;
if(m_blockIds != NULL)
delete[] m_blockIds;
}
@ -219,16 +220,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
{
if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
{
return m_pArray;
}
else
{
return m_blockVal;
}
}
template<class ElemType>
void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
@ -280,16 +274,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (keepExistingValues && m_elemSizeAllocated > 0)
{
assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve);
memcpy(blockVal, m_blockVal, NzSize());
memcpy(blockVal, m_pArray, NzSize());
memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
}
if (m_blockVal != NULL)
delete[] m_blockVal;
if (m_pArray != NULL)
delete[] m_pArray;
if(m_blockIds != NULL)
delete[] m_blockIds;
m_blockVal = blockVal;
m_pArray = blockVal;
m_blockIds = blockIds;
}
@ -457,10 +451,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ // h range over hidden layer
if(first == true)
{
c.m_blockVal[pos] = alpha*lhs(h, j)*val;
c.m_pArray[pos] = alpha*lhs(h, j)*val;
} else
{
c.m_blockVal[pos] += alpha*lhs(h, j)*val;
c.m_pArray[pos] += alpha*lhs(h, j)*val;
}
pos++;
}
@ -522,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j * len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = lhs.m_blockVal[p];
ElemType val = lhs.m_pArray[p];
size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
@ -537,6 +531,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// Returns true iff a and b have identical dimensions and every pair of
// corresponding elements differs by no more than threshold.
// Throws std::logic_error if either matrix is empty.
template<class ElemType>
bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
{
    if (a.IsEmpty() || b.IsEmpty())
        throw std::logic_error("AreEqual: one of the input matrices is empty.");

    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;

    bool result = true;

#pragma omp parallel for
    foreach_coord(i, j, a)
    {
        // A 'break' here would only leave the inner (row) loop of the
        // foreach_coord nest and cannot terminate the OpenMP-parallelized
        // outer loop, so instead we skip further comparisons once a mismatch
        // is seen. The shared flag only ever transitions true -> false.
        if (result && abs(a(i, j) - b(i, j)) > threshold)
        {
            result = false;
        }
    }

    return result;
}
// a: H x No: H is hidden layer size and No is mini-batch size
// weight: V x H, V is vocab size
// label: V x No
@ -698,11 +716,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ // h range over hidden layer
if(first == true)
{
grd.m_blockVal[pos] = input(h, j)*error.m_pArray[p];
grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
}
else
{
grd.m_blockVal[pos] += input(h, j)*error.m_pArray[p];
grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
}
pos++;
}
@ -735,11 +753,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = m_blockVal[p];
ElemType val = m_pArray[p];
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
c(row, col) = (1-momentum)*val + momentum*c(row, col);
m_blockVal[p] = c(row, col);
m_pArray[p] = c(row, col);
}
}
}
@ -790,14 +808,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = m_blockVal[p];
ElemType val = m_pArray[p];
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
ElemType adenorm = c(row, col);
adenorm += val * val;
val = val / (floor + sqrt(adenorm));
m_blockVal[p] = val;
m_pArray[p] = val;
c(row, col) = adenorm;
}
}
@ -818,13 +836,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for (size_t p = start; p < start+len; p++)
{
if (m_blockVal[p] > locThresholdPos)
if (m_pArray[p] > locThresholdPos)
{
m_blockVal[p] = locThresholdPos;
m_pArray[p] = locThresholdPos;
}
else if (m_blockVal[p] < locTHresholdNeg)
else if (m_pArray[p] < locTHresholdNeg)
{
m_blockVal[p] = locTHresholdNeg;
m_pArray[p] = locTHresholdNeg;
}
}
}

Просмотреть файл

@ -34,6 +34,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSparseMatrix(const MatrixFormat format);
CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
~CPUSparseMatrix();
public:
@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
/// sum(vec(a).*vec(b))
static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
void Reset();
// Returns a zero-initialized ElemType; cached in m_default and handed out
// (by reference from operator()) for positions with no stored entry.
inline ElemType defaultElem()
{
    // 'default' is a C++ keyword and cannot be used as an identifier
    // (the previous local was named 'default', which does not compile).
    ElemType defaultValue;
    memset(&defaultValue, 0, sizeof(ElemType));
    return defaultValue;
}
// Read-only element access. For CSC format, scans the stored row indices of
// column 'col' and returns a reference to the stored value when present;
// otherwise returns a reference to m_default (a zero-initialized element, see
// defaultElem), so the returned reference must never be written through.
// Only CSC is implemented; other formats hit NOT_IMPLEMENTED.
const ElemType& operator() (const size_t row, const size_t col) const
{
    if (col >= m_numCols || row >= m_numRows)
    {
        throw std::runtime_error("Position outside matrix dimensions");
    }

    if (m_format == MatrixFormat::matrixFormatSparseCSC)
    {
        // m_compIndex[col] .. m_compIndex[col+1] delimit this column's entries
        size_t start = m_compIndex[col];
        size_t end = m_compIndex[col + 1];
        for (size_t p = start; p < end; p++)
        {
            size_t i = m_unCompIndex[p]; // row id of the p-th stored value
            if (i == row)
            {
                return m_pArray[p];
            }
        }

        return m_default; // element not stored -> default (zero) element
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
void Adagrad(CPUMatrix<ElemType>& c);
@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
const ElemType* NzValues() const { return m_pArray; }
ElemType* NzValues() { return m_pArray; }
inline ElemType* NzValues() { return m_pArray; }
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
@ -140,8 +178,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t *m_blockIds; //block ids
ElemType m_default;
};
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;

Просмотреть файл

@ -3356,45 +3356,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Static BLAS Functions
//#pragma region File << and >> operators
// template<class ElemType>
// File& operator>>(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.GetMarker(fileMarkerBeginSection, std::string("BMAT"));
// size_t elsize;
// stream>>elsize;
// if (sizeof(ElemType)!=elsize)
// throw std::runtime_error("Template argument size doesn't match those in file");
// std::wstring matrixName;
// size_t numRows, numCols;
// stream>>matrixName>>numRows>>numCols;
// ElemType* d_array = new ElemType[numRows*numCols];
// for (long i=0;i<numRows*numCols;++i)
// stream>>d_array[i];
// stream.GetMarker(fileMarkerEndSection, std::string("EMAT"));
// us.SetValue(numRows,numCols,d_array, matrixFlagNormal);
// us.m_matrixName = matrixName;
// return stream;
// }
//
// template<class ElemType>
// File& operator<<(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.PutMarker(fileMarkerBeginSection, std::string("BMAT"));
// stream<<sizeof(ElemType)<<us.m_matrixName<<us.m_numRows<<us.m_numCols;
// ElemType *d_array = us.CopyToArray();
// for (long i=0;i<us.GetNumElements();++i)
// stream<<d_array[i];
// stream.PutMarker(fileMarkerEndSection, std::string("EMAT"));
// return stream;
// }
//
//#pragma endregion File << and >> operators
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class DeviceBoundNumber<float>;

Просмотреть файл

@ -2250,37 +2250,12 @@ __global__ void _sparseCSRElemMulDense(
}
}
// forward pass from feature to hidden layer
/*template<class ElemType>
__global__ void _denseMulSparseCSCToDense(
ElemType alpha,
const ElemType* lhs,
int numrows,
int numcols,
const GPUSPARSE_INDEX_TYPE* row,
ElemType* c)
{
int loadPerThread = (numrows+blockDim.x-1)/blockDim.x;
int tStart = loadPerThread * threadIdx.x;
int tEnd = min(numrows, loadPerThread + tStart);
int p = blockIdx.x;
int i = row[p];
int j = blockIdx.x;
for (int h = tStart; h < tEnd; h++)
{
ElemType res = alpha * lhs[IDX2C(h, i, numrows)];
atomicAdd(&c[IDX2C(h,j,numrows)], res);
}
}*/
//c = alpha * op(a) * op(b) + beta*c
//this function can be further improved by using shared memory
template<class ElemType>
__global__ void _denseMultSparseCSCAndWeightedAddToDense(
int m, //rowDense
int k, //colDense = rowSparse
int n, //colSparse
ElemType alpha,
const ElemType* a, //dense
@ -2311,62 +2286,77 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
}
// backward pass from hidden layer to feature weight
//result (sparse BlockCol)= alpha * (lhs (dense) X rhs^T (sparse CSC)
//assume resultValues are 0-initialized
template<class ElemType>
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
ElemType alpha,
ElemType* lhs,
size_t nrs,
ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* row,
const size_t* rowIdx,
ElemType* blockVal,
size_t* blockIds)
const ElemType alpha,
const ElemType* lhsValues,
const size_t numRowsLhs,
const size_t numColsRhs,
const ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* rhsRows,
const GPUSPARSE_INDEX_TYPE* rhsCols,
const size_t* rhsRowIdx,
ElemType* resultValues,
size_t* resultBlockIds)
{
int p = blockIdx.x;
int i = row[p];
int ii = rowIdx[p];
int j = blockIdx.x;
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
const LONG64 lhsCol = index / numRowsLhs; //rhsCol == lhsCol
if (lhsCol >= numColsRhs)
return;
const LONG64 lhsRow = index - numRowsLhs*lhsCol; //resultRow == lhsRow
int load = (nrs+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)nrs, load + pStart);
//each thread handles one [row, col] combination
ElemType lhsValue = alpha*lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];
for(int h = pStart; h < pEnd; h++)
LONG64 start = rhsCols[lhsCol]; //rhsCol == lhsCol
LONG64 end = rhsCols[lhsCol + 1];
for (LONG64 p = start; p < end; p++)
{
ElemType temp = alpha*lhs[IDX2C(h, j, nrs)]*rhsNZValues[p];
atomicAdd(&blockVal[ii*nrs+h], temp);
blockIds[ii] = i;
LONG64 rhsRow = rhsRows[p];
ElemType rhsVal = rhsNZValues[p];
LONG64 resultCol = rhsRowIdx[p]; //resultCol == rhsRow maps to columnid
resultBlockIds[resultCol] = rhsRow; //indicate which colmn it actually points to
//assume resultValues are 0-initialized
atomicAdd(&resultValues[IDX2C(lhsRow, resultCol, numRowsLhs)], lhsValue * rhsVal);
}
}
// gradients update
template<class ElemType>
__global__ void _scaleSparseAndAddToDense(
ElemType alpha,
bool blockCol,
ElemType* blockVal,
size_t* blockIds,
size_t len,
ElemType* rhs,
size_t numrows)
__global__ void _scaleSparseBlockAndAddToDense(
const ElemType alpha,
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
const ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int ii = blockIdx.x;
int i = blockIds[ii];
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)len, load + pStart);
for(int h = pStart; h < pEnd; h++)
{ ElemType temp = alpha*blockVal[ii*len + h];
if(blockCol)
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
atomicAdd(&rhs[IDX2C(h, i, numrows)], temp);
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
atomicAdd(&rhs[IDX2C(i, h, numrows)], temp);
}
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
}
// compute predictions in cross entory node
@ -2638,30 +2628,36 @@ __global__ void _inplaceTruncate(
}
template<class ElemType>
__global__ void _normalGrad(
bool isBlockCol,
size_t len,
__global__ void _normalGradForSparseBlock(
const ElemType momentum,
size_t* blockIds,
ElemType* blockVal,
ElemType* c,
size_t numrows)
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int j = blockIdx.x;
int i = blockIds[j];
int start = j * len;
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pLen = min((int)len, load + pStart);
for(int p = start+pStart; p < start+pLen; p++)
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
int row = isBlockCol ? (p - start) : i;
int col = isBlockCol ? i: (p - start);
c[IDX2C(row, col, numrows)] = (1-momentum)*blockVal[p] + momentum*c[IDX2C(row, col, numrows)];
blockVal[p] = c[IDX2C(row, col, numrows)];
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] = (1 - momentum)*lhsValues[index] + momentum*rhs[IDX2C(row, col, numRows)];
lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}
static __inline__ __device__ double atomicAdd(double* address, double val)
@ -3263,4 +3259,147 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
}
*/
// Fills us (NTPlusOne x BS, column-major via IDX2C) with elementwise products
// of a and b: row 0 holds a[col]*b[col] (no shift); row r >= 1 holds
// a[col] * b[(col + shift + r - 1) % BS], i.e. b circularly shifted by
// (shift + r - 1) columns. One thread per output element (2-D launch).
template<class ElemType>
__global__ void _assignElementProductOfWithShiftNeg(
    ElemType* us,        // output, NTPlusOne rows x BS columns
    const ElemType* a,   // input vector, indexed by column
    const ElemType* b,   // input vector, indexed by (shifted) column
    const int shift,     // base shift applied from row 1 onward
    const int NTPlusOne, // number of rows in us
    const int BS)        // number of columns in us
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row in us
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column in us
    if (idx >= NTPlusOne || idy >= BS)
        return;

    if (idx == 0)
    {
        // this is row-0. No need to shift
        us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[idy];
    }
    else
    {
        int cs = shift + idx - 1;     // effective circular shift for this row
        int tmpidy = (idy + cs) % BS; // wrapped column index into b
        us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[tmpidy];
    }
}
// Shifted column inner products: c(idx, idy) (c is NTPlusOne x M, column-major)
// is the dot product of column idy of a with column idy of b when idx == 0,
// and with column (idy + shift + idx - 1) % M of b when idx >= 1.
// One thread per output element; each thread sums over the N rows serially.
template<class ElemType>
__global__ void _innerProductWithShiftNeg(
    ElemType* c,
    const ElemType* a,
    const ElemType* b,
    const long N, //a.GetNumRows();
    const long M, //a.GetNumCols();
    const long shift,
    const long NTPlusOne
    )
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row in c (shift index)
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column in c
    if (idx >= NTPlusOne || idy >= M)
        return;

    ElemType sum = 0;
    long index_a = 0;
    long index_b = 0;
    long col_a = 0;
    long col_b = 0;

    if (idx == 0)
    {
        // this is row 0. No need to shift
        // the product of a(:,idy) dot b(:,idy)
        col_a = idy;
        for (long i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            sum += a[index_a] * b[index_a];
        }
    }
    else
    {
        int cs = shift + idx - 1; // effective circular shift for this row
        col_a = idy;
        col_b = (idy + cs) % M;   // wrapped column index into b
        for (int i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            index_b = IDX2C(i, col_b, N);
            sum += a[index_a] * b[index_b];
        }
    }

    c[IDX2C(idx, idy, NTPlusOne)] = sum;
}
// Copies row m of the O x P column-major matrix a into the length-P vector us.
// One thread per output element.
template<class ElemType>
__global__ void _getARowByIndex(
    ElemType* us,
    const ElemType* a,
    const int O, // a's rows
    const int P, // a's cols
    const int m  // the m-th row of a
    )
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= P)
        return;
    //  us[id] = a[id] * b[id];
    us[id] = a[IDX2C(m, id, O)];
}
// Row-wise elementwise multiply with a circular column shift. For each output
// element of the O x P column-major matrix us:
//   isafixed == true:  us(idx, idy) = a[idy]             * b(idx, (idy+shift)%P)
//   isafixed == false: us(idx, idy) = a[(idy+shift)%P]   * b(idx, idy)
// i.e. either a or b is held fixed while the other operand's column index is
// rotated by 'shift'. One thread per output element (2-D launch).
template<class ElemType>
__global__ void _conductRowElementMultiplyWithShift(
    ElemType* us,
    const ElemType* a, // row vector of length P
    const ElemType* b,
    const int O, // b's rows
    const int P, // b's cols
    const int shift,
    const bool isafixed)
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row index
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column index
    if (idx >= O || idy >= P)
        return;

    int tmpidy = (idy + shift) % P; // wrapped (shifted) column
    if (isafixed)
    {
        // we fix a, and shift b
        us[IDX2C(idx, idy, O)] = a[idy] * b[IDX2C(idx, tmpidy, O)];
    }
    else
    {
        // we fix b, but shift a
        us[IDX2C(idx, idy, O)] = a[tmpidy] * b[IDX2C(idx, idy, O)];
    }
}
// Elementwise product with a circular shift on the second operand:
// us[i] = a[i] * b[(i + shift) % N] for i in [0, N). One thread per element.
template<class ElemType>
__global__ void _assignElementProductOfWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const LONG64 N)
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    int tmpidb = (id + shift) % N; // wrapped index into b
    us[id] = a[id] * b[tmpidb];
}
#endif // !CPUONLY

Просмотреть файл

@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=nullptr;
m_blockSize = 0;
m_blockVal = nullptr;
m_blockIds = nullptr;
m_expandedSize = 0;
@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
}
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
}
else if (this->GetFormat() == matrixFormatSparseCSC)
@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
}
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
}
else
NOT_IMPLEMENTED;
@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_blockVal = moveFrom.m_blockVal;
m_blockIds = moveFrom.m_blockIds;
m_expandedSize = moveFrom.m_expandedSize;
@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_blockVal = moveFrom.m_blockVal;
m_blockIds = moveFrom.m_blockIds;
m_expandedSize = moveFrom.m_expandedSize;
@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if(m_pArray != nullptr)
CUDACALL(cudaFree(m_pArray));
if(m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if(m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
if (m_rowToId != nullptr)
@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//-------------------------------------------------------------------------
// Start of new GPU Sparse Matrix code
//-------------------------------------------------------------------------
template<class ElemType>
ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
{
if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
{
return m_pArray;
}
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
return m_blockVal;
}
else
NOT_IMPLEMENTED;
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
{
@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_totalBufferSizeAllocated = bufferSizeNeeded;
m_elemSizeAllocated = numNZElemToReserve;
}
else
{
m_elemSizeAllocated = ElemCountFromBufferSize();
}
}
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
{
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
{
if (m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if (m_pArray != nullptr)
CUDACALL(cudaFree(m_pArray));
if (m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
if (m_block2UniqId != nullptr)
@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
size_t newCompIndexSize = max(numRows, numCols) + 1;
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
@ -828,7 +811,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
rowToId[i] = indexer[row];
}
m_blockSize = indexer.size();
CUDACALL(cudaMemcpy(m_rowToId, rowToId, sizeof(size_t)*nz, cudaMemcpyHostToDevice));
}
@ -928,7 +911,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDACALL(cudaEventCreate(&done));
_denseMultSparseCSCAndWeightedAddToDense<ElemType> <<< blocksPerGrid, threadsPerBlock >>> (
m, //rowDense
k, //colDense = rowSparse
n, //colSparse
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
@ -991,25 +973,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (rhs.GetFormat() != matrixFormatSparseCSC)
NOT_IMPLEMENTED;
lhs.PrepareDevice();
c.SetFormat(matrixFormatSparseBlockCol);
c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
//c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
c.m_blockSize = rhs.m_blockSize;
c.m_nz = m*c.m_blockSize;
c.Resize(m, n, c.m_nz);
CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
int blocksPerGrid = rhs.GetNumNZElements();
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
alpha,
lhs.BufferPointer(),
m,
rhs.BufferPointer(),
l,
rhs.NzValues(),
rhs.RowLocation(),
rhs.ColLocation(),
rhs.m_rowToId,
c.m_blockVal,
c.NzValues(),
c.m_blockIds);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
@ -1028,25 +1017,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& rhs)
{
if (lhs.GetComputeDeviceId()!=rhs.GetComputeDeviceId())
if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols())
throw std::logic_error("ScaleAndAdd: dimension mismatch");
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
throw std::runtime_error("GPUSparseMatrix::ScaleAndAdd: All matrices must be on the same GPU");
if (lhs.m_format == matrixFormatSparseBlockCol || lhs.m_format == matrixFormatSparseBlockRow)
{
size_t len = (lhs.m_format == matrixFormatSparseBlockCol) ? lhs.GetNumRows(): lhs.GetNumCols();
bool blockCol = (lhs.m_format == matrixFormatSparseBlockCol);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
size_t blocksPerGrid = lhs.m_blockSize;
_scaleSparseAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
LONG64 N = (LONG64)lhs.GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_scaleSparseBlockAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
alpha,
blockCol,
lhs.m_blockVal,
lhs.GetNumRows(),
lhs.GetNumCols(),
lhs.m_blockSize,
lhs.NzValues(),
lhs.m_blockIds,
len,
rhs.BufferPointer(),
rhs.GetNumRows());
rhs.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
@ -1104,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
label.m_block2Id,
cls.BufferPointer(),
idx2cls.BufferPointer(),
etp.m_pArray,
etp.NzValues(),
etp.MajorIndexLocation(),
etp.SecondaryIndexLocation());
@ -1184,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
grd.m_blockSize = label.m_blockSize;
grd.m_nz = nz;
CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
cudaEvent_t done = nullptr;
@ -1203,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
idx2cls.BufferPointer(),
input.BufferPointer(),
input.GetNumRows(),
grd.m_blockVal,
grd.BufferPointer(),
grd.m_blockIds);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
@ -1221,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
ElemType * values = NzValues();
if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
values = m_blockVal;
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
@ -1247,19 +1239,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
{
size_t blocksPerGrid = m_blockSize;
bool isBlockCol = (m_format == MatrixFormat::matrixFormatSparseBlockCol);
size_t len = isBlockCol ? GetNumRows(): GetNumCols();
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
_normalGrad<ElemType><<<blocksPerGrid,threadsPerBlock>>>(
isBlockCol,
len,
LONG64 N = (LONG64)GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_normalGradForSparseBlock<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
momentum,
isBlockCol,
GetNumRows(),
GetNumCols(),
m_blockSize,
NzValues(),
m_blockIds,
m_blockVal,
c.BufferPointer(),
c.GetNumRows());
c.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));

Просмотреть файл

@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
const ElemType* NzValues() const {return m_pArray;}
ElemType* NzValues() {return m_pArray;}
size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
inline const ElemType* NzValues() const {return m_pArray;}
inline ElemType* NzValues() {return m_pArray;}
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
size_t MajorIndexCount() const { return m_nz; }
@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t BufferSizeNeeded(const size_t numNZ) const
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
ElemType* BufferPointer() const;
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
inline ElemType* BufferPointer() const { return m_pArray; }
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
public:
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
size_t GetNumNZElements() const {return m_nz;}
inline size_t GetNumNZElements() const {return m_nz;}
//Sets sparse matrix in CSR format. this acts as deep copy
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t m_totalBufferSizeAllocated;
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t *m_blockIds; //block ids
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.

Просмотреть файл

@ -733,6 +733,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_CPUSparseMatrix == nullptr)
{
m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat);
if (GetMatrixType() == MatrixType::DENSE && m_CPUMatrix != nullptr)
{
m_CPUSparseMatrix->Resize(GetNumRows(), GetNumCols());
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
}
else
{
// TODO: Assign Sparse from Sparse!
}
delete m_CPUMatrix;
m_CPUMatrix = nullptr;
}
@ -801,6 +812,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// Copies every element of a dense CPU matrix into the destination sparse
// matrix via SetValue, iterating in foreach_coord order.
// NOTE(review): zeros are copied too, so the destination stores explicit
// zeros rather than a truly sparse representation — confirm whether SetValue
// should be skipped for zero-valued elements.
template<class ElemType>
void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest)
{
    foreach_coord(row, col, from)
    {
        auto val = from(row, col);
        dest.SetValue(row, col, val);
    }
}
template<class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
@ -3992,7 +4012,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nullptr,
return CPUMatrix<ElemType>::AreEqual(*a.m_CPUMatrix, *b.m_CPUMatrix, threshold),
return GPUMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUMatrix, threshold),
NOT_IMPLEMENTED; return false ,
return CPUSparseMatrix<ElemType>::AreEqual(*a.m_CPUSparseMatrix, *b.m_CPUSparseMatrix, threshold),
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUSparseMatrix, threshold)
);
}

Просмотреть файл

@ -45,6 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
public:
//Constructors, destructors and other static matrix builders