diff --git a/DataReader/SequenceReader/SequenceReader.cpp b/DataReader/SequenceReader/SequenceReader.cpp index 9967deccb..c181bd7b7 100644 --- a/DataReader/SequenceReader/SequenceReader.cpp +++ b/DataReader/SequenceReader/SequenceReader.cpp @@ -1777,8 +1777,8 @@ bool BatchSequenceReader::GetMinibatch(std::map::GetLabelOutput(std::mapSetValue(0, j, (ElemType)wrd); SetSentenceEnd(wrd, j, actualmbsize); - SetSentenceBegin(wrd, j, actualmbsize); if (class_size > 0) { diff --git a/ExampleSetups/LM/LSTMLM/global.config b/ExampleSetups/LM/LSTMLM/global.config new file mode 100644 index 000000000..aa81c1c65 --- /dev/null +++ b/ExampleSetups/LM/LSTMLM/global.config @@ -0,0 +1,2 @@ +ExpDir=C:\CNTKExp\LSTMLM\log\ +DataDir=C:\CNTKExp\RNN\data\PennTreeBank \ No newline at end of file diff --git a/ExampleSetups/LM/LSTMLM/lstmlm.gpu.config b/ExampleSetups/LM/LSTMLM/lstmlm.gpu.config new file mode 100644 index 000000000..c2bdca121 --- /dev/null +++ b/ExampleSetups/LM/LSTMLM/lstmlm.gpu.config @@ -0,0 +1,414 @@ +# configuration file for class based RNN training + +ExpFolder=$ExpDir$ +ConfigFolder=$ConfigDir$ +DataFolder=$DataDir$ + +stderr=$ExpFolder$ + +# command=dumpNodeInfo +#command=train +#command=test +command=train:test +#command=writeWordAndClassInfo + +type=double + +writeWordAndClassInfo=[ + action=writeWordAndClass + inputFile=$DataFolder$\vocab.txt + outputWord2Cls=$ExpFolder$\word2cls.txt + outputCls2Index=$ExpFolder$\cls2idx.txt + vocabSize=10000 + nbrClass=50 + printValues=true +] + +dumpNodeInfo=[ + action=dumpnode + modelPath=$ExpFolder$\modelRnnCNTK + #nodeName=W0 + printValues=true +] + +devtest=[action=devtest] + +train=[ + action=trainRNN + minibatchSize=10 + traceLevel=1 + deviceId=Auto + epochSize=4430000 + # which is 886 * 5000 + recurrentLayer=1 + defaultHiddenActivity=0.1 + useValidation=true + rnnType=CLASSLSTM + + # uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM + # NDLNetworkBuilder=[ + # networkDescription=$ConfigFolder$\rnnlm.ndl + # ] + + SimpleNetworkBuilder=[ + trainingCriterion=classcrossentropywithsoftmax + evalCriterion=classcrossentropywithsoftmax + nodeType=Sigmoid + initValueScale=6.0 + layerSizes=10000:200:10000 + addPrior=false + addDropoutNodes=false + applyMeanVarNorm=false + uniformInit=true; + + # these are for the class information for class-based language modeling + vocabSize=10000 + nbrClass=50 + ] + + # configuration file, base parameters + SGD=[ + learningRatesPerSample=0.1 + momentumPerMB=0 + gradientClippingWithTruncation=true + clippingThresholdPerSample=15.0 + maxEpochs=40 + unroll=false + numMBsToShowResult=2000 + # gradUpdateType=AdaGrad + gradUpdateType=None + + modelPath=$ExpFolder$\modelRnnCNTK + loadBestModel=true + + # settings for Auto Adjust Learning Rate + AutoAdjust=[ + # auto learning rate adjustment + autoAdjustLR=adjustafterepoch + reduceLearnRateIfImproveLessThan=0.001 + continueReduce=true + increaseLearnRateIfImproveMoreThan=1000000000 + learnRateDecreaseFactor=0.5 + learnRateIncreaseFactor=1.382 + numMiniBatch4LRSearch=100 + numPrevLearnRates=5 + numBestSearchEpoch=1 + ] + + dropoutRate=0.0 + ] + + reader=[ + readerType=SequenceReader + randomize=None + nbruttsineachrecurrentiter=1 + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in 
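The writeWordAndClassInfo action above maps the 10,000-word vocabulary onto nbrClass=50 classes for the class-based criterion (trainingCriterion=classcrossentropywithsoftmax). The exact partitioning rule is not visible in this patch; below is a minimal sketch of one common scheme for class-based RNNLMs, frequency binning by cumulative unigram mass. The function name and the equal-mass split are illustrative assumptions, not CNTK's implementation.

// Sketch: frequency-based word -> class assignment (illustrative, not CNTK's code).
// Assumes `vocab` holds (word, count) pairs sorted by descending count.
#include <string>
#include <utility>
#include <vector>

std::vector<int> assignClasses(const std::vector<std::pair<std::string, long>>& vocab,
                               int nbrClass)
{
    double total = 0;
    for (const auto& w : vocab) total += (double)w.second;

    std::vector<int> cls(vocab.size());
    double cum = 0;
    for (size_t i = 0; i < vocab.size(); i++)
    {
        cum += (double)vocab[i].second / total;    // cumulative unigram mass
        int c = (int)(cum * nbrClass);             // equal-mass bucket
        cls[i] = c < nbrClass ? c : nbrClass - 1;  // clamp the last word
    }
    return cls;
}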
MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.train.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] + + cvReader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.valid.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. 
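In the labels section above, labelType=NextWord means the reader derives the target stream from the input stream itself: the label at position t is the word at position t+1, which is exactly the next-word-prediction setup a language model needs. A minimal sketch of that pairing (makeNextWordTargets is a hypothetical helper, not a reader API):

// Given a sentence as word ids, produce (input, target) pairs for LM training:
// the target at position t is the word at t+1 ("NextWord"). Illustrative only.
#include <cstddef>
#include <vector>

void makeNextWordTargets(const std::vector<int>& sentence,
                         std::vector<int>& inputs, std::vector<int>& targets)
{
    inputs.clear(); targets.clear();
    for (size_t t = 0; t + 1 < sentence.size(); t++)
    {
        inputs.push_back(sentence[t]);
        targets.push_back(sentence[t + 1]); // next word is the label
    }
}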
If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.valid.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + # it should be the same as that in the training set + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] + + +test=[ + action=eval + + # correspond to the number of words/characteres to train in a minibatch + minibatchSize=1 + # need to be small since models are updated for each minibatch + traceLevel=1 + deviceId=Auto + epochSize=4430000 + # which is 886 * 5000 + recurrentLayer=1 + defaultHiddenActivity=0.1 + useValidation=true + rnnType=CLASSLM + + modelPath=$ExpFolder$\modelRnnCNTK + + reader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + # wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. 
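The test block above (action=eval) evaluates the same class-based cross-entropy criterion on the test file. Cross entropy per word and perplexity are related by PPL = exp(CE in nats per word), so a summed criterion value converts as in this small sketch (variable names are illustrative):

// Convert summed cross entropy (in nats) over an evaluation set to perplexity.
#include <cmath>

double perplexity(double totalCrossEntropyNats, double numWords)
{
    return std::exp(totalCrossEntropyNats / numWords); // PPL = exp(avg CE)
}
// e.g. an average CE of 4.6 nats/word corresponds to PPL of about exp(4.6) ~ 99.5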
If known modify this element in config before creating file + wrecords=1000 + # windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.test.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] \ No newline at end of file diff --git a/ExampleSetups/LM/RNNLM/CPU/fnnlm.config b/ExampleSetups/LM/RNNLM/CPU/fnnlm.config new file mode 100644 index 000000000..a2cd1da61 --- /dev/null +++ b/ExampleSetups/LM/RNNLM/CPU/fnnlm.config @@ -0,0 +1,413 @@ +# configuration file for class based RNN training + +ExpFolder=$ExpDir$ +ConfigFolder=$ConfigDir$ +DataFolder=$DataDir$ + +stderr=$ExpFolder$ +# command=dumpNodeInfo +command=train +#command=test +#command=train:test +#command=writeWordAndClassInfo + +type=double + +writeWordAndClassInfo=[ + action=writeWordAndClass + inputFile=$DataFolder$\vocab.txt + outputWord2Cls=$ExpFolder$\word2cls.txt + outputCls2Index=$ExpFolder$\cls2idx.txt + vocabSize=10000 + nbrClass=50 + printValues=true +] + +dumpNodeInfo=[ + action=dumpnode + modelPath=$ExpFolder$\modelRnnCNTK + #nodeName=W0 + printValues=true +] + +devtest=[action=devtest] + +train=[ + action=trainRNN + minibatchSize=10 + traceLevel=1 + deviceId=-1 + epochSize=4430000 + # which is 886 * 5000 + defaultHiddenActivity=0.1 + useValidation=true + rnnType=CLASSLM + # rnnType=LSTM + + # uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM + # NDLNetworkBuilder=[ + # networkDescription=$ConfigFolder$\rnnlm.ndl + # ] + + SimpleNetworkBuilder=[ + trainingCriterion=classcrossentropywithsoftmax + evalCriterion=classcrossentropywithsoftmax + nodeType=Sigmoid + initValueScale=6.0 + layerSizes=10000:200:10000 + addPrior=false + addDropoutNodes=false + applyMeanVarNorm=false + uniformInit=true; + + # these are for the class information for class-based language modeling + vocabSize=10000 + nbrClass=50 + ] + + # configuration file, 
base parameters + SGD=[ + learningRatesPerSample=0.1 + momentumPerMB=0 + gradientClippingWithTruncation=true + clippingThresholdPerSample=15.0 + maxEpochs=40 + unroll=false + numMBsToShowResult=2000 + # gradUpdateType=AdaGrad + gradUpdateType=None + + modelPath=$ExpFolder$\modelRnnCNTK + loadBestModel=true + + # settings for Auto Adjust Learning Rate + AutoAdjust=[ + # auto learning rate adjustment + autoAdjustLR=adjustafterepoch + reduceLearnRateIfImproveLessThan=0.001 + continueReduce=true + increaseLearnRateIfImproveMoreThan=1000000000 + learnRateDecreaseFactor=0.5 + learnRateIncreaseFactor=1.382 + numMiniBatch4LRSearch=100 + numPrevLearnRates=5 + numBestSearchEpoch=1 + ] + + dropoutRate=0.0 + ] + + reader=[ + readerType=SequenceReader + randomize=None + nbruttsineachrecurrentiter=1 + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.train.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] + + cvReader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + 
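The AutoAdjust block above describes an adjust-after-epoch policy: when the validation criterion improves by less than reduceLearnRateIfImproveLessThan, the rate is multiplied by learnRateDecreaseFactor=0.5 (and loadBestModel=true rolls back to the best checkpoint); with continueReduce=true it keeps shrinking on every subsequent epoch, and increaseLearnRateIfImproveMoreThan=1e9 effectively disables increases. A simplified sketch of that decision rule (illustrative; CNTK's actual control flow in SGD differs in detail):

// Simplified adjust-after-epoch policy implied by the AutoAdjust settings.
struct LrState { double lr; double bestLoss; bool reducing; };

void adjustAfterEpoch(LrState& s, double validLoss,
                      double reduceIfImproveLessThan = 0.001,
                      double decreaseFactor = 0.5,
                      bool continueReduce = true)
{
    double improvement = (s.bestLoss - validLoss) / s.bestLoss;
    if (s.reducing && continueReduce)
        s.lr *= decreaseFactor;          // once reduction starts, keep shrinking
    else if (improvement < reduceIfImproveLessThan)
    {
        s.lr *= decreaseFactor;          // too little progress: halve the rate
        s.reducing = continueReduce;
    }
    if (validLoss < s.bestLoss)
        s.bestLoss = validLoss;          // loadBestModel would also reload here
}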
wfile=$ExpFolder$\sequenceSentence.valid.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.valid.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + # it should be the same as that in the training set + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] + + +test=[ + action=eval + + # correspond to the number of words/characteres to train in a minibatch + minibatchSize=1 + # need to be small since models are updated for each minibatch + traceLevel=1 + deviceId=-1 + epochSize=4430000 + # which is 886 * 5000 + recurrentLayer=1 + defaultHiddenActivity=0.1 + useValidation=true + rnnType=CLASSLM + + modelPath=$ExpFolder$\modelRnnCNTK + + reader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + # wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. 
If known modify this element in config before creating file + wrecords=1000 + # windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.test.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] \ No newline at end of file diff --git a/ExampleSetups/LM/RNNLM/GPU/fnnlm.config b/ExampleSetups/LM/RNNLM/GPU/fnnlm.config new file mode 100644 index 000000000..d1da353e6 --- /dev/null +++ b/ExampleSetups/LM/RNNLM/GPU/fnnlm.config @@ -0,0 +1,403 @@ +# configuration file for class based RNN training + +ExpFolder=$ExpDir$ +ConfigFolder=$ConfigDir$ +DataFolder=$DataDir$ + +stderr=$ExpFolder$ +# command=dumpNodeInfo +# command=train +# command=test +command=train:test + +type=double + +dumpNodeInfo=[ + action=dumpnode + modelPath=$ExpFolder$\modelRnnCNTK + #nodeName=W0 + printValues=true +] + +devtest=[action=devtest] + +train=[ + action=trainRNN + minibatchSize=10 + traceLevel=1 + deviceId=0 + epochSize=4430000 + # which is 886 * 5000 +# recurrentLayer=1 + defaultHiddenActivity=0.0 + useValidation=true + rnnType=CLASSLM + # rnnType=LSTM + + # uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM + # NDLNetworkBuilder=[ + # networkDescription=$ConfigFolder$\rnnlm.ndl + # ] + + SimpleNetworkBuilder=[ + trainingCriterion=classcrossentropywithsoftmax + evalCriterion=classcrossentropywithsoftmax + nodeType=Sigmoid + initValueScale=6.0 + layerSizes=10000:200:10000 + addPrior=false + addDropoutNodes=false + applyMeanVarNorm=false + uniformInit=true; + + # these are for the class information for class-based language modeling + vocabSize=10000 + nbrClass=50 + ] + + # configuration file, base parameters + SGD=[ + learningRatesPerSample=0.1 + momentumPerMB=0 + gradientClippingWithTruncation=true + clippingThresholdPerSample=15.0 + maxEpochs=40 + unroll=false + numMBsToShowResult=2000 + # gradUpdateType=AdaGrad + 
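gradientClippingWithTruncation=true with clippingThresholdPerSample=15.0 above selects elementwise truncation rather than norm rescaling. Since the threshold is expressed per sample, it is scaled by the actual minibatch size before being applied; that scaling mirrors the per-sample learning-rate convention used elsewhere in SGD and is an assumption here. A sketch of the effect (equivalent to the InplaceTruncate primitive this diff extends):

// Elementwise truncation clipping: clamp every gradient entry to
// [-threshold, +threshold], with the per-sample threshold scaled by
// minibatch size (assumption: matches the per-sample convention).
#include <algorithm>
#include <cstddef>
#include <vector>

void clipGradientWithTruncation(std::vector<double>& gradient,
                                double thresholdPerSample, size_t actualMBSize)
{
    const double t = thresholdPerSample * (double)actualMBSize;
    for (double& g : gradient)
        g = std::max(-t, std::min(t, g)); // same effect as InplaceTruncate(t)
}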
gradUpdateType=None + + modelPath=$ExpFolder$\modelRnnCNTK + loadBestModel=true + + # settings for Auto Adjust Learning Rate + AutoAdjust=[ + # auto learning rate adjustment + autoAdjustLR=adjustafterepoch + reduceLearnRateIfImproveLessThan=0.001 + continueReduce=true + increaseLearnRateIfImproveMoreThan=1000000000 + learnRateDecreaseFactor=0.5 + learnRateIncreaseFactor=1.382 + numMiniBatch4LRSearch=100 + numPrevLearnRates=5 + numBestSearchEpoch=1 + ] + + dropoutRate=0.0 + ] + + reader=[ + readerType=SequenceReader + randomize=None + nbruttsineachrecurrentiter=1 + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.train.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] + + cvReader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.valid.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + #wrecords - number of records we should allocate space for in the file + # files 
cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords=1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.valid.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + # it should be the same as that in the training set + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] + + +test=[ + action=eval + + # correspond to the number of words/characteres to train in a minibatch + minibatchSize=1 + # need to be small since models are updated for each minibatch + traceLevel=1 + deviceId=-1 + epochSize=4430000 + # which is 886 * 5000 + recurrentLayer=1 + defaultHiddenActivity=0.1 + useValidation=true + rnnType=CLASSLM + + modelPath=$ExpFolder$\modelRnnCNTK + + reader=[ + # reader to use + readerType=SequenceReader + randomize=None + + # word class info + wordclass=$DataFolder$\vocab.txt + + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + # writerType=BinaryReader + + #### write definition + wfile=$ExpFolder$\sequenceSentence.bin + #wsize - inital size of the file in MB + # if calculated size would be bigger, that is used instead + wsize=256 + + # wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. 
If known modify this element in config before creating file + wrecords=1000 + # windowSize - number of records we should include in BinaryWriter window + windowSize=10000 + + file=$DataFolder$\ptb.test.cntk.txt + + #additional features sections + #for now store as expanded category data (including label in) + features=[ + # sentence has no features, so need to set dimension to zero + dim=0 + ### write definition + sectionType=data + ] + # sequence break table, list indexes into sequence records, so we know when a sequence starts/stops + sequence=[ + dim=1 + wrecords=2 + ### write definition + sectionType=data + ] + #labels sections + labelIn=[ + dim=1 + + # vocabulary size + labelDim=10000 + labelMappingFile=$ExpFolder$\sentenceLabels.txt + labelType=Category + beginSequence="" + endSequence="" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + #labels sections + labels=[ + dim=1 + labelType=NextWord + beginSequence="O" + endSequence="O" + + # vocabulary size + labelDim=10000 + + labelMappingFile=$ExpFolder$\sentenceLabels.out.txt + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping=[ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=3 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category=[ + dim=3 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + ] +] \ No newline at end of file diff --git a/ExampleSetups/LM/RNNLM/GPU/rnnlm.config b/ExampleSetups/LM/RNNLM/GPU/rnnlm.config index d1da353e6..56e54db9e 100644 --- a/ExampleSetups/LM/RNNLM/GPU/rnnlm.config +++ b/ExampleSetups/LM/RNNLM/GPU/rnnlm.config @@ -28,7 +28,7 @@ train=[ deviceId=0 epochSize=4430000 # which is 886 * 5000 -# recurrentLayer=1 + recurrentLayer=1 defaultHiddenActivity=0.0 useValidation=true rnnType=CLASSLM diff --git a/MachineLearning/cn/SGD.h b/MachineLearning/cn/SGD.h index 17904594c..629a500bf 100644 --- a/MachineLearning/cn/SGD.h +++ b/MachineLearning/cn/SGD.h @@ -184,8 +184,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { rpi.max = (double)configSGD("rms_wgt_max", "10.0"); rpi.gamma = (double)configSGD("rms_gamma", "0.99"); - bool needAveMultiplier = (bool)configSGD("needAveMultiplier", "true"); + bool needAveMultiplier = (bool)configSGD("normWithAveMultiplier", "true"); ElemType L2RegWeight = (ElemType)configSGD("L2RegWeight", "0"); + ElemType L1RegWeight = (ElemType)configSGD("L1RegWeight", "0"); /// for backward support. 
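The new L1RegWeight read above feeds the weight update in UpdateWeightsS further down: L2 is folded into the gradient before the step (ScaleAndAdd), while L1 is applied after it as a proximal step (InplaceSoftThreshold). Both weights are multiplied by actualMBSize so the effective regularization stays invariant to minibatch size, matching the per-sample learning rate. A minimal scalar sketch of that update order:

// Scalar sketch of the regularized update performed by UpdateWeightsS:
//   1) fold L2 into the gradient:  g += (l2 * mbSize) * w
//   2) take the SGD step:          w -= lr * g
//   3) apply the L1 proximal (soft-threshold) step to the result.
#include <algorithm>
#include <cmath>
#include <cstddef>

double updateWeight(double w, double g, double lr, size_t mbSize,
                    double l2, double l1)
{
    g += l2 * (double)mbSize * w;               // L2, minibatch-size invariant
    w -= lr * g;                                // per-sample learning-rate step
    const double t = lr * l1 * (double)mbSize;  // L1 proximal threshold
    return std::copysign(std::max(std::fabs(w) - t, 0.0), w); // soft-threshold
}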
future setup should use gradUpdateType=AdaGrad, instead of /// useAdagrad=true @@ -213,7 +214,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult, maxTempMemSizeInSamplesForCNN, gUpdateInfo, keepCheckPointFiles, adaptationRegType, adaptationRegWeight, trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, - rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, needAveMultiplier, L2RegWeight); + rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, needAveMultiplier, L2RegWeight, L1RegWeight); } void setMomentum(float momentum) @@ -235,7 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None, const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"", const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true, - RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0) + RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0, const ElemType L1RegWeight = 0) { m_numPrevLearnRates = numPrevLearnRates; m_mbSize=mbSize; @@ -276,6 +277,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_needAveMultiplier = needAveMultiplier; m_L2RegWeight = L2RegWeight; + m_L1RegWeight = L1RegWeight; for (size_t i=0; i& smoothedGradient = (*smoothedGradientIter); - UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_needAveMultiplier); + UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier); } } @@ -1103,7 +1105,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } public: // UpdateWeightsS - static version of UpdateWeights() - static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const bool needAveMultiplier) + static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const ElemType L1RegWeight, const bool needAveMultiplier) { #if DUMPOUTPUT fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize); @@ -1128,8 +1130,8 @@ public: } // L2 regularizer - if (L2RegWeight > 0) - Matrix::ScaleAndAdd(L2RegWeight, functionValues, gradientValues); + if (L2RegWeight > 0) //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample + Matrix::ScaleAndAdd(L2RegWeight*actualMBSize, functionValues, gradientValues); if (adpType == GradientsUpdateType::None) { @@ -1155,18 +1157,23 @@ public: { Matrix::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues); } + + // L1 regularizer 
with proximal gradient descent method + if (L1RegWeight > 0) //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample + functionValues.InplaceSoftThreshold(learnRatePerSample*L1RegWeight*actualMBSize); + #if DUMPOUTPUT functionValues.Print("Parameter Update"); #endif } protected: // UpdateWeights - update the weights in - void UpdateWeights(const ComputationNodePtr node, Matrix& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const bool needAveMultiplier) const + void UpdateWeights(const ComputationNodePtr node, Matrix& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const ElemType L1RegWeight, const bool needAveMultiplier) const { #if DUMPOUTPUT fprintf(stderr, "Update_%ls\n",node->NodeName().c_str()); #endif - UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize, L2RegWeight, needAveMultiplier); + UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize, L2RegWeight, L1RegWeight, needAveMultiplier); node->UpdateEvalTimeStamp(); } @@ -1357,9 +1364,6 @@ protected: irow = max(0, irow); icol = max(0, icol); - if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE) - continue; - fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str()); // node->FunctionValues().Print(); ElemType eOrg = node->FunctionValues()(irow, icol); @@ -1368,6 +1372,10 @@ protected: node->UpdateEvalTimeStamp(); net.ComputeGradient(criterionNodes[npos]); //use only the first criterion. Is +// if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE && node->GradientValues().GetDeviceId() != CPUDEVICE) + if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE) + break; + //ElemType mbEvalCri = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar ElemType eGradErr = node->GradientValues()(irow, icol); @@ -1473,6 +1481,7 @@ protected: bool m_needAveMultiplier; ElemType m_L2RegWeight; + ElemType m_L1RegWeight; }; template class SGD; template class SGD; diff --git a/MachineLearning/cn/SimpleNetworkBuilder.cpp b/MachineLearning/cn/SimpleNetworkBuilder.cpp index 19b09d52e..6a4a2293c 100644 --- a/MachineLearning/cn/SimpleNetworkBuilder.cpp +++ b/MachineLearning/cn/SimpleNetworkBuilder.cpp @@ -45,7 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (numHiddenLayers > 0) { //TODO: to figure out sparse matrix size - u = m_net->CreateSparseLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0], 0); + u = m_net->CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]); m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale); if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1) @@ -76,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int i=1; iCreateSparseLearnableParameter(msra::strfun::wstrprintf (L"U%d", i), m_layerSizes[i+1], m_layerSizes[i], 0); + u = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"U%d", i), m_layerSizes[i+1], m_layerSizes[i]); m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale); if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i+1) @@ -227,7 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { clsweight = 
m_net->CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]); m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale); - clslogpostprob = m_net->LogSoftmax(m_net->Times(clsweight, input), L"ClassPostProb"); + clslogpostprob = m_net->Times(clsweight, input, L"ClassPostProb"); output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy", clslogpostprob); @@ -770,7 +770,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template - ComputationNode* SimpleNetworkBuilder::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs, bool inputWeightSparse) + ComputationNode* SimpleNetworkBuilder::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs) { size_t numHiddenLayers = m_layerSizes.size()-2; @@ -784,20 +784,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNodePtr bit=nullptr, bft=nullptr, bct=nullptr; input = inputObs; - if(inputWeightSparse) - { - Wxo = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim); - Wxi = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim); - Wxf = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim); - Wxc = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim); - } - else - { - Wxo = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim); - Wxi = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim); - Wxf = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim); - Wxc = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim); - } + Wxo = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim); + Wxi = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim); + Wxf = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim); + Wxc = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim); + m_net->InitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale); m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale); m_net->InitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale); @@ -1082,10 +1073,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNodePtr input=nullptr, w=nullptr, b=nullptr, u=nullptr, e=nullptr, delay = nullptr, output=nullptr, label=nullptr, prior=nullptr; ComputationNodePtr Wxo = nullptr, Who=nullptr, Wco=nullptr, bo = nullptr, Wxi=nullptr, Whi=nullptr, Wci=nullptr, bi=nullptr; - ComputationNodePtr Wxf=nullptr, Whf=nullptr, Wcf=nullptr, bf=nullptr, Wxc=nullptr, Whc=nullptr, bc=nullptr; - ComputationNodePtr ot=nullptr, it=nullptr, ft=nullptr, ct=nullptr, ht=nullptr; - ComputationNodePtr delayHI = nullptr, delayCI = nullptr, delayHO = nullptr, delayHF = nullptr, delayHC=nullptr, delayCF=nullptr, delayCC=nullptr; - ComputationNodePtr directWIO = nullptr, directInput=nullptr, directOutput=nullptr; + 
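The change above replaces the explicit LogSoftmax node with a plain Times(clsweight, input): the class log-posterior is now computed inside the criterion node itself (see the TrainingCriterionNode hunk below, which calls InplaceLogSoftmax once per minibatch). The underlying factorization is log P(w|h) = log P(c(w)|h) + log P(w|c(w),h), a class term over 50 classes plus a within-class term over the words of that class. A scalar sketch of combining the two terms (names illustrative):

// Class-based cross entropy: log P(w | h) decomposes into a class log-softmax
// and a within-class log-softmax, each over a much smaller set than the vocab.
#include <algorithm>
#include <cmath>
#include <vector>

double classBasedLogProb(const std::vector<double>& classScores,   // class logits
                         const std::vector<double>& inClassScores, // logits of words in c(w)
                         int cls, int idxInClass)
{
    auto logSoftmax = [](const std::vector<double>& s, int i) {
        double mx = s[0];
        for (double v : s) mx = std::max(mx, v);   // stabilize against overflow
        double z = 0;
        for (double v : s) z += std::exp(v - mx);
        return s[i] - mx - std::log(z);
    };
    return logSoftmax(classScores, cls) + logSoftmax(inClassScores, idxInClass);
}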
ComputationNodePtr clslogpostprob = nullptr; + ComputationNodePtr clsweight = nullptr; ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = {nullptr}; input = m_net->CreateSparseInputNode(L"features", m_layerSizes[0], mbSize); @@ -1120,7 +1109,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { int offset = m_lookupTableOrder > 0? 1 : 0; if (numHiddenLayers > 0) { - output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset]*(offset?m_lookupTableOrder:1), m_layerSizes[offset+1], input, true); + output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset]*(offset?m_lookupTableOrder:1), m_layerSizes[offset+1], input); input = output; outputFromEachLayer[offset+1] = input; @@ -1157,20 +1146,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { input = output; } - // TODO: verify the change is okay - // w = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers]*(MAX_WORDS_PER_CLASS+MAX_CLASSES)*mbSize*NUM_UTTS_IN_RECURRENT_ITER); - w = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]); + /// need to have [input_dim x output_dim] matrix + /// e.g., [200 x 10000], where 10000 is the vocabulary size + /// this is for speed-up issue as per word matrix can be simply obtained using column slice + w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]); m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale); - // TODO: verify the change is okay - //label = m_net->CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize, 2*mbSize*NUM_UTTS_IN_RECURRENT_ITER); - label = m_net->CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize); - - AddTrainAndEvalCriterionNodes(input, label, w); - - output = m_net->Times(w, input, L"outputs"); - + + /// the label is a dense matrix. 
each element is the word index + label = m_net->CreateInputNode(L"labels", 4, mbSize); + + clsweight = m_net->CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]); + m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale); + clslogpostprob = m_net->Times(clsweight, input, L"ClassPostProb"); + + output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy", + clslogpostprob); + m_net->OutputNodes().push_back(output); + if (m_needPrior) { prior = m_net->Mean(label); diff --git a/MachineLearning/cn/SimpleNetworkBuilder.h b/MachineLearning/cn/SimpleNetworkBuilder.h index bdf97a2f1..ec0d61ca7 100644 --- a/MachineLearning/cn/SimpleNetworkBuilder.h +++ b/MachineLearning/cn/SimpleNetworkBuilder.h @@ -330,7 +330,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork& BuildClassEntropyNetwork(size_t mbSize = 1); - ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, bool inputWeightSparse = false); + ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input); ComputationNode* BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode); diff --git a/MachineLearning/cn/TrainingCriterionNode.h b/MachineLearning/cn/TrainingCriterionNode.h index 217a9a233..8284529e5 100644 --- a/MachineLearning/cn/TrainingCriterionNode.h +++ b/MachineLearning/cn/TrainingCriterionNode.h @@ -1041,6 +1041,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { clsLogSoftmax.Resize(input_cls_log_post_prob.GetNumRows(), nT); clsSoftmax.Resize(input_cls_log_post_prob.GetNumRows(), nT); + clsLogSoftmax = input_cls_log_post_prob; + clsLogSoftmax.InplaceLogSoftmax(true); /// 50 x nT + clsSoftmax.AssignExpOf(clsLogSoftmax); + /// loop over time functionValues.SetValue(0); sz = 0; @@ -1075,12 +1079,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix::AddElementToElement(logSoftMax_t, 0, idx_in_class, functionValues, 0, 0); /// add the class log posterior probability - Matrix clsLogSoftmax_t = clsLogSoftmax.ColumnSlice(t, 1); - clsLogSoftmax_t.SetValue(input_cls_log_post_prob.ColumnSlice(t, 1)); - clsLogSoftmax_t.InplaceLogSoftmax(true); /// 50 x 1 - Matrix clsSoftmax_t = clsSoftmax.ColumnSlice(t, 1); - clsSoftmax_t.AssignExpOf(clsLogSoftmax_t); - Matrix::AddElementToElement(clsLogSoftmax_t, c_t, 0, functionValues, 0, 0); + Matrix::AddElementToElement(clsLogSoftmax, c_t, t, functionValues, 0, 0); sz += nbr_wrd; } diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 2fe2083bc..7c953051d 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -2296,7 +2296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUMatrix& CPUMatrix::InplaceTruncate (const ElemType threshold) { if (IsEmpty()) - throw std::logic_error("InplaceTruncateBottom: Matrix is empty."); + throw std::logic_error("InplaceTruncate: Matrix is empty."); auto& us=*this; ElemType locThresholdPos = abs(threshold); @@ -2342,6 +2342,60 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + //x= x-threshold if x>threshold, x+threshold if x<-threshold, 0 otherwise + template + CPUMatrix& CPUMatrix::InplaceSoftThreshold(const ElemType threshold) 
+ { + if (IsEmpty()) + throw std::logic_error("InplaceTruncate: Matrix is empty."); + + long m = (long)GetNumElements(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling + { + if (m_pArray[i] > threshold) + m_pArray[i] -= threshold; + else if (m_pArray[i] < -threshold) + m_pArray[i] += threshold; + else + m_pArray[i] = 0; + + if (m_pArray[i+1] > threshold) + m_pArray[i+1] -= threshold; + else if (m_pArray[i+1] < -threshold) + m_pArray[i+1] += threshold; + else + m_pArray[i+1] = 0; + + if (m_pArray[i+2] > threshold) + m_pArray[i+2] -= threshold; + else if (m_pArray[i+2] < -threshold) + m_pArray[i+2] += threshold; + else + m_pArray[i+2] = 0; + + if (m_pArray[i+3] > threshold) + m_pArray[i+3] -= threshold; + else if (m_pArray[i+3] < -threshold) + m_pArray[i+3] += threshold; + else + m_pArray[i+3] = 0; + } + //handle remaining stuffs + for (long i = m & ~3; i threshold) + m_pArray[i] -= threshold; + else if (m_pArray[i] < -threshold) + m_pArray[i] += threshold; + else + m_pArray[i] = 0; + } + + return *this; + } + //Threshold truncating: this[i] = max( a[i], threshold ) template CPUMatrix& CPUMatrix::AssignTruncateBottomOf (const CPUMatrix& a, const ElemType threshold) diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 6141b828e..7f4ada76c 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -197,6 +197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUMatrix& InplaceTruncateTop (const ElemType threshold); CPUMatrix& AssignTruncateTopOf (const CPUMatrix& a, const ElemType threshold); CPUMatrix& InplaceTruncate (const ElemType threshold); + CPUMatrix& InplaceSoftThreshold(const ElemType threshold); CPUMatrix& SetToZeroIfAbsLessThan (const ElemType threshold); diff --git a/Math/Math/CPUSparseMatrix.cpp b/Math/Math/CPUSparseMatrix.cpp index 3b2139138..91438b7e2 100644 --- a/Math/Math/CPUSparseMatrix.cpp +++ b/Math/Math/CPUSparseMatrix.cpp @@ -25,6 +25,7 @@ #include "basetypes.h" #include "fileutil.h" +#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this #ifndef USE_MKL // use ACML as default. 
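InplaceSoftThreshold above is the proximal operator of the L1 norm, prox_{t||.||_1}(x) = sign(x) * max(|x| - t, 0), which is what the proximal-gradient L1 step in SGD.h relies on. The four-way unrolled loop is purely a throughput optimization; it must agree elementwise with this scalar reference version:

// Reference (unoptimized) soft-threshold; the unrolled loop in
// CPUMatrix::InplaceSoftThreshold applies this to every element.
#include <algorithm>
#include <cmath>

double softThreshold(double x, double threshold)
{
    return std::copysign(std::max(std::fabs(x) - threshold, 0.0), x);
}
// softThreshold( 2.5, 1.0) ==  1.5
// softThreshold(-0.3, 1.0) ==  0.0   (values inside [-t, t] are zeroed)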
@@ -703,38 +704,244 @@ namespace Microsoft { namespace MSR { namespace CNTK { return 1; } + template + CPUSparseMatrix& CPUSparseMatrix::InplaceTruncateTop(const ElemType threshold) + { + long m = (long)NzCount(); + ElemType *nzValues = NzValues(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling + { + if (nzValues[i] > threshold) + nzValues[i] = threshold; + + if (nzValues[i+1] > threshold) + nzValues[i+1] = threshold; + + if (nzValues[i+2] > threshold) + nzValues[i+2] = threshold; + + if (nzValues[i+3] > threshold) + nzValues[i+3] = threshold; + + } + //handle remaining stuffs + for (long i = m & ~3; i threshold) + nzValues[i] = threshold; + } + + return *this; + } + + template + CPUSparseMatrix& CPUSparseMatrix::InplaceTruncateBottom(const ElemType threshold) + { + long m = (long)NzCount(); + ElemType *nzValues = NzValues(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling + { + if (nzValues[i] < threshold) + nzValues[i] = threshold; + + if (nzValues[i + 1] < threshold) + nzValues[i + 1] = threshold; + + if (nzValues[i + 2] < threshold) + nzValues[i + 2] = threshold; + + if (nzValues[i + 3] < threshold) + nzValues[i + 3] = threshold; + + } + //handle remaining stuffs + for (long i = m & ~3; i CPUSparseMatrix& CPUSparseMatrix::InplaceTruncate (const ElemType threshold) { - if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - ElemType locThresholdPos = abs(threshold); - ElemType locTHresholdNeg = -locThresholdPos; + ElemType locThresholdPos = abs(threshold); + ElemType locTHresholdNeg = -locThresholdPos; - for(size_t j = 0; j < m_blockSize; j++) - { - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
GetNumRows() : GetNumCols(); - size_t start = j* len; - for (size_t p = start; p < start+len; p++) - { - if (m_pArray[p] > locThresholdPos) - { - m_pArray[p] = locThresholdPos; - } - else if (m_pArray[p] < locTHresholdNeg) - { - m_pArray[p] = locTHresholdNeg; - } - } - } - } - else + long m = (long)NzCount(); + ElemType *nzValues = NzValues(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling { - throw std::runtime_error("CPUSparseMatrix:: InplaceTruncate() only support block based sparse matrix"); + if (nzValues[i] > locThresholdPos) + nzValues[i] = locThresholdPos; + else if (nzValues[i] < locTHresholdNeg) + nzValues[i] = locTHresholdNeg; + + if (nzValues[i+1] > locThresholdPos) + nzValues[i+1] = locThresholdPos; + else if (nzValues[i+1] < locTHresholdNeg) + nzValues[i+1] = locTHresholdNeg; + + if (nzValues[i+2] > locThresholdPos) + nzValues[i+2] = locThresholdPos; + else if (nzValues[i+2] < locTHresholdNeg) + nzValues[i+2] = locTHresholdNeg; + + if (nzValues[i+3] > locThresholdPos) + nzValues[i+3] = locThresholdPos; + else if (nzValues[i+3] < locTHresholdNeg) + nzValues[i+3] = locTHresholdNeg; } + //handle remaining stuffs + for (long i = m & ~3; i locThresholdPos) + nzValues[i] = locThresholdPos; + else if (nzValues[i] < locTHresholdNeg) + nzValues[i] = locTHresholdNeg; + } + return *this; } + template + CPUSparseMatrix& CPUSparseMatrix::InplaceSoftThreshold(const ElemType threshold) + { + long m = (long)NzCount(); + ElemType *nzValues = NzValues(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling + { + if (nzValues[i] > threshold) + nzValues[i] -= threshold; + else if (nzValues[i] < -threshold) + nzValues[i] += threshold; + else + nzValues[i] = 0; + + if (nzValues[i + 1] > threshold) + nzValues[i + 1] -= threshold; + else if (nzValues[i + 1] < -threshold) + nzValues[i + 1] += threshold; + else + nzValues[i + 1] = 0; + + if (nzValues[i + 2] > threshold) + nzValues[i + 2] -= threshold; + else if (nzValues[i + 2] < -threshold) + nzValues[i + 2] += threshold; + else + nzValues[i + 2] = 0; + + if (nzValues[i + 3] > threshold) + nzValues[i + 3] -= threshold; + else if (nzValues[i + 3] < -threshold) + nzValues[i + 3] += threshold; + else + nzValues[i + 3] = 0; + } + //handle remaining stuffs + for (long i = m & ~3; i threshold) + nzValues[i] -= threshold; + else if (nzValues[i] < -threshold) + nzValues[i] += threshold; + else + nzValues[i] = 0; + } + return *this; + } + + template + ElemType CPUSparseMatrix::FrobeniusNorm() const + { + if (IsEmpty()) + throw std::logic_error("FrobeniusNorm: Matrix is empty."); + + ElemType v = 0; + + long m = (long)NzCount(); + const ElemType *nzValues = NzValues(); + + //four-way unrolling +#pragma omp parallel for reduction(+:v) + for (long i = 0; i<(m & ~3); i += 4) + { + v += nzValues[i] * nzValues[i] + nzValues[i + 1] * nzValues[i + 1] + nzValues[i + 2] * nzValues[i + 2] + nzValues[i + 3] * nzValues[i + 3]; + } + //handle remaining stuffs + for (long i = m & ~3; i + ElemType CPUSparseMatrix::SumOfAbsElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfAbsElements: Matrix is empty."); + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + return (ElemType)dasum((int)NzCount(), reinterpret_cast (m_pArray), 1); +#else + return (ElemType)cblas_dasum((int)NzCount(), reinterpret_cast (m_pArray), 1); +#endif + } + else + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + return sasum((int)NzCount(), reinterpret_cast (m_pArray), 1); +#else + return 
cblas_sasum((int)NzCount(), reinterpret_cast (m_pArray), 1); +#endif + } + } + + + //sum of all elements + template + ElemType CPUSparseMatrix::SumOfElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfElements: Matrix is empty."); + + ElemType sum = 0; + + long m = (long)NzCount(); + const ElemType *nzValues = NzValues(); + + //four-way unrolling +#pragma omp parallel for reduction(+:sum) + for (long i = 0; i<(m & ~3); i += 4) + { + sum += nzValues[i] + nzValues[i + 1] + nzValues[i + 2] + nzValues[i + 3]; + } + //handle remaining stuffs + for (long i = m & ~3; i MATH_API File& operator>>(File& stream, CPUSparseMatrix& us) { diff --git a/Math/Math/CPUSparseMatrix.h b/Math/Math/CPUSparseMatrix.h index 8e2b6589e..d0b2e9ff5 100644 --- a/Math/Math/CPUSparseMatrix.h +++ b/Math/Math/CPUSparseMatrix.h @@ -108,10 +108,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { void NormalGrad(CPUMatrix& c, const ElemType momentum); ElemType Adagrad(CPUMatrix& c, const bool needAveMultiplier); - public: - CPUSparseMatrix& InplaceTruncateTop (const ElemType /*threshold*/) { NOT_IMPLEMENTED; } - CPUSparseMatrix& InplaceTruncateBottom (const ElemType /*threshold*/) { NOT_IMPLEMENTED; } - CPUSparseMatrix& InplaceTruncate (const ElemType /*threshold*/); + public: + CPUSparseMatrix& InplaceTruncateTop(const ElemType threshold); + CPUSparseMatrix& InplaceTruncateBottom(const ElemType threshold); + CPUSparseMatrix& InplaceTruncate (const ElemType threshold); + CPUSparseMatrix& InplaceSoftThreshold(const ElemType threshold); + + ElemType FrobeniusNorm() const; //useful for comparing CPU and GPU results + + ElemType SumOfAbsElements() const; //sum of all abs(elements) + ElemType SumOfElements() const; //sum of all elements public: //void Print(const char* /*matrixName*/) const { NOT_IMPLEMENTED; } diff --git a/Math/Math/CommonMatrix.h b/Math/Math/CommonMatrix.h index 724cf5a89..81c35d19e 100644 --- a/Math/Math/CommonMatrix.h +++ b/Math/Math/CommonMatrix.h @@ -11,10 +11,10 @@ #define AUTOPLACEMATRIX 1000 // used in parameters only #define MANAGEDEXTERN -2 // managed externally (i.e. 
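FrobeniusNorm, SumOfAbsElements, and SumOfElements above all iterate only over the stored nonzeros (NzValues/NzCount). That is sufficient because absent entries are exactly zero and contribute nothing to any of the three reductions; SumOfAbsElements additionally delegates to BLAS asum (dasum/cblas_dasum), which is the same absolute-value sum. A plain-loop equivalent for reference:

// Reductions over a sparse matrix only need its stored nonzeros:
// zeros add 0 to the sum, |0| = 0 to the L1 norm, and 0^2 to Frobenius.
#include <cmath>
#include <cstddef>

void sparseNorms(const double* nzValues, size_t nzCount,
                 double& sum, double& sumAbs, double& frobenius)
{
    sum = sumAbs = 0;
    double sq = 0;
    for (size_t i = 0; i < nzCount; i++)
    {
        sum    += nzValues[i];                // SumOfElements
        sumAbs += std::fabs(nzValues[i]);     // SumOfAbsElements (== asum)
        sq     += nzValues[i] * nzValues[i];
    }
    frobenius = std::sqrt(sq);                // FrobeniusNorm
}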
diff --git a/Math/Math/CommonMatrix.h b/Math/Math/CommonMatrix.h
index 724cf5a89..81c35d19e 100644
--- a/Math/Math/CommonMatrix.h
+++ b/Math/Math/CommonMatrix.h
@@ -11,10 +11,10 @@
 #define AUTOPLACEMATRIX 1000 // used in parameters only
 #define MANAGEDEXTERN -2 // managed externally (i.e. PTask)
 #define CPUDEVICE -1 // device is the CPU
-#define EPS_IN_INVERSE 1e-30f // min float is 1.4e-45 and max float is 3.4e-38
-#define EPS_IN_LOG 1e-40f
-#define LOG_OF_EPS_IN_LOG -92.1f // log(EPS_IN_LOG)
-#define LOG10_OF_EPS_IN_LOG -40 // log_10(EPS_IN_LOG)
+#define EPS_IN_INVERSE 1e-30f // 1e-37 is the only guaranteed precision
+#define EPS_IN_LOG 1e-37f // 1e-37 is the only guaranteed precision
+#define LOG_OF_EPS_IN_LOG -85.1f // log(EPS_IN_LOG)
+#define LOG10_OF_EPS_IN_LOG -37 // log_10(EPS_IN_LOG)
 #define LZERO -10e10
 #define MINLOGEXP -9.2103
 #define LSMALL -0.5E10
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 432b40913..8b671c63a 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -2003,6 +2003,42 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return *this;
     }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
+    {
+        if (IsEmpty())
+            throw std::logic_error("InplaceTruncate: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(N*1.0 / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));
+        _inplaceTruncate<<<blocksPerGrid, threadsPerBlock>>>(m_pArray, threshold, N);
+        if (do_sync)    CUDA_CALL(cudaEventRecord(done));
+        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
+    {
+        if (IsEmpty())
+            throw std::logic_error("InplaceSoftThreshold: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(N*1.0 / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));
+        _inplaceSoftThreshold<<<blocksPerGrid, threadsPerBlock>>>(m_pArray, threshold, N);
+        if (do_sync)    CUDA_CALL(cudaEventRecord(done));
+        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+    }

     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan (const ElemType threshold)
     {
diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
index 20682abc2..c4fe537b6 100644
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@@ -218,6 +218,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         GPUMatrix<ElemType>& AssignTruncateBottomOf (const GPUMatrix<ElemType>& a, const ElemType threshold);
         GPUMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
         GPUMatrix<ElemType>& AssignTruncateTopOf (const GPUMatrix<ElemType>& a, const ElemType threshold);
+        GPUMatrix<ElemType>& InplaceTruncate(const ElemType threshold);
+        GPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

         GPUMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 933822626..2698fd4a6 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -2784,6 +2784,29 @@ __global__ void _inplaceTruncate(
     }
 }

+template<class ElemType>
+__global__ void _inplaceSoftThreshold(
+    ElemType* a,
+    const ElemType threshold,
+    const LONG64 N)
+{
+    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (id >= N)
+        return;
+
+    if (a[id] > threshold)
+    {
+        a[id] -= threshold;
+    }
+    else if (a[id] < -threshold)
+    {
+        a[id] += threshold;
+    }
+    else
+        a[id] = 0;
+}
+
+
 template<class ElemType>
 __global__ void _normalGradForSparseBlock(
     const ElemType momentum,
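All of the GPU code above follows the same one-thread-per-element convention: launch `ceil(N / threadsPerBlock)` blocks, have each thread compute a global index, and return early when the index runs past `N` (the last block is usually only partially full). A self-contained sketch of that convention; the names and block size here are illustrative, not CNTK's:

```cuda
#include <cuda_runtime.h>

__global__ void clampKernel(float* a, float t, long long n)
{
    long long id = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= n)             // guard against the partially filled last block
        return;
    if (a[id] > t)       a[id] = t;
    else if (a[id] < -t) a[id] = -t;
}

void launchClamp(float* devPtr, float t, long long n)
{
    const int threadsPerBlock = 512;  // illustrative; CNTK uses its own constant
    int blocks = (int)((n + threadsPerBlock - 1) / threadsPerBlock);  // ceil division
    clampKernel<<<blocks, threadsPerBlock>>>(devPtr, t, n);
}
```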
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index 627bf547b..2e5d2f1a4 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -1108,26 +1108,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncate (const ElemType threshold)
     {
-        if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow ||
-            m_format == matrixFormatSparseCSR || m_format == matrixFormatSparseCSC)
-        {
-            long N=(long)GetNumNZElements();
-            int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
-            cudaEvent_t done = nullptr;
-            if (do_sync)    CUDACALL(cudaEventCreate(&done));
-            ElemType * values = NzValues();
-            _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
-            if (do_sync)    CUDACALL(cudaEventRecord(done));
-            if (do_sync)    CUDACALL(cudaEventSynchronize(done));
-            if (do_sync)    CUDACALL(cudaEventDestroy(done));
-        }
-        else
-        {
-            NOT_IMPLEMENTED;
-        }
+        long N=(long)GetNumNZElements();
+
+        long blocksPerGrid = (long)ceil(N*1.0 / threadsPerBlock);
+        cudaEvent_t done = nullptr;
+        if (do_sync)    CUDACALL(cudaEventCreate(&done));
+        ElemType * values = NzValues();
+        _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
+        if (do_sync)    CUDACALL(cudaEventRecord(done));
+        if (do_sync)    CUDACALL(cudaEventSynchronize(done));
+        if (do_sync)    CUDACALL(cudaEventDestroy(done));
+
         return *this;
     }

+    template<class ElemType>
+    GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
+    {
+        long N = (long)GetNumNZElements();
+
+        long blocksPerGrid = (long)ceil(N*1.0 / threadsPerBlock);
+        cudaEvent_t done = nullptr;
+        if (do_sync)    CUDACALL(cudaEventCreate(&done));
+        ElemType * values = NzValues();
+        _inplaceSoftThreshold<<<blocksPerGrid, threadsPerBlock>>>(values, threshold, N);
+        if (do_sync)    CUDACALL(cudaEventRecord(done));
+        if (do_sync)    CUDACALL(cudaEventSynchronize(done));
+        if (do_sync)    CUDACALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
     // normal update for smoothed gradients c and current gradients (this)
     template<class ElemType>
     void GPUSparseMatrix<ElemType>::NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum)
diff --git a/Math/Math/GPUSparseMatrix.h b/Math/Math/GPUSparseMatrix.h
index e0b2e4dcc..2edb5679e 100644
--- a/Math/Math/GPUSparseMatrix.h
+++ b/Math/Math/GPUSparseMatrix.h
@@ -232,6 +232,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         GPUSparseMatrix<ElemType>& AssignAbsOf (const GPUSparseMatrix<ElemType>& a);

         GPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
+        GPUSparseMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

         GPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
         GPUSparseMatrix<ElemType>& AssignTruncateBottomOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
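Both sparse implementations run their kernels over `NzValues()` alone, treating the nonzero buffer as a flat dense array. That is valid here because truncation and soft thresholding both map zero to zero, so entries the sparse format does not store need no update. A small sketch of the invariant being relied on (`ApplyToNonzeros` is a hypothetical helper, not CNTK API):

```cpp
#include <cassert>

// Any elementwise op f with f(0) == 0 can be applied to a sparse
// matrix by transforming only its stored nonzero values.
template <class T, class F>
void ApplyToNonzeros(T* nzValues, long nzCount, F f)
{
    assert(f((T)0) == (T)0);  // the property that makes this sound
    for (long i = 0; i < nzCount; i++)
        nzValues[i] = f(nzValues[i]);
}
```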
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index bc02191da..67c8bf170 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -2440,7 +2440,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     Matrix<ElemType>& Matrix<ElemType>::InplaceTruncate(const ElemType threshold)
     {
         if (IsEmpty())
-            throw std::logic_error("InplaceTruncateBottom: Matrix is empty.");
+            throw std::logic_error("InplaceTruncate: Matrix is empty.");

         if (sizeof(ElemType)==sizeof(float))
         {
@@ -2456,7 +2456,7 @@
         DISPATCH_MATRIX_ON_FLAG(this,
             this,
             this->m_CPUMatrix->InplaceTruncate(threshold),
-            this->m_GPUMatrix->InplaceTruncateTop(fabs(threshold)); this->m_GPUMatrix->InplaceTruncateBottom(-fabs(threshold)),
+            this->m_GPUMatrix->InplaceTruncate(threshold),
             this->m_CPUSparseMatrix->InplaceTruncate(threshold),
             this->m_GPUSparseMatrix->InplaceTruncate(threshold)
             );
@@ -2464,6 +2464,27 @@
         return *this;
     }

+    template<class ElemType>
+    Matrix<ElemType>& Matrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
+    {
+        assert(threshold >= 0);
+
+        if (IsEmpty())
+            throw std::logic_error("InplaceSoftThreshold: Matrix is empty.");
+
+        if (threshold == 0)
+            return *this;
+
+        DISPATCH_MATRIX_ON_FLAG(this,
+            this,
+            this->m_CPUMatrix->InplaceSoftThreshold(threshold),
+            this->m_GPUMatrix->InplaceSoftThreshold(threshold),
+            this->m_CPUSparseMatrix->InplaceSoftThreshold(threshold),
+            this->m_GPUSparseMatrix->InplaceSoftThreshold(threshold)
+            );
+
+        return *this;
+    }
+
     //Threshold truncating: this[i] = max( this[i], threshold )
     template<class ElemType>
     Matrix<ElemType>& Matrix<ElemType>::InplaceTruncateBottom (const ElemType threshold)
@@ -2486,7 +2507,7 @@
             this,
             this->m_CPUMatrix->InplaceTruncateBottom(threshold),
             this->m_GPUMatrix->InplaceTruncateBottom(threshold),
-            NOT_IMPLEMENTED,
+            this->m_CPUSparseMatrix->InplaceTruncateBottom(threshold),
             this->m_GPUSparseMatrix->InplaceTruncateBottom(threshold)
             );

@@ -2542,18 +2563,18 @@
         {
             if (!isfinite((float)threshold))
                 return *this;
-        }
+        }
         else
         {
             if (!isfinite(threshold))
                 return *this;
-        }
+        }

         DISPATCH_MATRIX_ON_FLAG(this,
             this,
             this->m_CPUMatrix->InplaceTruncateTop(threshold),
             this->m_GPUMatrix->InplaceTruncateTop(threshold),
-            NOT_IMPLEMENTED,
+            this->m_CPUSparseMatrix->InplaceTruncateTop(threshold),
             this->m_GPUSparseMatrix->InplaceTruncateTop(threshold)
             );

@@ -2626,7 +2647,7 @@
             nullptr,
             return this->m_CPUMatrix->SumOfElements(),
             return this->m_GPUMatrix->SumOfElements(),
-            NOT_IMPLEMENTED,
+            return this->m_CPUSparseMatrix->SumOfElements(),
             return this->m_GPUSparseMatrix->SumOfElements()
             );

@@ -2869,7 +2890,7 @@
             nullptr,
             return this->m_CPUMatrix->FrobeniusNorm(),
             return this->m_GPUMatrix->FrobeniusNorm(),
-            NOT_IMPLEMENTED,
+            return this->m_CPUSparseMatrix->FrobeniusNorm(),
             return this->m_GPUSparseMatrix->FrobeniusNorm()
             );
     }
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index 68a3bf021..552ae4510 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -236,6 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         Matrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
         Matrix<ElemType>& AssignTruncateTopOf (const Matrix<ElemType>& a, const ElemType threshold);
         Matrix<ElemType>& InplaceTruncate (const ElemType threshold);
+        Matrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

         Matrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
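With the `DISPATCH_MATRIX_ON_FLAG` cases filled in, a caller can invoke the operator without caring whether the matrix lives on CPU or GPU, dense or sparse storage. One use this enables is a proximal-gradient (ISTA-style) update for L1-regularized training; the sketch below is hypothetical glue, assuming CNTK's `Matrix<ElemType>::ScaleAndAdd(alpha, a, c)` computes `c += alpha * a` — only `InplaceSoftThreshold` itself comes from this patch:

```cpp
// Hypothetical L1 proximal step: plain gradient descent, then shrinkage.
template <class ElemType>
void ProximalStep(Matrix<ElemType>& w, const Matrix<ElemType>& grad,
                  ElemType lr, ElemType l1)
{
    Matrix<ElemType>::ScaleAndAdd(-lr, grad, w);  // w -= lr * grad
    w.InplaceSoftThreshold(lr * l1);              // prox of (lr*l1)*||w||_1
}
```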