This commit is contained in:
Dong Yu 2015-02-24 17:49:46 -08:00
Parent 565eb63f2f f6559fb1bb
Commit 676b894daa
22 changed files with 1710 additions and 114 deletions

View file

@@ -1777,8 +1777,8 @@ bool BatchSequenceReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<E
// vector of feature data goes into matrix column
size_t idx = (size_t)m_featureData[j];
//if (matrices.find(m_featuresName) != matrices.end())
features.SetValue(idx, j, (ElemType)1);
features.SetValue(idx, j, (ElemType)1);
SetSentenceBegin(idx, j, actualmbsize);
}
features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false);
@@ -1947,7 +1947,6 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
labels->SetValue(0, j, (ElemType)wrd);
SetSentenceEnd(wrd, j, actualmbsize);
SetSentenceBegin(wrd, j, actualmbsize);
if (class_size > 0)
{

View file

@@ -0,0 +1,2 @@
ExpDir=C:\CNTKExp\LSTMLM\log\
DataDir=C:\CNTKExp\RNN\data\PennTreeBank
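These two variables are consumed through the $ExpDir$/$DataDir$ references in the configs that follow; for example, ExpFolder=$ExpDir$ below resolves to C:\CNTKExp\LSTMLM\log\. As a hedged usage sketch (binary and file names are illustrative, not part of this commit), CNTK-style config files can be concatenated on the command line with '+' and variables overridden there as name=value pairs:

    cn.exe configFile=global.config+rnnlm.config DataDir=C:\CNTKExp\RNN\data\PennTreeBank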

View file

@@ -0,0 +1,414 @@
# configuration file for class based RNN training
ExpFolder=$ExpDir$
ConfigFolder=$ConfigDir$
DataFolder=$DataDir$
stderr=$ExpFolder$
# command=dumpNodeInfo
#command=train
#command=test
command=train:test
#command=writeWordAndClassInfo
type=double
writeWordAndClassInfo=[
action=writeWordAndClass
inputFile=$DataFolder$\vocab.txt
outputWord2Cls=$ExpFolder$\word2cls.txt
outputCls2Index=$ExpFolder$\cls2idx.txt
vocabSize=10000
nbrClass=50
printValues=true
]
dumpNodeInfo=[
action=dumpnode
modelPath=$ExpFolder$\modelRnnCNTK
#nodeName=W0
printValues=true
]
devtest=[action=devtest]
train=[
action=trainRNN
minibatchSize=10
traceLevel=1
deviceId=Auto
epochSize=4430000
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLSTM
# uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM
# NDLNetworkBuilder=[
# networkDescription=$ConfigFolder$\rnnlm.ndl
# ]
SimpleNetworkBuilder=[
trainingCriterion=classcrossentropywithsoftmax
evalCriterion=classcrossentropywithsoftmax
nodeType=Sigmoid
initValueScale=6.0
layerSizes=10000:200:10000
addPrior=false
addDropoutNodes=false
applyMeanVarNorm=false
uniformInit=true;
# these are for the class information for class-based language modeling
vocabSize=10000
nbrClass=50
]
# configuration file, base parameters
SGD=[
learningRatesPerSample=0.1
momentumPerMB=0
gradientClippingWithTruncation=true
clippingThresholdPerSample=15.0
maxEpochs=40
unroll=false
numMBsToShowResult=2000
# gradUpdateType=AdaGrad
gradUpdateType=None
modelPath=$ExpFolder$\modelRnnCNTK
loadBestModel=true
# settings for Auto Adjust Learning Rate
AutoAdjust=[
# auto learning rate adjustment
autoAdjustLR=adjustafterepoch
reduceLearnRateIfImproveLessThan=0.001
continueReduce=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
numMiniBatch4LRSearch=100
numPrevLearnRates=5
numBestSearchEpoch=1
]
dropoutRate=0.0
]
reader=[
readerType=SequenceReader
randomize=None
nbruttsineachrecurrentiter=1
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.train.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
cvReader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.valid.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.valid.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
# it should be the same as that in the training set
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]
test=[
action=eval
# corresponds to the number of words/characters processed in a minibatch
minibatchSize=1
# needs to be small since models are updated for each minibatch
traceLevel=1
deviceId=Auto
epochSize=4430000
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
reader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
# wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
# windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.test.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]

View file

@@ -0,0 +1,413 @@
# configuration file for class based RNN training
ExpFolder=$ExpDir$
ConfigFolder=$ConfigDir$
DataFolder=$DataDir$
stderr=$ExpFolder$
# command=dumpNodeInfo
command=train
#command=test
#command=train:test
#command=writeWordAndClassInfo
type=double
writeWordAndClassInfo=[
action=writeWordAndClass
inputFile=$DataFolder$\vocab.txt
outputWord2Cls=$ExpFolder$\word2cls.txt
outputCls2Index=$ExpFolder$\cls2idx.txt
vocabSize=10000
nbrClass=50
printValues=true
]
dumpNodeInfo=[
action=dumpnode
modelPath=$ExpFolder$\modelRnnCNTK
#nodeName=W0
printValues=true
]
devtest=[action=devtest]
train=[
action=trainRNN
minibatchSize=10
traceLevel=1
deviceId=-1
epochSize=4430000
# which is 886 * 5000
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
# uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM
# NDLNetworkBuilder=[
# networkDescription=$ConfigFolder$\rnnlm.ndl
# ]
SimpleNetworkBuilder=[
trainingCriterion=classcrossentropywithsoftmax
evalCriterion=classcrossentropywithsoftmax
nodeType=Sigmoid
initValueScale=6.0
layerSizes=10000:200:10000
addPrior=false
addDropoutNodes=false
applyMeanVarNorm=false
uniformInit=true;
# these are for the class information for class-based language modeling
vocabSize=10000
nbrClass=50
]
# configuration file, base parameters
SGD=[
learningRatesPerSample=0.1
momentumPerMB=0
gradientClippingWithTruncation=true
clippingThresholdPerSample=15.0
maxEpochs=40
unroll=false
numMBsToShowResult=2000
# gradUpdateType=AdaGrad
gradUpdateType=None
modelPath=$ExpFolder$\modelRnnCNTK
loadBestModel=true
# settings for Auto Adjust Learning Rate
AutoAdjust=[
# auto learning rate adjustment
autoAdjustLR=adjustafterepoch
reduceLearnRateIfImproveLessThan=0.001
continueReduce=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
numMiniBatch4LRSearch=100
numPrevLearnRates=5
numBestSearchEpoch=1
]
dropoutRate=0.0
]
reader=[
readerType=SequenceReader
randomize=None
nbruttsineachrecurrentiter=1
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.train.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
cvReader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.valid.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.valid.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
# it should be the same as that in the training set
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]
test=[
action=eval
# corresponds to the number of words/characters processed in a minibatch
minibatchSize=1
# needs to be small since models are updated for each minibatch
traceLevel=1
deviceId=-1
epochSize=4430000
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
reader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
# wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
# windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.test.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]

View file

@@ -0,0 +1,403 @@
# configuration file for class based RNN training
ExpFolder=$ExpDir$
ConfigFolder=$ConfigDir$
DataFolder=$DataDir$
stderr=$ExpFolder$
# command=dumpNodeInfo
# command=train
# command=test
command=train:test
type=double
dumpNodeInfo=[
action=dumpnode
modelPath=$ExpFolder$\modelRnnCNTK
#nodeName=W0
printValues=true
]
devtest=[action=devtest]
train=[
action=trainRNN
minibatchSize=10
traceLevel=1
deviceId=0
epochSize=4430000
# which is 886 * 5000
# recurrentLayer=1
defaultHiddenActivity=0.0
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
# uncomment below and comment SimpleNetworkBuilder section to use NDL to train RNN LM
# NDLNetworkBuilder=[
# networkDescription=$ConfigFolder$\rnnlm.ndl
# ]
SimpleNetworkBuilder=[
trainingCriterion=classcrossentropywithsoftmax
evalCriterion=classcrossentropywithsoftmax
nodeType=Sigmoid
initValueScale=6.0
layerSizes=10000:200:10000
addPrior=false
addDropoutNodes=false
applyMeanVarNorm=false
uniformInit=true;
# these are for the class information for class-based language modeling
vocabSize=10000
nbrClass=50
]
# configuration file, base parameters
SGD=[
learningRatesPerSample=0.1
momentumPerMB=0
gradientClippingWithTruncation=true
clippingThresholdPerSample=15.0
maxEpochs=40
unroll=false
numMBsToShowResult=2000
# gradUpdateType=AdaGrad
gradUpdateType=None
modelPath=$ExpFolder$\modelRnnCNTK
loadBestModel=true
# settings for Auto Adjust Learning Rate
AutoAdjust=[
# auto learning rate adjustment
autoAdjustLR=adjustafterepoch
reduceLearnRateIfImproveLessThan=0.001
continueReduce=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
numMiniBatch4LRSearch=100
numPrevLearnRates=5
numBestSearchEpoch=1
]
dropoutRate=0.0
]
reader=[
readerType=SequenceReader
randomize=None
nbruttsineachrecurrentiter=1
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.train.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
cvReader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.valid.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
#windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.valid.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
# it should be the same as that in the training set
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]
test=[
action=eval
# corresponds to the number of words/characters processed in a minibatch
minibatchSize=1
# needs to be small since models are updated for each minibatch
traceLevel=1
deviceId=-1
epochSize=4430000
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
reader=[
# reader to use
readerType=SequenceReader
randomize=None
# word class info
wordclass=$DataFolder$\vocab.txt
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpFolder$\sequenceSentence.bin
#wsize - initial size of the file in MB
# if the calculated size would be bigger, that is used instead
wsize=256
# wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough. If known, modify this element in the config before creating the file
wrecords=1000
# windowSize - number of records we should include in BinaryWriter window
windowSize=10000
file=$DataFolder$\ptb.test.cntk.txt
#additional features sections
#for now store as expanded category data (including label in)
features=[
# sentence has no features, so need to set dimension to zero
dim=0
### write definition
sectionType=data
]
# sequence break table: lists indexes into sequence records so we know when a sequence starts/stops
sequence=[
dim=1
wrecords=2
### write definition
sectionType=data
]
#labels sections
labelIn=[
dim=1
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.txt
labelType=Category
beginSequence="</s>"
endSequence="</s>"
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=11
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=11
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
#labels sections
labels=[
dim=1
labelType=NextWord
beginSequence="O"
endSequence="O"
# vocabulary size
labelDim=10000
labelMappingFile=$ExpFolder$\sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
sectionType=labels
mapping=[
#redefine number of records for this section, since we don't need to save it for each data record
wrecords=3
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=3
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]

View file

@@ -28,7 +28,7 @@ train=[
deviceId=0
epochSize=4430000
# which is 886 * 5000
# recurrentLayer=1
recurrentLayer=1
defaultHiddenActivity=0.0
useValidation=true
rnnType=CLASSLM

View file

@@ -184,8 +184,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
rpi.max = (double)configSGD("rms_wgt_max", "10.0");
rpi.gamma = (double)configSGD("rms_gamma", "0.99");
bool needAveMultiplier = (bool)configSGD("needAveMultiplier", "true");
bool needAveMultiplier = (bool)configSGD("normWithAveMultiplier", "true");
ElemType L2RegWeight = (ElemType)configSGD("L2RegWeight", "0");
ElemType L1RegWeight = (ElemType)configSGD("L1RegWeight", "0");
/// for backward compatibility. future setups should use gradUpdateType=AdaGrad instead of
/// useAdagrad=true
@@ -213,7 +214,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult,
maxTempMemSizeInSamplesForCNN, gUpdateInfo, keepCheckPointFiles, adaptationRegType, adaptationRegWeight,
trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading,
rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, needAveMultiplier, L2RegWeight);
rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, needAveMultiplier, L2RegWeight, L1RegWeight);
}
void setMomentum(float momentum)
@@ -235,7 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None,
const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"",
const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true,
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0)
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0, const ElemType L1RegWeight = 0)
{
m_numPrevLearnRates = numPrevLearnRates;
m_mbSize=mbSize;
@@ -276,6 +277,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_needAveMultiplier = needAveMultiplier;
m_L2RegWeight = L2RegWeight;
m_L1RegWeight = L1RegWeight;
for (size_t i=0; i<m_mbSize.size(); i++)
if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
@@ -1037,7 +1039,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr node = (*nodeIter);
Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_needAveMultiplier);
UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
}
}
@@ -1103,7 +1105,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
public:
// UpdateWeightsS - static version of UpdateWeights()
static void UpdateWeightsS(const SGD* sgd, Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const bool needAveMultiplier)
static void UpdateWeightsS(const SGD* sgd, Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const ElemType L1RegWeight, const bool needAveMultiplier)
{
#if DUMPOUTPUT
fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize);
@@ -1128,8 +1130,8 @@ public:
}
// L2 regularizer
if (L2RegWeight > 0)
Matrix<ElemType>::ScaleAndAdd(L2RegWeight, functionValues, gradientValues);
if (L2RegWeight > 0) //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
Matrix<ElemType>::ScaleAndAdd(L2RegWeight*actualMBSize, functionValues, gradientValues);
if (adpType == GradientsUpdateType::None)
{
@@ -1155,18 +1157,23 @@ public:
{
Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
}
// L1 regularizer with proximal gradient descent method
if (L1RegWeight > 0) //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
functionValues.InplaceSoftThreshold(learnRatePerSample*L1RegWeight*actualMBSize);
#if DUMPOUTPUT
functionValues.Print("Parameter Update");
#endif
}
protected:
// UpdateWeights - update the weights in
void UpdateWeights(const ComputationNodePtr node, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const bool needAveMultiplier) const
void UpdateWeights(const ComputationNodePtr node, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize, const ElemType L2RegWeight, const ElemType L1RegWeight, const bool needAveMultiplier) const
{
#if DUMPOUTPUT
fprintf(stderr, "Update_%ls\n",node->NodeName().c_str());
#endif
UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize, L2RegWeight, needAveMultiplier);
UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize, L2RegWeight, L1RegWeight, needAveMultiplier);
node->UpdateEvalTimeStamp();
}
@@ -1357,9 +1364,6 @@ protected:
irow = max(0, irow);
icol = max(0, icol);
if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
continue;
fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
// node->FunctionValues().Print();
ElemType eOrg = node->FunctionValues()(irow, icol);
@@ -1368,6 +1372,10 @@ protected:
node->UpdateEvalTimeStamp();
net.ComputeGradient(criterionNodes[npos]); //use only the first criterion. Is
// if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE && node->GradientValues().GetDeviceId() != CPUDEVICE)
if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
break;
//ElemType mbEvalCri =
criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
ElemType eGradErr = node->GradientValues()(irow, icol);
@@ -1473,6 +1481,7 @@ protected:
bool m_needAveMultiplier;
ElemType m_L2RegWeight;
ElemType m_L1RegWeight;
};
template class SGD<float>;
template class SGD<double>;
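Taken together, the SGD changes above add L1 regularization alongside the existing L2 term: L2 is folded into the gradient before the update (the ScaleAndAdd call), while L1 is applied afterwards as a proximal soft-threshold step, and both are scaled by actualMBSize so that, with a per-sample learning rate, the effective penalty does not depend on the minibatch size. A minimal sketch of the resulting update, assuming plain SGD (gradUpdateType=None) and illustrative names:

    #include <vector>
    #include <cstddef>

    // Sketch of the update performed by UpdateWeightsS for one parameter matrix,
    // flattened to a vector; momentum and AdaGrad/RMSProp paths are omitted.
    void UpdateWeightsSketch(std::vector<double>& w, const std::vector<double>& grad,
                             double learnRatePerSample, std::size_t actualMBSize,
                             double L2RegWeight, double L1RegWeight)
    {
        const double l1Step = learnRatePerSample * L1RegWeight * actualMBSize;
        for (std::size_t i = 0; i < w.size(); ++i)
        {
            // L2: ScaleAndAdd(L2RegWeight*actualMBSize, functionValues, gradientValues)
            const double g = grad[i] + L2RegWeight * actualMBSize * w[i];
            w[i] -= learnRatePerSample * g;                 // gradient step
            // L1: InplaceSoftThreshold(learnRatePerSample*L1RegWeight*actualMBSize)
            if (w[i] > l1Step)       w[i] -= l1Step;
            else if (w[i] < -l1Step) w[i] += l1Step;
            else                     w[i] = 0.0;
        }
    }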

View file

@@ -45,7 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (numHiddenLayers > 0)
{
//TODO: to figure out sparse matrix size
u = m_net->CreateSparseLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0], 0);
u = m_net->CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
@@ -76,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i=1; i<numHiddenLayers; i++)
{
//TODO: to figure out sparse matrix size
u = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"U%d", i), m_layerSizes[i+1], m_layerSizes[i], 0);
u = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"U%d", i), m_layerSizes[i+1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i+1)
@@ -227,7 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
clsweight = m_net->CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = m_net->LogSoftmax(m_net->Times(clsweight, input), L"ClassPostProb");
clslogpostprob = m_net->Times(clsweight, input, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
clslogpostprob);
@@ -770,7 +770,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
ComputationNode<ElemType>* SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs, bool inputWeightSparse)
ComputationNode<ElemType>* SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
{
size_t numHiddenLayers = m_layerSizes.size()-2;
@@ -784,20 +784,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr bit=nullptr, bft=nullptr, bct=nullptr;
input = inputObs;
if(inputWeightSparse)
{
Wxo = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim);
Wxi = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim);
Wxf = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim);
Wxc = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim);
}
else
{
Wxo = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim);
Wxi = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim);
Wxf = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim);
Wxc = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim);
}
Wxo = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXO%d", iLayer), outputDim, inputDim);
Wxi = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXI%d", iLayer), outputDim, inputDim);
Wxf = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXF%d", iLayer), outputDim, inputDim);
Wxc = m_net->CreateLearnableParameter(msra::strfun::wstrprintf (L"WXC%d", iLayer), outputDim, inputDim);
m_net->InitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
@@ -1082,10 +1073,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr input=nullptr, w=nullptr, b=nullptr, u=nullptr, e=nullptr, delay = nullptr, output=nullptr, label=nullptr, prior=nullptr;
ComputationNodePtr Wxo = nullptr, Who=nullptr, Wco=nullptr, bo = nullptr, Wxi=nullptr, Whi=nullptr, Wci=nullptr, bi=nullptr;
ComputationNodePtr Wxf=nullptr, Whf=nullptr, Wcf=nullptr, bf=nullptr, Wxc=nullptr, Whc=nullptr, bc=nullptr;
ComputationNodePtr ot=nullptr, it=nullptr, ft=nullptr, ct=nullptr, ht=nullptr;
ComputationNodePtr delayHI = nullptr, delayCI = nullptr, delayHO = nullptr, delayHF = nullptr, delayHC=nullptr, delayCF=nullptr, delayCC=nullptr;
ComputationNodePtr directWIO = nullptr, directInput=nullptr, directOutput=nullptr;
ComputationNodePtr clslogpostprob = nullptr;
ComputationNodePtr clsweight = nullptr;
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = {nullptr};
input = m_net->CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
@@ -1120,7 +1109,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int offset = m_lookupTableOrder > 0? 1 : 0;
if (numHiddenLayers > 0)
{
output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset]*(offset?m_lookupTableOrder:1), m_layerSizes[offset+1], input, true);
output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset]*(offset?m_lookupTableOrder:1), m_layerSizes[offset+1], input);
input = output;
outputFromEachLayer[offset+1] = input;
@@ -1157,20 +1146,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
input = output;
}
// TODO: verify the change is okay
// w = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers]*(MAX_WORDS_PER_CLASS+MAX_CLASSES)*mbSize*NUM_UTTS_IN_RECURRENT_ITER);
w = m_net->CreateSparseLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
/// need to have [input_dim x output_dim] matrix
/// e.g., [200 x 10000], where 10000 is the vocabulary size
/// this is for speed-up: the per-word weight vector can then be obtained with a simple column slice
w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: verify the change is okay
//label = m_net->CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize, 2*mbSize*NUM_UTTS_IN_RECURRENT_ITER);
label = m_net->CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
AddTrainAndEvalCriterionNodes(input, label, w);
output = m_net->Times(w, input, L"outputs");
/// the label is a dense matrix. each element is the word index
label = m_net->CreateInputNode(L"labels", 4, mbSize);
clsweight = m_net->CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = m_net->Times(clsweight, input, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
clslogpostprob);
m_net->OutputNodes().push_back(output);
if (m_needPrior)
{
prior = m_net->Mean(label);
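The rewiring above replaces the LSTM language model's full 10000-way softmax with the class-based factorization already used by BuildClassEntropyNetwork: the output distribution is factored as

    P(w_t \mid h_t) = P(c(w_t) \mid h_t) \, P(w_t \mid c(w_t), h_t)

so only the class posterior (nbrClass=50 outputs) and the words within one class need to be normalized per step. Storing W as [input_dim x output_dim] (e.g. 200 x 10000) means the weight column of any word, or the block for one class, can be read with a single ColumnSlice, which is the speed-up the comment refers to.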

View file

@@ -330,7 +330,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNetwork<ElemType>& BuildClassEntropyNetwork(size_t mbSize = 1);
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, bool inputWeightSparse = false);
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNode<ElemType>* BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);

View file

@@ -1041,6 +1041,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
clsLogSoftmax.Resize(input_cls_log_post_prob.GetNumRows(), nT);
clsSoftmax.Resize(input_cls_log_post_prob.GetNumRows(), nT);
clsLogSoftmax = input_cls_log_post_prob;
clsLogSoftmax.InplaceLogSoftmax(true); /// 50 x nT
clsSoftmax.AssignExpOf(clsLogSoftmax);
/// loop over time
functionValues.SetValue(0);
sz = 0;
@@ -1075,12 +1079,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>::AddElementToElement(logSoftMax_t, 0, idx_in_class, functionValues, 0, 0);
/// add the class log posterior probability
Matrix<ElemType> clsLogSoftmax_t = clsLogSoftmax.ColumnSlice(t, 1);
clsLogSoftmax_t.SetValue(input_cls_log_post_prob.ColumnSlice(t, 1));
clsLogSoftmax_t.InplaceLogSoftmax(true); /// 50 x 1
Matrix<ElemType> clsSoftmax_t = clsSoftmax.ColumnSlice(t, 1);
clsSoftmax_t.AssignExpOf(clsLogSoftmax_t);
Matrix<ElemType>::AddElementToElement(clsLogSoftmax_t, c_t, 0, functionValues, 0, 0);
Matrix<ElemType>::AddElementToElement(clsLogSoftmax, c_t, t, functionValues, 0, 0);
sz += nbr_wrd;
}
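This hunk hoists the class log-softmax out of the per-frame loop: the 50 x nT block of class scores is normalized once with InplaceLogSoftmax, and the loop then reads element (c_t, t) directly instead of slicing, copying, and normalizing a 50 x 1 column for every t. Up to the node's sign convention, the quantity accumulated in functionValues is the class-based cross entropy

    J = -\sum_t \left[ \log P(w_t \mid c(w_t), h_t) + \log P(c(w_t) \mid h_t) \right]

with the two AddElementToElement calls contributing the word term and the class term for each frame.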

View file

@@ -2296,7 +2296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceTruncate (const ElemType threshold)
{
if (IsEmpty())
throw std::logic_error("InplaceTruncateBottom: Matrix is empty.");
throw std::logic_error("InplaceTruncate: Matrix is empty.");
auto& us=*this;
ElemType locThresholdPos = abs(threshold);
@@ -2342,6 +2342,60 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
//x= x-threshold if x>threshold, x+threshold if x<-threshold, 0 otherwise
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
if (IsEmpty())
throw std::logic_error("InplaceTruncate: Matrix is empty.");
long m = (long)GetNumElements();
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (m_pArray[i] > threshold)
m_pArray[i] -= threshold;
else if (m_pArray[i] < -threshold)
m_pArray[i] += threshold;
else
m_pArray[i] = 0;
if (m_pArray[i+1] > threshold)
m_pArray[i+1] -= threshold;
else if (m_pArray[i+1] < -threshold)
m_pArray[i+1] += threshold;
else
m_pArray[i+1] = 0;
if (m_pArray[i+2] > threshold)
m_pArray[i+2] -= threshold;
else if (m_pArray[i+2] < -threshold)
m_pArray[i+2] += threshold;
else
m_pArray[i+2] = 0;
if (m_pArray[i+3] > threshold)
m_pArray[i+3] -= threshold;
else if (m_pArray[i+3] < -threshold)
m_pArray[i+3] += threshold;
else
m_pArray[i+3] = 0;
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
if (m_pArray[i] > threshold)
m_pArray[i] -= threshold;
else if (m_pArray[i] < -threshold)
m_pArray[i] += threshold;
else
m_pArray[i] = 0;
}
return *this;
}
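The operation above is the proximal operator of the scaled L1 norm; in LaTeX:

    \operatorname{prox}_{t\,\|\cdot\|_1}(x)_i = \operatorname{sign}(x_i)\,\max\big(|x_i| - t,\, 0\big)

Applying it with t = learnRatePerSample * L1RegWeight * actualMBSize immediately after the gradient step, as SGD.h now does, is proximal gradient descent on the L1-regularized objective; unlike a plain subgradient step it drives small weights exactly to zero.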
//Threshold truncating: this[i] = max( a[i], threshold )
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignTruncateBottomOf (const CPUMatrix<ElemType>& a, const ElemType threshold)

View file

@@ -197,6 +197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
CPUMatrix<ElemType>& AssignTruncateTopOf (const CPUMatrix<ElemType>& a, const ElemType threshold);
CPUMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
CPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);

View file

@@ -25,6 +25,7 @@
#include "basetypes.h"
#include "fileutil.h"
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#ifndef USE_MKL
// use ACML as default.
@@ -703,38 +704,244 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return 1;
}
template<class ElemType>
CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncateTop(const ElemType threshold)
{
long m = (long)NzCount();
ElemType *nzValues = NzValues();
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (nzValues[i] > threshold)
nzValues[i] = threshold;
if (nzValues[i+1] > threshold)
nzValues[i+1] = threshold;
if (nzValues[i+2] > threshold)
nzValues[i+2] = threshold;
if (nzValues[i+3] > threshold)
nzValues[i+3] = threshold;
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
if (nzValues[i] > threshold)
nzValues[i] = threshold;
}
return *this;
}
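A note on the (m & ~3) idiom used throughout these loops: clearing the low two bits rounds m down to a multiple of four, so the unrolled OpenMP loop covers aligned groups of four and the scalar tail loop handles at most three leftovers. A tiny self-contained illustration:

    #include <cstdio>

    int main()
    {
        long m = 10;
        long bulk = m & ~3;   // 8: the largest multiple of 4 not exceeding m
        // the unrolled loop runs i = 0, 4 (covering elements 0..7);
        // the tail loop runs i = 8, 9
        std::printf("bulk=[0,%ld), tail=[%ld,%ld)\n", bulk, bulk, m);
        return 0;
    }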
template<class ElemType>
CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
{
long m = (long)NzCount();
ElemType *nzValues = NzValues();
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (nzValues[i] < threshold)
nzValues[i] = threshold;
if (nzValues[i + 1] < threshold)
nzValues[i + 1] = threshold;
if (nzValues[i + 2] < threshold)
nzValues[i + 2] = threshold;
if (nzValues[i + 3] < threshold)
nzValues[i + 3] = threshold;
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
if (nzValues[i] < threshold)
nzValues[i] = threshold;
}
return *this;
}
template<class ElemType>
CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncate (const ElemType threshold)
{
if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
ElemType locThresholdPos = abs(threshold);
ElemType locTHresholdNeg = -locThresholdPos;
ElemType locThresholdPos = abs(threshold);
ElemType locTHresholdNeg = -locThresholdPos;
for(size_t j = 0; j < m_blockSize; j++)
{
size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
size_t start = j* len;
for (size_t p = start; p < start+len; p++)
{
if (m_pArray[p] > locThresholdPos)
{
m_pArray[p] = locThresholdPos;
}
else if (m_pArray[p] < locTHresholdNeg)
{
m_pArray[p] = locTHresholdNeg;
}
}
}
}
else
long m = (long)NzCount();
ElemType *nzValues = NzValues();
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
throw std::runtime_error("CPUSparseMatrix:: InplaceTruncate() only support block based sparse matrix");
if (nzValues[i] > locThresholdPos)
nzValues[i] = locThresholdPos;
else if (nzValues[i] < locTHresholdNeg)
nzValues[i] = locTHresholdNeg;
if (nzValues[i+1] > locThresholdPos)
nzValues[i+1] = locThresholdPos;
else if (nzValues[i+1] < locTHresholdNeg)
nzValues[i+1] = locTHresholdNeg;
if (nzValues[i+2] > locThresholdPos)
nzValues[i+2] = locThresholdPos;
else if (nzValues[i+2] < locTHresholdNeg)
nzValues[i+2] = locTHresholdNeg;
if (nzValues[i+3] > locThresholdPos)
nzValues[i+3] = locThresholdPos;
else if (nzValues[i+3] < locTHresholdNeg)
nzValues[i+3] = locTHresholdNeg;
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
if (nzValues[i] > locThresholdPos)
nzValues[i] = locThresholdPos;
else if (nzValues[i] < locTHresholdNeg)
nzValues[i] = locTHresholdNeg;
}
return *this;
}
template<class ElemType>
CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
long m = (long)NzCount();
ElemType *nzValues = NzValues();
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (nzValues[i] > threshold)
nzValues[i] -= threshold;
else if (nzValues[i] < -threshold)
nzValues[i] += threshold;
else
nzValues[i] = 0;
if (nzValues[i + 1] > threshold)
nzValues[i + 1] -= threshold;
else if (nzValues[i + 1] < -threshold)
nzValues[i + 1] += threshold;
else
nzValues[i + 1] = 0;
if (nzValues[i + 2] > threshold)
nzValues[i + 2] -= threshold;
else if (nzValues[i + 2] < -threshold)
nzValues[i + 2] += threshold;
else
nzValues[i + 2] = 0;
if (nzValues[i + 3] > threshold)
nzValues[i + 3] -= threshold;
else if (nzValues[i + 3] < -threshold)
nzValues[i + 3] += threshold;
else
nzValues[i + 3] = 0;
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
if (nzValues[i] > threshold)
nzValues[i] -= threshold;
else if (nzValues[i] < -threshold)
nzValues[i] += threshold;
else
nzValues[i] = 0;
}
return *this;
}
template<class ElemType>
ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
{
if (IsEmpty())
throw std::logic_error("FrobeniusNorm: Matrix is empty.");
ElemType v = 0;
long m = (long)NzCount();
const ElemType *nzValues = NzValues();
//four-way unrolling
#pragma omp parallel for reduction(+:v)
for (long i = 0; i<(m & ~3); i += 4)
{
v += nzValues[i] * nzValues[i] + nzValues[i + 1] * nzValues[i + 1] + nzValues[i + 2] * nzValues[i + 2] + nzValues[i + 3] * nzValues[i + 3];
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
v += nzValues[i] * nzValues[i];
}
return sqrt(v);
}
//sum of all abs(elements)
template<class ElemType>
ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
{
if (IsEmpty())
throw std::logic_error("SumOfAbsElements: Matrix is empty.");
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
return (ElemType)dasum((int)NzCount(), reinterpret_cast <double*>(m_pArray), 1);
#else
return (ElemType)cblas_dasum((int)NzCount(), reinterpret_cast <double*>(m_pArray), 1);
#endif
}
else
{
#pragma warning (suppress: 4244)
#ifndef USE_MKL
return sasum((int)NzCount(), reinterpret_cast <float*>(m_pArray), 1);
#else
return cblas_sasum((int)NzCount(), reinterpret_cast <float*>(m_pArray), 1);
#endif
}
}
//sum of all elements
template<class ElemType>
ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
{
if (IsEmpty())
throw std::logic_error("SumOfElements: Matrix is empty.");
ElemType sum = 0;
long m = (long)NzCount();
const ElemType *nzValues = NzValues();
//four-way unrolling
#pragma omp parallel for reduction(+:sum)
for (long i = 0; i<(m & ~3); i += 4)
{
sum += nzValues[i] + nzValues[i + 1] + nzValues[i + 2] + nzValues[i + 3];
}
//handle the remaining elements
for (long i = m & ~3; i<m; i++)
{
sum += nzValues[i];
}
return sum;
}
template <class ElemType>
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
{

View file

@@ -108,10 +108,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
ElemType Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier);
public:
CPUSparseMatrix<ElemType>& InplaceTruncateTop (const ElemType /*threshold*/) { NOT_IMPLEMENTED; }
CPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType /*threshold*/) { NOT_IMPLEMENTED; }
CPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType /*threshold*/);
public:
CPUSparseMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceTruncateBottom(const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
ElemType FrobeniusNorm() const; //useful for comparing CPU and GPU results
ElemType SumOfAbsElements() const; //sum of all abs(elements)
ElemType SumOfElements() const; //sum of all elements
public:
//void Print(const char* /*matrixName*/) const { NOT_IMPLEMENTED; }

View file

@@ -11,10 +11,10 @@
#define AUTOPLACEMATRIX 1000 // used in parameters only
#define MANAGEDEXTERN -2 // managed externally (i.e. PTask)
#define CPUDEVICE -1 // device is the CPU
#define EPS_IN_INVERSE 1e-30f // min float is 1.4e-45 and max float is 3.4e-38
#define EPS_IN_LOG 1e-40f
#define LOG_OF_EPS_IN_LOG -92.1f // log(EPS_IN_LOG)
#define LOG10_OF_EPS_IN_LOG -40 // log_10(EPS_IN_LOG)
#define EPS_IN_INVERSE 1e-30f // 1e-37 is the only guaranteed precision
#define EPS_IN_LOG 1e-37f // 1e-37 is the only guaranteed precision
#define LOG_OF_EPS_IN_LOG -85.1f // log(EPS_IN_LOG)
#define LOG10_OF_EPS_IN_LOG -37 // log_10(EPS_IN_LOG)
#define LZERO -10e10
#define MINLOGEXP -9.2103
#define LSMALL -0.5E10
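For reference, the new constants are mutually consistent: 1e-37 is the smallest magnitude the C standard guarantees for a normalized float (it requires FLT_MIN <= 1e-37), and

    \log(10^{-37}) = -37 \ln 10 \approx -85.196, \qquad \log_{10}(10^{-37}) = -37

so LOG10_OF_EPS_IN_LOG is exact, while LOG_OF_EPS_IN_LOG is stored as -85.1f, slightly above the true value of about -85.196.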

View file

@@ -2003,6 +2003,42 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
{
if (IsEmpty())
throw std::logic_error("InplaceTruncate: Matrix is empty.");
LONG64 N = (LONG64)GetNumElements();
int blocksPerGrid = (int)ceil(N*1.0 / threadsPerBlock);
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_inplaceTruncate<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(m_pArray, threshold, N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
return *this;
}
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
if (IsEmpty())
throw std::logic_error("InplaceSoftThreshold: Matrix is empty.");
LONG64 N = (LONG64)GetNumElements();
int blocksPerGrid = (int)ceil(N*1.0 / threadsPerBlock);
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_inplaceSoftThreshold<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(m_pArray, threshold, N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
return *this;
}
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan (const ElemType threshold)
{

View file

@@ -218,6 +218,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>& AssignTruncateBottomOf (const GPUMatrix<ElemType>& a, const ElemType threshold);
GPUMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
GPUMatrix<ElemType>& AssignTruncateTopOf (const GPUMatrix<ElemType>& a, const ElemType threshold);
GPUMatrix<ElemType>& InplaceTruncate(const ElemType threshold);
GPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);

View file

@@ -2784,6 +2784,29 @@ __global__ void _inplaceTruncate(
}
}
template<class ElemType>
__global__ void _inplaceSoftThreshold(
ElemType* a,
const ElemType threshold,
const LONG64 N)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
if (a[id] > threshold)
{
a[id] -= threshold;
}
else if (a[id] < -threshold)
{
a[id] += threshold;
}
else
a[id] = 0;
}
template<class ElemType>
__global__ void _normalGradForSparseBlock(
const ElemType momentum,

View file

@@ -1108,26 +1108,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncate (const ElemType threshold)
{
if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow ||
m_format == matrixFormatSparseCSR || m_format == matrixFormatSparseCSC)
{
long N=(long)GetNumNZElements();
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
ElemType * values = NzValues();
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
}
else
{
NOT_IMPLEMENTED;
}
long N=(long)GetNumNZElements();
long blocksPerGrid = (long)ceil(N*1.0 / threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
ElemType * values = NzValues();
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
return *this;
}
template<class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
long N = (long)GetNumNZElements();
long blocksPerGrid = (long)ceil(N*1.0 / threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
ElemType * values = NzValues();
_inplaceSoftThreshold<ElemType> << <blocksPerGrid, threadsPerBlock >> >(values, threshold, N);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
return *this;
}
// normal update for smoothed gradients c and current gradients (this)
template<class ElemType>
void GPUSparseMatrix<ElemType>::NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum)

View file

@@ -232,6 +232,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUSparseMatrix<ElemType>& AssignAbsOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignTruncateBottomOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);

View file

@@ -2440,7 +2440,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& Matrix<ElemType>::InplaceTruncate(const ElemType threshold)
{
if (IsEmpty())
throw std::logic_error("InplaceTruncateBottom: Matrix is empty.");
throw std::logic_error("InplaceTruncate: Matrix is empty.");
if (sizeof(ElemType)==sizeof(float))
{
@@ -2456,7 +2456,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
DISPATCH_MATRIX_ON_FLAG(this,
this,
this->m_CPUMatrix->InplaceTruncate(threshold),
this->m_GPUMatrix->InplaceTruncateTop(fabs(threshold)); this->m_GPUMatrix->InplaceTruncateBottom(-fabs(threshold)),
this->m_GPUMatrix->InplaceTruncate(threshold),
this->m_CPUSparseMatrix->InplaceTruncate(threshold),
this->m_GPUSparseMatrix->InplaceTruncate(threshold)
);
@@ -2464,6 +2464,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
assert(threshold >= 0);
if (IsEmpty())
throw std::logic_error("InplaceSoftThreshold: Matrix is empty.");
if (threshold == 0)
return *this;
DISPATCH_MATRIX_ON_FLAG(this,
this,
this->m_CPUMatrix->InplaceSoftThreshold(threshold),
this->m_GPUMatrix->InplaceSoftThreshold(threshold),
this->m_CPUSparseMatrix->InplaceSoftThreshold(threshold),
this->m_GPUSparseMatrix->InplaceSoftThreshold(threshold)
);
return *this;
}
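DISPATCH_MATRIX_ON_FLAG forwards the call to whichever of the four backing stores currently holds the data, in the argument order CPU dense, GPU dense, CPU sparse, GPU sparse; with the InplaceSoftThreshold overloads this commit adds to all four matrix classes, every branch of the dispatch is implemented. A hypothetical sketch of the pattern's shape, for orientation only (the real macro differs):

    // Illustrative only: not the actual macro body.
    enum class MatrixLoc { CpuDense, GpuDense, CpuSparse, GpuSparse };

    template <typename F1, typename F2, typename F3, typename F4>
    void DispatchOnFlag(MatrixLoc loc, F1 cpu, F2 gpu, F3 cpuSparse, F4 gpuSparse)
    {
        switch (loc)
        {
        case MatrixLoc::CpuDense:  cpu();       break;  // m_CPUMatrix path
        case MatrixLoc::GpuDense:  gpu();       break;  // m_GPUMatrix path
        case MatrixLoc::CpuSparse: cpuSparse(); break;  // m_CPUSparseMatrix path
        case MatrixLoc::GpuSparse: gpuSparse(); break;  // m_GPUSparseMatrix path
        }
    }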
//Threshold truncating: this[i] = max( this[i], threshold )
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceTruncateBottom (const ElemType threshold)
@@ -2486,7 +2507,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
this,
this->m_CPUMatrix->InplaceTruncateBottom(threshold),
this->m_GPUMatrix->InplaceTruncateBottom(threshold),
NOT_IMPLEMENTED,
this->m_CPUSparseMatrix->InplaceTruncateBottom(threshold),
this->m_GPUSparseMatrix->InplaceTruncateBottom(threshold)
);
@@ -2542,18 +2563,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (!isfinite((float)threshold))
return *this;
}
}
else
{
if (!isfinite(threshold))
return *this;
}
}
DISPATCH_MATRIX_ON_FLAG(this,
this,
this->m_CPUMatrix->InplaceTruncateTop(threshold),
this->m_GPUMatrix->InplaceTruncateTop(threshold),
NOT_IMPLEMENTED,
this->m_CPUSparseMatrix->InplaceTruncateTop(threshold),
this->m_GPUSparseMatrix->InplaceTruncateTop(threshold)
);
@@ -2626,7 +2647,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nullptr,
return this->m_CPUMatrix->SumOfElements(),
return this->m_GPUMatrix->SumOfElements(),
NOT_IMPLEMENTED,
return this->m_CPUSparseMatrix->SumOfElements(),
return this->m_GPUSparseMatrix->SumOfElements()
);
@@ -2869,7 +2890,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nullptr,
return this->m_CPUMatrix->FrobeniusNorm(),
return this->m_GPUMatrix->FrobeniusNorm(),
NOT_IMPLEMENTED,
return this->m_CPUSparseMatrix->FrobeniusNorm(),
return this->m_GPUSparseMatrix->FrobeniusNorm()
);
}

View file

@@ -236,6 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
Matrix<ElemType>& AssignTruncateTopOf (const Matrix<ElemType>& a, const ElemType threshold);
Matrix<ElemType>& InplaceTruncate (const ElemType threshold);
Matrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
Matrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);