Fixed the bug by merging

2015-02-08 02:55:40 -05:00 · 2015-02-08 02:55:40 -05:00 · 2d1ec9471d
--- a/Common/TimerUtility.cpp
+++ b/Common/TimerUtility.cpp
@ -1,4 +1,3 @@
-<<<<<<< HEAD
 #include "TimerUtility.h"

 #ifdef WIN32
@ -38,44 +37,3 @@ namespace Microsoft{
        }
    }
 }
-=======
-#include "TimerUtility.h"
-
-#ifdef WIN32
-#include <Windows.h>
-#else
-#include <time.h>
-#endif
-namespace Microsoft{
-    namespace MSR {
-        namespace CNTK {
-
-            //Returns the amount of milliseconds elapsed
-            unsigned long long Timer::MilliSecondElapsed()
-            {
-#ifdef WIN32
-                FILETIME ft;
-                LARGE_INTEGER li;
-
-                GetSystemTimeAsFileTime(&ft); //ideally we should use GetSystemTimePreciseAsFileTime. But it's only available with Win8+ and Win Server 2012+
-                li.LowPart = ft.dwLowDateTime;
-                li.HighPart = ft.dwHighDateTime;
-
-                unsigned long long ret = li.QuadPart;
-                ret -= 116444736000000000LL; // Make the values consistent with Linux. 
-                ret /= 10000; // From 100 nano seconds (10^-7) to 1 millisecond (10^-3) 
-
-                return ret;
-#else
-                timespec ts;
-                clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
-
-                UINT64 ret = ts.tv_sec * 1000 + ts.tv_nsec/1000000;
-
-                return ret;
-#endif
-            }
-        }
-    }
-}
->>>>>>> origin/master
--- a/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
+++ b/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
@ -498,17 +498,13 @@ public:
                // I.e. our chunks are a little larger than wanted (on av. half the av. utterance length).
                if (thisallchunks.empty() || thisallchunks.back().totalframes > chunkframes || thisallchunks.back().numutterances() >= frameref::maxutterancesperchunk)
                {
-                    //fprintf(stderr, "hahahahahaahhaha %d %d %d %d %d\n", chunkframes,_totalframes, i, thisallchunks.back().totalframes,thisallchunks.back().numutterances());
-                    //fprintf(stderr, "hahahahahaahahah %d\n", thisallchunks.size());
                    thisallchunks.push_back (utterancechunkdata());

-                fprintf(stderr, "after  hahahahahaahhaha %d %d %d %d %d %d\n", chunkframes,_totalframes, i, thisallchunks.back().totalframes,thisallchunks.back().numutterances(),utteranceset.size());

                }
                // append utterance to last chunk
                utterancechunkdata & currentchunk = thisallchunks.back();
                    //std::move(utteranceset[i]);
-                //fprintf(stderr, "after  hahahahahaahhaha %d %d %d %d %d %d\n", chunkframes,_totalframes, i, thisallchunks.back().totalframes,thisallchunks.back().numutterances(),utteranceset.size());

                currentchunk.push_back (std::move (utteranceset[i]));    // move it out from our temp array into the chunk
                // TODO: above push_back does not actually 'move' because the internal push_back does not accept that
--- a/DataReader/SequenceReader/SequenceParser.h
+++ b/DataReader/SequenceReader/SequenceParser.h
@ -1,4 +1,3 @@
-<<<<<<< HEAD
 // SequenceParser.h : Parses the UCI format using a custom state machine (for speed)
 //
 //
@ -636,621 +635,3 @@ public:
    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos);

 };
-=======
-// SequenceParser.h : Parses the UCI format using a custom state machine (for speed)
-//
-//
-// <copyright file="SequenceParser.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-
-#include <string>
-#include <vector>
-#include <assert.h>
-#include <fstream>
-#include <map>
-#include <stdint.h>
-
-using namespace std;
-
-#define MAXSTRING 500000
-// UCI label location types
-enum LabelMode
-{
-    LabelNone = 0,
-    LabelFirst = 1,
-    LabelLast = 2,
-};
-
-enum ParseMode
-{
-    ParseNormal = 0,
-    ParseLineCount = 1
-};
-
-enum SequenceFlags
-{
-    seqFlagNull = 0,
-    seqFlagLineBreak = 1, // line break on the parsed line
-    seqFlagEmptyLine = 2, // empty line
-    seqFlagStartLabel = 4,
-    seqFlagStopLabel = 8
-};
-
-// SequencePosition, save the ending indexes into the array for a sequence
-struct SequencePosition
-{
-    size_t numberPos; // max position in the number array for this sequence
-    size_t labelPos; // max position in the label array for this sequence
-    unsigned flags; // flags that apply to this sequence
-    SequencePosition(size_t numPos, size_t labelPos, unsigned flags):
-        numberPos(numPos), labelPos(labelPos), flags(flags)
-    {}
-};
-
-// SequenceParser - the parser for the UCI format files
-// for ultimate speed, this class implements a state machine to read these format files
-template <typename NumType, typename LabelType=int>
-class SequenceParser
-{
-protected:
-    enum ParseState
-    {
-        WholeNumber = 0,
-        Remainder = 1,
-        Exponent = 2,
-        Whitespace = 3,
-        Sign = 4,
-        ExponentSign = 5,
-        Period = 6,
-        TheLetterE = 7,
-        EndOfLine = 8, 
-        Label = 9, // any non-number things we run into
-        ParseStateMax = 10, // number of parse states
-        LineCountEOL = 10,
-        LineCountOther = 11,
-        AllStateMax = 12
-    };
-
-    // type of label processing
-    ParseMode m_parseMode;
-
-    // definition of label and feature dimensions
-    size_t m_dimFeatures;
-
-    size_t m_dimLabelsIn;
-    std::string m_beginSequenceIn; // starting sequence string (i.e. <s>)
-    std::string m_endSequenceIn; // ending sequence string (i.e. </s>)
-
-    size_t m_dimLabelsOut;
-    std::string m_beginSequenceOut; // starting sequence string (i.e. 'O')
-    std::string m_endSequenceOut; // ending sequence string (i.e. 'O')
-
-    // level of screen output
-    int m_traceLevel;
-
-    // current state of the state machine
-    ParseState m_current_state;
-
-    // state tables
-    DWORD *m_stateTable;
-
-    // numeric state machine variables
-    double m_partialResult;
-    double m_builtUpNumber;
-    double m_divider;
-    double m_wholeNumberMultiplier;
-    double m_exponentMultiplier;
-
-    // label state machine variables
-    size_t m_spaceDelimitedStart;
-    size_t m_spaceDelimitedMax; // start of the next whitespace sequence (one past the end of the last word)
-    int m_numbersConvertedThisLine;
-    int m_labelsConvertedThisLine;
-    int m_elementsConvertedThisLine;
-
-    // sequence state machine variables
-    bool m_beginSequence;
-    bool m_endSequence;
-    std::string m_beginTag;
-    std::string m_endTag;
-
-    // global stats
-    int m_totalNumbersConverted;
-    int m_totalLabelsConverted;
-
-    // file positions/buffer
-    FILE * m_pFile;
-    int64_t m_byteCounter;
-    int64_t m_fileSize;
-
-    BYTE * m_fileBuffer;
-    size_t m_bufferStart;
-    size_t m_bufferSize;
-
-    // last label was a string (for last label processing)
-    bool m_lastLabelIsString;
-
-    // vectors to append to
-    std::vector<NumType>* m_numbers; // pointer to vectors to append with numbers
-    std::vector<LabelType>* m_labels; // pointer to vector to append with labels (may be numeric)
-    // FUTURE: do we want a vector to collect string labels in the non string label case? (signifies an error)
-
-    // SetState for a particular value
-    void SetState(int value, ParseState m_current_state, ParseState next_state);
-
-    // SetStateRange - set states transitions for a range of values
-    void SetStateRange(int value1, int value2, ParseState m_current_state, ParseState next_state);
-
-    // SetupStateTables - setup state transition tables for each state
-    // each state has a block of 256 states indexed by the incoming character
-    void SetupStateTables();
-
-    // reset all line state variables
-    void PrepareStartLine();
-
-    // reset all number accumulation variables
-    void PrepareStartNumber();
-
-    // reset all state variables to start reading at a new position
-    void PrepareStartPosition(size_t position);
-
-    // UpdateBuffer - load the next buffer full of data
-    // returns - number of records read
-    size_t UpdateBuffer();
-
-public:
-
-    // SequenceParser constructor
-    SequenceParser();
-    // setup all the state variables and state tables for state machine
-    void Init();
-
-    // Parser destructor
-    ~SequenceParser();
-
-private:
-    // DoneWithLabel - Called when a string label is found
-    void DoneWithLabel();
-
-    // Called when a number is complete
-    void DoneWithValue();
-
-    // store label is specialized by LabelType
-    void StoreLabel(NumType value);
-
-    // StoreLastLabel - store the last label (for numeric types), transfers to label vector
-    // string label types handled in specialization
-    void StoreLastLabel();
-
-public:
-    // SetParseMode - Set the parsing mode
-    // mode - set mode to either ParseLineCount, or ParseNormal
-    void SetParseMode(ParseMode mode);
-
-    // SetTraceLevel - Set the level of screen output
-    // traceLevel - traceLevel, zero means no output, 1 epoch related output, > 1 all output
-    void SetTraceLevel(int traceLevel);
-
-
-    // ParseInit - Initialize a parse of a file
-    // fileName - path to the file to open
-    // dimFeatures - number of features for precomputed features
-    // dimLabelsIn - number of labels possible on input
-    // dimLabelsOut - number of labels possible on output
-    // beginSequenceIn - beginSequence input label
-    // endSequenceIn - endSequence input label
-    // beginSequenceOut - beginSequence output label
-    // endSequenceOut - endSequence output label
-    // bufferSize - size of temporary buffer to store reads
-    // startPosition - file position on which we should start
-    void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="<s>", std::string endSequenceIn="</s>", std::string beginSequenceOut="O", std::string endSequenceOut="O", size_t bufferSize=1024*256, size_t startPosition=0)
-    {
-        assert(fileName != NULL);
-        m_dimFeatures = dimFeatures;
-        m_dimLabelsIn = dimLabelsIn;
-        m_beginSequenceIn = beginSequenceIn;
-        m_endSequenceIn = endSequenceIn;
-        m_dimLabelsOut = dimLabelsOut;
-        m_beginSequenceOut = beginSequenceOut;
-        m_endSequenceOut = endSequenceOut;
-
-        m_parseMode = ParseNormal;
-        m_traceLevel = 0;
-        m_bufferSize = bufferSize;
-        m_bufferStart = startPosition;
-
-        m_beginTag = m_beginSequenceIn;
-        m_endTag = m_endSequenceIn;
-
-        // if we have a file already open, cleanup
-        if (m_pFile != NULL)
-            SequenceParser<NumType, LabelType>::~SequenceParser();
-
-        errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" );
-        if (err)
-            RuntimeError("SequenceParser::ParseInit - error opening file"); 
-        int rc = _fseeki64(m_pFile, 0, SEEK_END);
-        if (rc)
-            RuntimeError("SequenceParser::ParseInit - error seeking in file");
-
-        m_fileSize = GetFilePosition();
-        m_fileBuffer = new BYTE[m_bufferSize];
-        SetFilePosition(startPosition);
-    }
-
-    // Parse - Parse the data
-    // recordsRequested - number of records requested
-    // labels - pointer to vector to return the labels 
-    // numbers - pointer to vector to return the numbers 
-    // seqPos - pointers to the other two arrays showing positions of each sequence
-    // returns - number of records actually read, if the end of file is reached the return value will be < requested records
-    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos)
-    {
-        assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount);
-        assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount);
-
-        // transfer to member variables
-        m_numbers = numbers;
-        m_labels = labels;
-
-        long TickStart = GetTickCount( );
-        long recordCount = 0;
-        long lineCount = 0;
-        size_t bufferIndex = m_byteCounter-m_bufferStart;
-        SequencePosition sequencePositionLast(0,0,seqFlagNull);
-        while (m_byteCounter < m_fileSize && recordCount < recordsRequested)
-        {
-            // check to see if we need to update the buffer
-            if (bufferIndex >= m_bufferSize)
-            {
-                UpdateBuffer();
-                bufferIndex = m_byteCounter-m_bufferStart;
-            }
-
-            char ch = m_fileBuffer[bufferIndex];
-
-            ParseState nextState = (ParseState)m_stateTable[(m_current_state<<8)+ch];
-
-            if( nextState <= Exponent )
-            {
-                m_builtUpNumber = m_builtUpNumber * 10 + (ch - '0');
-                // if we are in the decimal portion of a number increase the divider
-                if (nextState == Remainder)
-                    m_divider *= 10;
-            }
-
-            // only do a test on a state transition
-            if (m_current_state != nextState)
-            {
-                // System.Diagnostics.Debug.WriteLine("Current state = " + m_current_state + ", next state = " + nextState);
-
-                // if the nextState is a label, we don't want to do any number processing, it's a number prefixed string
-                if (nextState != Label)
-                {
-                    // do the numeric processing
-                    switch (m_current_state)
-                    {
-                    case TheLetterE:
-                        if (m_divider != 0) // decimal number
-                            m_partialResult += m_builtUpNumber / m_divider;
-                        else // integer
-                            m_partialResult = m_builtUpNumber;
-                        m_builtUpNumber = 0;
-                        break;
-                    case WholeNumber:
-                        // could be followed by a remainder, or an exponent
-                        if (nextState != TheLetterE)
-                            if( nextState != Period)
-                                DoneWithValue();
-                        if (nextState == Period)
-                        {
-                            m_partialResult = m_builtUpNumber;
-                            m_divider = 1;
-                            m_builtUpNumber = 0;
-                        }
-                        break;
-                    case Remainder:
-                        // can only be followed by a exponent
-                        if (nextState != TheLetterE)
-                            DoneWithValue();
-                        break;
-                    case Exponent:
-                        DoneWithValue();
-                        break;
-                    }
-                }
-
-                // label handling
-                switch (m_current_state)
-                {
-                case Label:
-                    DoneWithLabel();
-                    break;
-                case EndOfLine:
-                    if (seqPos)
-                    {
-                        SequencePosition sequencePos(numbers->size(), labels->size(), 
-                            m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak);
-                        // add a sequence element to the list
-                        seqPos->push_back(sequencePos);
-                        sequencePositionLast = sequencePos;
-                    }
-                
-                    // end of sequence determines record separation
-                    if (m_endSequence)
-                        recordCount = (long)labels->size();
-
-                    PrepareStartLine();
-                    break;
-                case Whitespace:
-                    // this is the start of the next space delimited entity
-                    if (nextState != EndOfLine)
-                        m_spaceDelimitedStart = m_byteCounter;
-                    break;
-                }
-
-                // label handling for next state
-                switch (nextState)
-                {
-                // do sign processing on nextState, since we still have the character handy
-                case Sign:
-                    if (ch == '-')
-                        m_wholeNumberMultiplier = -1;
-                    break;
-                case ExponentSign:
-                    if (ch == '-')
-                        m_exponentMultiplier = -1;
-                    break;
-                // going into whitespace or endOfLine, so end of space delimited entity
-                case Whitespace:
-                    m_spaceDelimitedMax = m_byteCounter;
-                    // hit whitespace and nobody processed anything, so add as label
-                    //if (m_elementsConvertedThisLine == elementsProcessed)
-                    //    DoneWithLabel();
-                    break;
-                case EndOfLine:
-                    if (m_current_state != Whitespace)
-                    {
-                        m_spaceDelimitedMax = m_byteCounter;
-                        // hit whitespace and nobody processed anything, so add as label
-                        //if (m_elementsConvertedThisLine == elementsProcessed)
-                        //    DoneWithLabel();
-                    }
-                    // process the label at the end of a line
-                    //if (m_labelMode == LabelLast && m_labels != NULL)
-                    //{
-                    //    StoreLastLabel();
-                    //}
-                    // intentional fall-through
-                case LineCountEOL:
-                    lineCount++;  // done with another record
-                    if (m_traceLevel > 1)
-                    {
-                        // print progress dots
-                        if (recordCount % 100 == 0)
-                        {
-                            if (recordCount % 1000 == 0)
-                            {
-                                if (recordCount % 10000 == 0)
-                                {
-                                    fprintf(stderr, "#");
-                                }
-                                else
-                                {
-                                    fprintf(stderr, "+");
-                                }
-                            }
-                            else
-                            {
-                                fprintf(stderr, ".");
-                            }
-                        }
-                    }
-                    break;
-                case LineCountOther:
-                    m_spaceDelimitedStart = m_byteCounter;
-                    break;
-                }
-            }
-
-            m_current_state = nextState;
-
-            // move to next character
-            m_byteCounter++;
-            bufferIndex++;
-        } // while
-
-        // at the end of the file we may need to add an additional sequencePosition push
-        // this could probably be fixed by taking another pass through the loop above, but this is easier
-        if (seqPos)
-        {
-            SequencePosition sequencePos(numbers->size(), labels->size(), 
-                m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak);
-            // add the final sequence element if needed
-            if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos))
-            {
-                seqPos->push_back(sequencePos);
-            }
-        }
-
-        long TickStop = GetTickCount( );
-
-        long TickDelta = TickStop - TickStart;
-
-        if (m_traceLevel > 2)
-            fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
-        return lineCount;
-    }
-
-
-    int64_t GetFilePosition();
-    void SetFilePosition(int64_t position);
-
-    // HasMoreData - test if the current dataset have more data
-    // returns - true if it does, false if not
-    bool HasMoreData();
-};
-
-// StoreLabel - string version gets last space delimited string and stores in labels vector
-template <>
-void SequenceParser<float, std::string>::StoreLabel(float finalResult);
-
-// DoneWithLabel - string version stores string label
-template <>
-void SequenceParser<float, std::string>::DoneWithLabel();
-
-// StoreLastLabel - string version
-template <>
-void SequenceParser<float, std::string>::StoreLastLabel();
-
-// NOTE: Current code is identical to float, don't know how to specialize with template parameter that only covers one parameter
-
-// StoreLabel - string version gets last space delimited string and stores in labels vector
-template <>
-void SequenceParser<double, std::string>::StoreLabel(double finalResult);
-
-// DoneWithLabel - string version stores string label
-template <>
-void SequenceParser<double, std::string>::DoneWithLabel();
-
-// StoreLastLabel - string version
-template <>
-void SequenceParser<double, std::string>::StoreLastLabel();
-
-/// language model sequence parser
-template <typename NumType, typename LabelType>
-class LMSequenceParser : public SequenceParser<NumType, LabelType>
-{
-protected:
-    FILE * mFile; 
-    std::wstring mFileName; 
-
-public:
-    LMSequenceParser() { 
-        mFile = nullptr; 
-    };
-    ~LMSequenceParser() { 
-        if (mFile) fclose(mFile); 
-    }
-
-    void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="<s>", std::string endSequenceIn="</s>", std::string beginSequenceOut="O", std::string endSequenceOut="O")
-    {
-        assert(fileName != NULL);
-        mFileName = fileName;
-        m_dimFeatures = dimFeatures;
-        m_dimLabelsIn = dimLabelsIn;
-        m_beginSequenceIn = beginSequenceIn;
-        m_endSequenceIn = endSequenceIn;
-        m_dimLabelsOut = dimLabelsOut;
-        m_beginSequenceOut = beginSequenceOut;
-        m_endSequenceOut = endSequenceOut;
-
-        m_parseMode = ParseNormal;
-        m_traceLevel = 0;
-        m_bufferSize = 0;
-        m_bufferStart = 0;
-
-        m_beginTag = m_beginSequenceIn;
-        m_endTag = m_endSequenceIn;
-
-        m_fileSize = -1;
-        m_fileBuffer = NULL;
-
-        if (mFile) fclose(mFile);
-
-        if (_wfopen_s(&mFile, fileName, L"rt") != 0)
-            RuntimeError("cannot open file %s", fileName);
-    }
-
-    void ParseReset()
-    {
-        if (mFile) fseek(mFile, 0, SEEK_SET);
-    }
-
-    // Parse - Parse the data
-    // recordsRequested - number of records requested
-    // labels - pointer to vector to return the labels 
-    // numbers - pointer to vector to return the numbers 
-    // seqPos - pointers to the other two arrays showing positions of each sequence
-    // returns - number of records actually read, if the end of file is reached the return value will be < requested records
-    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos)
-    {
-        assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount);
-        assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount);
-
-        // transfer to member variables
-        m_numbers = numbers;
-        m_labels = labels;
-
-        long TickStart = GetTickCount( );
-        long recordCount = 0;
-        long orgRecordCount = (long)labels->size();
-        long lineCount = 0;
-        SequencePosition sequencePositionLast(0,0,seqFlagNull);
-        /// get line
-        char ch2[MAXSTRING]; 
-        while (recordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr)
-        {
-            
-            string ch = ch2; 
-            std::vector<string> vstr; 
-            vstr = sep_string(ch, " ");
-            if (vstr.size() < 3) 
-                continue;
-
-            for (size_t i = 0; i < vstr.size(); i++)
-            {
-                labels->push_back(vstr[i]);
-            }
-            SequencePosition sequencePos(numbers->size(), labels->size(), 
-                m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak);
-            // add a sequence element to the list
-            seqPos->push_back(sequencePos);
-            sequencePositionLast = sequencePos;
-
-            recordCount = (long)labels->size() - orgRecordCount;
-
-            lineCount ++;
-        } // while
-
-        long TickStop = GetTickCount( );
-
-        long TickDelta = TickStop - TickStart;
-
-        if (m_traceLevel > 2)
-            fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
-        return lineCount;
-    }
-
-};
-
-typedef struct{
-    size_t sLen;
-    size_t sBegin;
-    size_t sEnd;
-} stSentenceInfo; 
-/// language model sequence parser
-template <typename NumType, typename LabelType>
-class LMBatchSequenceParser: public LMSequenceParser<NumType, LabelType>
-{
-public:
-    vector<stSentenceInfo> mSentenceIndex2SentenceInfo;
-
-public:
-    LMBatchSequenceParser() { };
-    ~LMBatchSequenceParser() { }
-
-    void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="<s>", std::string endSequenceIn="</s>", std::string beginSequenceOut="O", std::string endSequenceOut="O");
-
-    // Parse - Parse the data
-    // recordsRequested - number of records requested
-    // labels - pointer to vector to return the labels 
-    // numbers - pointer to vector to return the numbers 
-    // seqPos - pointers to the other two arrays showing positions of each sequence
-    // returns - number of records actually read, if the end of file is reached the return value will be < requested records
-    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos);
-
-};
->>>>>>> origin/master
--- a/DataReader/SequenceReader/SequenceReader.cpp
+++ b/DataReader/SequenceReader/SequenceReader.cpp
--- a/Demos/Simple/Simple.config
+++ b/Demos/Simple/Simple.config
@ -3,7 +3,7 @@ command=Simple_Demo:Simple_Demo_Output

 # deviceId=-1 for CPU, >=0 for GPU devices
 DeviceNumber=0
-stderr=Demo
+#stderr=Demo

 precision=float

--- a/MachineLearning/cn/SGD.h
+++ b/MachineLearning/cn/SGD.h
--- a/MachineLearning/cn/cn.cpp
+++ b/MachineLearning/cn/cn.cpp
@ -1,4 +1,3 @@
-<<<<<<< HEAD
 //
 // <copyright file="cn.cpp" company="Microsoft">
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
@ -792,777 +791,3 @@ int main(int argc, char* argv[])
    return ret;
 }
 #endif
-
-=======
-//
-// <copyright file="cn.cpp" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// cn.cpp : Defines the entry point for the console application.
-//
-
-#define _CRT_NONSTDC_NO_DEPRECATE   // make VS accept POSIX functions without _
-
-#include "stdafx.h"
-#include "ComputationNetwork.h"
-#include "ComputationNode.h"
-#include "DataReader.h"
-#include "DataWriter.h"
-#include "SimpleNetworkBuilder.h"
-#include "NDLNetworkBuilder.h"
-#include "SynchronousExecutionEngine.h"
-#include "ModelEditLanguage.h"
-#include "SGD.h"
-#include <string>
-#include "commandArgUtil.h"
-#include "SimpleEvaluator.h"
-#include "SimpleOutputWriter.h"
-#include <chrono>
-#include <algorithm>
-#if defined(_WIN32)
-#include "io.h"
-#endif
-#include "hostname.h"
-#include "buildinfo.h"
-#ifdef LEAKDETECT
-#include "vld.h" // for memory leak detection
-#endif
-#include <vector>
-#include "BestGpu.h"
-
-// MPI builds on windows require the following installed to "c:\program files\Microsoft MPI\"
-// HPC Pack 2012 R2 MS-MPI Redistributable Package
-// http://www.microsoft.com/en-us/download/details.aspx?id=41634
-
-#ifdef MPI_SUPPORT
-#include "mpi.h"
-#pragma comment(lib, "msmpi.lib")
-#endif
-int numProcs;
-int myRank;
-
-using namespace std;
-using namespace Microsoft::MSR::CNTK;
-
-// internal test routine forward declaration
-template <typename ElemType>
-void TestCn(const ConfigParameters& config);
-
-void RedirectStdErr(wstring logpath)
-{
-    fprintf (stderr, "Redirecting stderr to file %S\n", logpath.c_str());
-    msra::files::make_intermediate_dirs (logpath);
-    auto_file_ptr f (logpath.c_str(), "wb");
-    if (dup2 (fileno (f), 2) == -1)
-        RuntimeError ("unexpected failure to redirect stderr to log file");
-    setvbuf (stderr, NULL, _IONBF, 16384);   // unbuffer it
-}
-
-std::string WCharToString(const wchar_t* wst)
-{
-    std::wstring ws(wst);
-    std::string s(ws.begin(), ws.end());
-    s.assign(ws.begin(), ws.end());
-    return s;
-}
-
-template <typename ElemType>
-void DumpNodeInfo(const ConfigParameters& config)
-{
-    wstring modelPath = config("modelPath");
-    wstring nodeName = config("nodeName",L"__AllNodes__");
-    wstring defOutFilePath = modelPath + L"." + nodeName + L".txt";
-    wstring outputFile = config("outputFile",  WCharToString(defOutFilePath.c_str()).c_str());
-    bool printValues = config("printValues", "true");
-
-    ComputationNetwork<ElemType> net(-1);  //always use CPU
-    net.LoadFromFile(modelPath);
-    net.DumpNodeInfoToFile(nodeName, printValues, outputFile);
-}
-
-template <typename ElemType>
-void DoEvalBase(const ConfigParameters& config, IDataReader<ElemType>& reader)
-{
-    DEVICEID_TYPE deviceId = DeviceFromConfig(config);
-    ConfigArray minibatchSize = config("minibatchSize", "40960");
-    size_t epochSize = config("epochSize", "0");
-    if (epochSize == 0)
-    {
-        epochSize = requestDataSize;
-    }
-    wstring modelPath = config("modelPath");
-    intargvector mbSize = minibatchSize;
-
-    int traceLevel = config("traceLevel", "0");    
-    size_t numMBsToShowResult = config("numMBsToShowResult", "100");
-
-    ConfigArray evalNodeNames = config("evalNodeNames","");
-    vector<wstring> evalNodeNamesVector;
-    for (int i=0; i < evalNodeNames.size(); ++i)
-    {
-        evalNodeNamesVector.push_back(evalNodeNames[i]);
-    }
-
-    ComputationNetwork<ElemType> net(deviceId);
-    net.LoadFromFile(modelPath);
-    net.ResetEvalTimeStamp();
-
-    SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel);
-    eval.Evaluate(reader, evalNodeNamesVector, mbSize[0], epochSize);
-}
-
-template <typename ElemType>
-void DoEval(const ConfigParameters& config)
-{
-    //test
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-
-    DataReader<ElemType> testDataReader(readerConfig);
-
-    DoEvalBase(config, testDataReader);
-}
-
-template <typename ElemType>
-void DoEvalUnroll(const ConfigParameters& config)
-{
-    //test
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-
-    DataReader<ElemType> testDataReader(readerConfig);
-
-    DEVICEID_TYPE deviceId = DeviceFromConfig(config);
-    ConfigArray minibatchSize = config("minibatchSize", "40960");
-    size_t epochSize = config("epochSize", "0");
-    if (epochSize == 0)
-    {
-        epochSize = requestDataSize;
-    }
-    wstring modelPath = config("modelPath");
-    intargvector mbSize = minibatchSize;
-    wstring path2EvalResults = config("path2EvalResults", L"");
-
-    ComputationNetwork<ElemType> net(deviceId);
-    net.LoadFromFile(modelPath);
-    net.ResetEvalTimeStamp();
-
-    SimpleEvaluator<ElemType> eval(net);
-    ElemType evalEntropy; 
-    eval.EvaluateUnroll(testDataReader, mbSize[0], evalEntropy,  path2EvalResults == L""? nullptr : path2EvalResults.c_str(), epochSize);
-}
-
-template <typename ElemType>
-void DoCrossValidate(const ConfigParameters& config)
-{
-    //test
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-
-    DEVICEID_TYPE deviceId = DeviceFromConfig(config);
-    ConfigArray minibatchSize = config("minibatchSize", "40960");
-    size_t epochSize = config("epochSize", "0");
-    if (epochSize == 0)
-    {
-        epochSize = requestDataSize;
-    }
-    wstring modelPath = config("modelPath");
-    intargvector mbSize = minibatchSize;
-
-    ConfigArray cvIntervalConfig = config("crossValidationInterval");
-    intargvector cvInterval = cvIntervalConfig;
-
-    size_t sleepSecondsBetweenRuns = config("sleepTimeBetweenRuns", "0"); 
-
-    int traceLevel = config("traceLevel", "0");    
-    size_t numMBsToShowResult = config("numMBsToShowResult", "100");
-
-    ConfigArray evalNodeNames = config("evalNodeNames","");
-    vector<wstring> evalNodeNamesVector;
-    for (int i=0; i < evalNodeNames.size(); ++i)
-    {
-        evalNodeNamesVector.push_back(evalNodeNames[i]);
-    }
-
-    std::vector<std::vector<ElemType>> cvErrorResults;
-    std::vector<std::wstring> cvModels;
-
-    DataReader<ElemType> cvDataReader(readerConfig);
-
-    bool finalModelEvaluated = false;
-    for (size_t i=cvInterval[0]; i<=cvInterval[2]; i+=cvInterval[1])
-    {
-        wstring cvModelPath = msra::strfun::wstrprintf (L"%ls.%lld", modelPath.c_str(), i);
-
-        if (!fexists (cvModelPath)) 
-        {
-            fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str());
-            if (finalModelEvaluated || !fexists (modelPath))
-                continue; // file missing
-            else 
-            {
-                cvModelPath = modelPath;
-                finalModelEvaluated = true;
-            }
-        }
-
-        cvModels.push_back(cvModelPath);
-        ComputationNetwork<ElemType> net(deviceId);
-        net.LoadFromFile(cvModelPath);
-        net.ResetEvalTimeStamp();
-
-        SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel);
-
-        fprintf(stderr, "model %ls --> \n",cvModelPath.c_str());
-        std::vector<ElemType> evalErrors;
-        evalErrors = eval.Evaluate(cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
-        cvErrorResults.push_back(evalErrors);
-
-        ::Sleep(1000*sleepSecondsBetweenRuns);
-    }
-
-    //find best model
-    if (cvErrorResults.size() == 0)
-        throw std::logic_error("No model is evaluated.");
-
-    std::vector<ElemType> minErrors;
-    std::vector<int> minErrIds;
-    std::vector<ElemType> evalErrors = cvErrorResults[0];
-    for (int i=0; i < evalErrors.size(); ++i)
-    {
-        minErrors.push_back(evalErrors[i]);
-        minErrIds.push_back(0);
-    }
-
-    for (int i=0; i<cvErrorResults.size(); i++)
-    {
-        evalErrors = cvErrorResults[i];
-        for (int j=0; j<evalErrors.size(); j++)
-        {
-            if (evalErrors[j] < minErrors[j])
-            {
-                minErrors[j] = evalErrors[j];
-                minErrIds[j] = i;
-            }        
-        }
-    }
-
-    fprintf(stderr, "Best models:\n");
-    fprintf(stderr,"------------\n");
-    for (int i=0; i < minErrors.size(); ++i)
-    {
-        fprintf(stderr,"Based on Err[%d]: Best model = %ls with min err %.8g\n", i, cvModels[minErrIds[i]].c_str(), minErrors[i]);
-    }
-}
-
-template <typename ElemType>
-void DoWriteOutput(const ConfigParameters& config)
-{
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-    readerConfig.Insert("randomize","None");  //we don't want randomization when output results
-
-    DataReader<ElemType> testDataReader(readerConfig);
-
-    DEVICEID_TYPE deviceId = DeviceFromConfig(config);
-    ConfigArray minibatchSize = config("minibatchSize", "2048");
-    wstring modelPath = config("modelPath");
-    intargvector mbSize = minibatchSize;
-    
-    size_t epochSize = config("epochSize", "0");
-    if (epochSize == 0)
-    {
-        epochSize = requestDataSize;
-    }
-
-    ConfigArray outputNodeNames = config("outputNodeNames","");
-    vector<wstring> outputNodeNamesVector;
-    for (int i=0; i < outputNodeNames.size(); ++i)
-    {
-        outputNodeNamesVector.push_back(outputNodeNames[i]);
-    }
-
-    ComputationNetwork<ElemType> net(deviceId);
-    net.LoadFromFile(modelPath);
-    net.ResetEvalTimeStamp();
-
-    SimpleOutputWriter<ElemType> writer(net, 1);
-
-    if (config.Exists("writer"))
-    {
-        ConfigParameters writerConfig (config("writer"));
-        bool bWriterUnittest = writerConfig("unittest","false");
-        DataWriter<ElemType> testDataWriter(writerConfig);
-        writer.WriteOutput(testDataReader,mbSize[0], testDataWriter, outputNodeNamesVector, epochSize, bWriterUnittest);
-    }
-    else if (config.Exists("outputPath"))
-    {
-        wstring outputPath = config("outputPath"); // crashes if no default given? 
-        writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, epochSize);
-    }
-    //writer.WriteOutput(testDataReader, mbSize[0], testDataWriter, outputNodeNamesVector, epochSize);
-}
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-TrainingCriterion ParseTrainingCriterionString(wstring s)
-{
-    msra::strfun::tolower_ascii(s);
-    if (s==L"crossentropywithsoftmax")
-        return TrainingCriterion::CrossEntropyWithSoftmax;
-    else if (s==L"squareerror")
-        return TrainingCriterion::SquareError;
-    else if (s!=L"classcrossentropywithsoftmax")    // (twisted logic to keep compiler happy w.r.t. not returning from LogicError)
-        LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (CrossEntropyWithSoftmax | SquareError | ClassCrossEntropyWithSoftmax)");
-    return TrainingCriterion::ClassCrossEntropyWithSoftmax;
-}
-
-EvalCriterion ParseEvalCriterionString(wstring s)
-{
-    msra::strfun::tolower_ascii(s);
-    if (s==L"errorprediction")
-        return EvalCriterion::ErrorPrediction;
-    else if (s==L"crossentropywithsoftmax")
-        return EvalCriterion::CrossEntropyWithSoftmax;
-    else if (s==L"classcrossentropywithsoftmax")
-        return EvalCriterion::ClassCrossEntropyWithSoftmax;
-    else if (s!=L"squareerror")
-        LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (ErrorPrediction | CrossEntropyWithSoftmax | SquareError)");
-    return EvalCriterion::SquareError;
-}
-
-}}};
-
-template <typename ElemType>
-void DoCreateLabelMap(const ConfigParameters& config)
-{
-    // this gets the section name we are interested in
-    std::string section = config("section");
-    // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution)
-    ConfigParameters configSection (config(section));
-    ConfigParameters readerConfig (configSection("reader"));
-    readerConfig.Insert("allowMapCreation","true");
-    DEVICEID_TYPE deviceId = CPUDEVICE;
-    size_t minibatchSize = config("minibatchSize", "2048");
-    int traceLevel = config("traceLevel","0");
-    std::vector<std::wstring> featureNames;
-    std::vector<std::wstring> labelNames;
-    GetFileConfigNames(readerConfig, featureNames, labelNames);
-
-    // setup minibatch matrices
-    Matrix<ElemType> featuresMatrix(deviceId);
-    Matrix<ElemType> labelsMatrix(deviceId);
-    std::map<std::wstring, Matrix<ElemType>*> matrices;
-    matrices[featureNames[0]] = &featuresMatrix;
-    if (labelNames.size() == 0)
-        RuntimeError("CreateLabelMap: no labels found to process");
-
-    // now create the reader and loop through the entire dataset to get all the labels
-    auto start = std::chrono::system_clock::now();
-    for (const std::wstring& labelsName: labelNames)
-    {
-        // take the last label file defined (the other one might be input)
-        matrices[labelsName] = &labelsMatrix;
-
-        // get the label mapping file name
-        ConfigParameters labelConfig (readerConfig(labelsName));
-        std::string labelMappingFile;
-        if (labelConfig.ExistsCurrent("labelMappingFile"))
-            labelMappingFile = labelConfig("labelMappingFile");
-        else if (readerConfig.ExistsCurrent("labelMappingFile")) 
-            labelMappingFile = labelConfig("labelMappingFile");
-        else
-            RuntimeError("CreateLabelMap: No labelMappingFile defined");
-
-        if (fexists(labelMappingFile))
-        {
-            fprintf(stderr,"CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str());
-            return;
-        }
-        fprintf(stderr,"CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str());
-
-        DataReader<ElemType> dataReader(readerConfig);
-
-        dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize);
-        int count = 0;
-        while (dataReader.GetMinibatch(matrices))
-        {
-            Matrix<ElemType>& features = *matrices[featureNames[0]];
-            count += features.GetNumCols();
-            if (traceLevel > 1)
-                fprintf(stderr,"."); // progress meter
-        }
-        dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize);
-
-        // print the results
-        if (traceLevel > 0)
-            fprintf(stderr,"\nread %d labels and produced %s\n", count, labelMappingFile.c_str());
-    }
-    auto end = std::chrono::system_clock::now();
-    auto elapsed = end-start;
-    if (traceLevel > 1)
-        fprintf(stderr, "%f seconds elapsed\n", (float)(std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count())/1000);
-}
-
-
-template <typename ElemType>
-void DoTrain(const ConfigParameters& config)
-{
-    ConfigParameters configSGD (config("SGD"));
-    bool makeMode = config("makeMode", "true");
-
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-
-    IComputationNetBuilder<ElemType>* netBuilder = NULL;
-
-    if (config.Exists("NDLNetworkBuilder"))
-    {
-        ConfigParameters configNDL (config("NDLNetworkBuilder"));
-        netBuilder = (IComputationNetBuilder<ElemType>*)new NDLBuilder<ElemType>(configNDL);
-    }
-    else if (config.Exists("SimpleNetworkBuilder"))
-    {
-        ConfigParameters configSNB (config("SimpleNetworkBuilder"));
-        netBuilder = (IComputationNetBuilder<ElemType>*)new SimpleNetworkBuilder<ElemType>(configSNB);
-    }
-    else
-    {
-        RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified" );
-    }
-
-    DataReader<ElemType>* dataReader = new DataReader<ElemType>(readerConfig);
-
-    DataReader<ElemType>* cvDataReader = nullptr;
-    ConfigParameters cvReaderConfig (config("cvReader", L""));
-    
-    if (cvReaderConfig.size() != 0)
-    {
-        cvReaderConfig.Insert("traceLevel",config("traceLevel","0"));
-        cvDataReader = new DataReader<ElemType>(cvReaderConfig);
-    }
-
-    SGD<ElemType> sgd(configSGD);
-
-    sgd.Train(netBuilder, dataReader, cvDataReader, makeMode);
-
-    delete netBuilder;
-    delete dataReader;
-    delete cvDataReader;
-}
-
-template <typename ElemType>
-void DoAdapt(const ConfigParameters& config)
-{
-    DEVICEID_TYPE deviceId = DeviceFromConfig(config);
-
-    ConfigParameters configSGD (config("SGD"));
-    bool makeMode = config("makeMode", "true");
-
-    ConfigParameters readerConfig (config("reader"));
-    readerConfig.Insert("traceLevel",config("traceLevel","0"));
-
-    DataReader<ElemType>* dataReader = new DataReader<ElemType>(readerConfig);
-
-    DataReader<ElemType>* cvDataReader = nullptr;
-    ConfigParameters cvReaderConfig (config("cvReader", L""));
-    
-    if (cvReaderConfig.size() != 0)
-    {
-        cvReaderConfig.Insert("traceLevel",config("traceLevel","0"));
-        cvDataReader = new DataReader<ElemType>(cvReaderConfig);
-    }
-
-    wstring origModelFileName = config("origModelFileName", L"");
-    wstring refNodeName = config("refNodeName", L"");
-
-    SGD<ElemType> sgd(configSGD);
-
-    sgd.Adapt(origModelFileName, refNodeName, dataReader, cvDataReader, deviceId, makeMode);
-
-    delete dataReader;
-    delete cvDataReader;
-}
-
-template <typename ElemType>
-void DoEdit(const ConfigParameters& config)
-{
-    wstring editPath = config("editPath");    
-    wstring ndlMacros = config("ndlMacros","");
-    NDLScript<ElemType> ndlScript;
-    if (!ndlMacros.empty())
-        ndlScript.LoadConfigFile(ndlMacros);
-    MELScript<ElemType> melScript;
-    melScript.LoadConfigFileAndResolveVariables(editPath, config);
-}
-
-template <typename ElemType>
-void DoConvertFromDbn(const ConfigParameters& config)
-{
-    //config.Insert("deviceId","-1"); //force using CPU
-
-    wstring modelPath = config("modelPath");
-    wstring dbnModelPath = config("dbnModelPath");
-
-    IComputationNetBuilder<ElemType>* netBuilder = (IComputationNetBuilder<ElemType>*)new SimpleNetworkBuilder<ElemType>(config);        
-    ComputationNetwork<ElemType>& net = netBuilder->LoadNetworkFromFile(dbnModelPath);
-    net.SaveToFile(modelPath);
-    delete (netBuilder);
-}
-// process the command
-template <typename ElemType>
-void DoCommand(const ConfigParameters& config)
-{
-    ConfigArray command = config("command", "train");
-    for (int i=0; i < command.size(); i++)
-    {
-        //get the configuration parameters that match the command
-        ConfigParameters commandParams (config(command[i]));
-        ConfigArray action = commandParams("action","train");
-
-        // determine the action to perform, and do it
-        for (int j=0; j < action.size(); j++)
-        {
-            if (action[j] == "train" || action[j] == "trainRNN")
-                DoTrain<ElemType>(commandParams);
-            else if (action[j] == "adapt")
-                DoAdapt<ElemType>(commandParams);
-            else if (action[j] == "test" || action[j] == "eval")
-                DoEval<ElemType>(commandParams);
-            else if (action[j] == "testunroll")
-                DoEvalUnroll<ElemType>(commandParams);
-            else if (action[j] == "edit")
-                DoEdit<ElemType>(commandParams);
-            else if (action[j] == "cv")
-                DoCrossValidate<ElemType>(commandParams);
-            else if (action[j] == "write")
-                DoWriteOutput<ElemType>(commandParams);
-            else if (action[j] == "devtest")
-                TestCn<ElemType>(config); // for "devtest" action pass the root config instead
-            else if (action[j] == "dumpnode")
-                DumpNodeInfo<ElemType>(commandParams);
-            else if (action[j] == "convertdbn")
-                DoConvertFromDbn<ElemType>(commandParams);
-            else if (action[j] == "createLabelMap")
-                DoCreateLabelMap<ElemType>(commandParams);
-            else
-                RuntimeError("unknown action: %s  in command set: %s", action[j].c_str(), command[i].c_str());
-                
-            NDLScript<ElemType> ndlScript;
-            ndlScript.ClearGlobal(); // clear global macros between commands
-        }
-    }
-}
-
-std::string TimeDateStamp()
-{
-#if 0   // "safe" version for Windows, not needed it seems
-    __time64_t localtime;
-
-    _time64 (&localtime);// get current time and date
-    struct tm now;
-    _localtime64_s (&now, &localtime);  // convert
-#else
-    time_t t = time(NULL);
-    struct tm now = *localtime(&t);
-#endif
-    char buf[30];
-    sprintf (buf, "%04d/%02d/%02d %02d:%02d:%02d", now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
-    return buf;
-}
-
-#ifdef MPI_SUPPORT
-// Oh, my gosh, this is going to be ugly. MPI_INIT needs a char* argv[], so let's interface.
-int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv)
-{
-    // this maps from the strings 
-    std::map<std::string, wchar_t*> recover_wstring;
-
-    // do the mapping to 8-bit encoding for MPI_Init()
-    vector<vector<char>> argv_string_vector;
-    transform(*argv, *argv + *argc, std::back_inserter(argv_string_vector),
-        [&recover_wstring](wchar_t*pws)->vector<char>
-        { 
-            std::string tmp = msra::strfun::utf8(std::wstring(pws));
-            recover_wstring[tmp] = pws;
-            vector<char> rv(tmp.begin(), tmp.end());
-            rv.push_back('\0');
-            return rv;
-        }
-        );
-    vector<char*> argv_charptr_vector;
-    transform(argv_string_vector.begin(), argv_string_vector.end(), std::back_inserter(argv_charptr_vector),
-        [](std::vector<char>&cs)->char*{ return &(cs[0]); }
-        );
-    char** argv_char = &(argv_charptr_vector[0]);
-
-    // Do the initialization
-    int rv = MPI_Init(argc, &argv_char);
-
-    // try and reconstruct how MPI_Init changed the argv
-    transform(argv_char, argv_char + *argc, stdext::checked_array_iterator<wchar_t**>(*argv, *argc),
-        [&recover_wstring](char*pc)->wchar_t*
-        {
-            auto it = recover_wstring.find(std::string(pc));
-            if (it == recover_wstring.end())
-                RuntimeError("Unexpected interaction between MPI_Init and command line parameters");
-            return it->second;
-        }
-        );
-
-    // pass through return value from internal call to MPI_Init()
-    return rv;
-}
-#endif
-
-void PrintBuiltInfo()
-{
-	fprintf(stderr, "-------------------------------------------------------------------\n");
-	fprintf(stderr, "Build info: \n\n");
-	fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
-	fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
-	fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
-	fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
-#ifdef _GIT_EXIST
-	fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
-	fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
-#endif
-	fprintf(stderr, "-------------------------------------------------------------------\n");
-
-}
-
-
-int wmain(int argc, wchar_t* argv[])
-{
-
-    try
-    {
-#ifdef MPI_SUPPORT
-        {
-            int rc;
-            rc = MPI_Init(&argc, &argv);
-            if (rc != MPI_SUCCESS)
-            {
-                MPI_Abort(MPI_COMM_WORLD, rc);
-                RuntimeError("Failure in MPI_Init: %d", rc);
-            }
-            MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
-            MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
-            fprintf(stderr, "MPI: RUNNING ON (%s), process %d/%d\n", getenv("COMPUTERNAME"), myRank, numProcs);
-            fflush(stderr);
-        }
-#else
-        numProcs = 1;
-        myRank = 0;
-#endif
-
-        ConfigParameters config;
-        std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config);
-
-        // get the command param set they want
-        wstring logpath = config("stderr", L"");
-		//  [1/26/2015 erw, add done file so that it can be used on HPC]
-		wstring DoneFile = config("DoneFile", L"");
-        ConfigArray command = config("command", "train");
-
-        if (logpath != L"")
-        {
-            for (int i=0; i < command.size(); i++)
-            {
-                logpath += L"_";
-                logpath += (wstring)command[i];
-            }
-            logpath += L".log";
-            if (numProcs > 1)
-            {
-                std::wostringstream oss;
-                oss << myRank;
-                logpath += L"rank" + oss.str();
-            }
-
-			RedirectStdErr(logpath);
-        }
-
-
-		PrintBuiltInfo();
-
-
-        std::string timestamp = TimeDateStamp();
-
-        if (myRank == 0) // main process
-        {
-            //dump config info
-            fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
-            fprintf(stderr, "command line options: \n");
-            for (int i = 1; i < argc; i++)
-                fprintf(stderr, "%s ", WCharToString(argv[i]).c_str());
-
-            // This simply merges all the different config parameters specified (eg, via config files or via command line directly),
-            // and prints it.
-            fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
-            fprintf(stderr, "%s\n", rawConfigString.c_str());
-            fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<\n");
-
-            // Same as above, but all variables are resolved.  If a parameter is set multiple times (eg, set in config, overriden at command line),
-            // All of these assignments will appear, even though only the last assignment matters.
-            fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
-            fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
-            fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
-
-            // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
-            // value it is set to will appear).
-            fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
-            config.dumpWithResolvedVariables();
-            fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
-
-            fprintf(stderr, "command: ");
-            for (int i = 0; i < command.size(); i++)
-            {
-                fprintf(stderr, "%s ", command[i].c_str());
-            }
-        }
-
-        //run commands
-        std::string type = config("precision", "float");
-        // accept old precision key for backward compatibility
-        if (config.Exists("type"))
-            type = config("type", "float");
-        if ( myRank == 0 )
-            fprintf(stderr, "\nprecision = %s\n", type.c_str());
-        if (type == "float")
-            DoCommand<float>(config);
-        else if (type == "double")
-            DoCommand<double>(config);
-        else
-            RuntimeError("invalid precision specified: %s", type.c_str());
-
-		// still here , write a DoneFile if necessary 
-		if (!DoneFile.empty()){
-			FILE* fp = fopenOrDie(DoneFile.c_str(), L"w");
-			fprintf(fp, "successfully finished at %s on %s\n",  TimeDateStamp().c_str(),GetHostName().c_str());
-			fcloseOrDie(fp);
-		}
-	}
-	catch (const std::exception &err)
-	{
-        fprintf(stderr, "EXCEPTION occurred: %s", err.what());
-#ifdef _DEBUG
-        DebugBreak();
-#endif
-        return EXIT_FAILURE;
-    }
-    catch(...)
-    {
-        fprintf(stderr, "Unknown ERROR occurred");
-#ifdef _DEBUG
-        DebugBreak();
-#endif
-        return EXIT_FAILURE;
-    }    
-#ifdef MPI_SUPPORT
-    MPI_Finalize();
-#endif
-    return EXIT_SUCCESS;
-}
->>>>>>> origin/master
--- a/Math/Math/CPUSparseMatrix.cpp
+++ b/Math/Math/CPUSparseMatrix.cpp
@ -1,4 +1,3 @@
-<<<<<<< HEAD
 //
 // <copyright file="CPUSparseMatrix.cpp" company="Microsoft">
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
@ -961,967 +960,3 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template class CPUSparseMatrix<double>;

 }}}
-=======
-//
-// <copyright file="CPUSparseMatrix.cpp" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// Math.cpp : Defines the exported functions for the DLL application.
-//
-
-#include "stdafx.h"
-#include <assert.h>
-#include <stdexcept>
-#include <omp.h>
-#include <math.h>
-#include "CPUMatrix.h"
-#include "CPUSparseMatrix.h"
-#include <random>
-#include <chrono>
-#ifdef    _WIN32
-#include <Windows.h>
-#endif
-#ifdef LEAKDETECT
-#include <vld.h>
-#endif
-
-#include "basetypes.h"
-#include "fileutil.h"
-
-
-#ifndef USE_MKL
-// use ACML as default. 
-// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above 
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64 variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h>  // requires ACML 5.3.0 and above
-#else
-// requires MKL 10.0 and above
-#endif
-
-// This is an example of an exported variable
-//MATH_API int nMath=0;
-
-// This is an example of an exported function.
-//MATH_API int fnMath(void)
-//{
-//    return 42;
-//}
-
-#ifndef USE_MKL  //MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR 
-#else
-#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor, 
-#endif
-
-#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);}
-#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-#pragma region Helpful Enum Definitions
-    enum class MatrixOrder
-    {
-        RowMajor = 101,  // row-major arrays 
-        ColMajor = 102  // column-major arrays 
-    };
-
-    enum class MatrixTranspose : char
-    {
-        NoTrans = 'N', // trans='N'
-        Trans = 'T', // trans='T' 
-        ConjTrans = 'C' // trans='C'
-    };
-
-    enum class SymMatrixType : char
-    {
-        Up = 'U', // symmetric matrix is stored in the upper part
-        Low = 'L', // symmetric matrix is stored in thelower part
-        Full = 'F', //full populated
-        NotSymmetric = 'N' //not a symmetric matrix
-    };
-
-    enum class MatrixOpSide : char
-    {
-        Left = 'L', // left multiply
-        Right = 'R', // right multiply
-    };
-#pragma endregion Helpful Enum Definitions
-
-#pragma region Constructors and Destructor
-
-    //should only be used by constructors.
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ZeroInit()
-    {   
-        m_numRows = 0;
-        m_numCols = 0;
-        m_elemSizeAllocated = 0;
-        m_compIndexSize = 0;
-        m_externalBuffer = false;
-        m_computeDevice = CPUDEVICE;
-        m_nz = 0;
-        m_matrixName = NULL;   
-
-        //if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) 
-        {
-            m_colIdx = -1;
-            m_pArray = NULL;
-            m_unCompIndex = NULL;
-            m_compIndex = NULL;
-        } 
-        //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            m_blockSize = 0;      
-            m_pArray = NULL;
-            m_blockIds = NULL;
-        }
-    }
-
-    //should only be used by constructors.
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
-    {
-        if (format != MatrixFormat::matrixFormatSparseCSC && format != MatrixFormat::matrixFormatSparseCSR && format != MatrixFormat::matrixFormatSparseBlockCol && format != MatrixFormat::matrixFormatSparseBlockRow)
-        {
-            throw std::logic_error("CPUSparseMatrix:  unsupported sparse matrix format");
-        }
-        m_format = format;
-        m_default = defaultElem();
-        ZeroInit();
-    }
-
-    template<class ElemType>
-    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
-    {
-        CheckInit(format);
-    }
-
-    template<class ElemType>
-    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
-    {
-        CheckInit(format);
-        Resize(numRows, numCols, size);
-    }
-
-    template<class ElemType>
-    CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
-    {       
-        if (m_matrixName!=NULL) 
-        {
-            delete[] m_matrixName;
-            m_matrixName = nullptr;
-        }
-        if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) 
-        {
-            if(m_pArray != NULL) 
-                delete[] m_pArray;
-            if(m_unCompIndex != NULL) 
-                delete[] m_unCompIndex;
-            if(m_compIndex != NULL)
-                delete[] m_compIndex;
-        }  
-        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            if (m_pArray != NULL)
-                delete[] m_pArray;
-            if(m_blockIds != NULL) 
-                delete[] m_blockIds;
-        }
-    }
-
-
-
-#pragma endregion Constructors and Destructor
-
-#pragma region Basic Operators
-
-    //make sure call order in colume wise for CSC and row wise for CSR
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, const ElemType v)
-    {
-        if(m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseCSR) 
-        {
-            throw std::logic_error("CPUSparseMatrix:  unsupported SetValue() call.");
-        }
-
-        if(m_elemSizeAllocated < m_nz +1) //automatic resize
-        {
-            Resize(m_numRows, m_numCols, m_nz + 100);  //allocate 100 more elelemnts and keep existing values
-        }
-
-        if(row < 0 || row >= m_numRows) 
-        {
-            throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id");
-        }
-
-        if(col < 0 || col >= m_numCols) {
-            throw std::logic_error("CPUSparseMatrix: SetValue() invalid column id");
-        }
-
-        size_t r = (m_format == matrixFormatSparseCSC) ? row: col;
-        size_t c = (m_format == matrixFormatSparseCSC) ? col: row;
-
-        m_pArray[m_nz] = v;
-        m_unCompIndex[m_nz] = (CPUSPARSE_INDEX_TYPE)r;
-
-        //consistency check
-        if(c == m_colIdx && r <= m_unCompIndex[m_nz-1]) 
-        {
-            throw std::logic_error("CPUSparseMatrix:  SetValue is not called properly");
-        }
-
-        if (c != m_colIdx) 
-        {
-            m_compIndex[c] = CPUSPARSE_INDEX_TYPE(m_nz);
-            m_colIdx = (int) c;
-        } 
-        m_compIndex[c + 1] = CPUSPARSE_INDEX_TYPE(m_nz + 1);
-        m_nz++;
-    }
-
-    template<class ElemType>
-    ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
-    {
-        return m_pArray;
-    }
-
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
-    {               
-        size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1;
-        bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize);
-
-        m_numRows = numRows;
-        m_numCols = numCols;
-
-        if (reallocate)
-        {                
-            if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
-            {
-                ElemType *pArray = new ElemType[numNZElemToReserve];
-                CPUSPARSE_INDEX_TYPE *unCompIndex = new CPUSPARSE_INDEX_TYPE[numNZElemToReserve];
-                CPUSPARSE_INDEX_TYPE *compIndex = new CPUSPARSE_INDEX_TYPE[newCompIndexSize];
-                
-                if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize))
-                    throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize");
-
-                if (keepExistingValues && m_nz > 0)
-                {
-                    assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve);
-                    memcpy(pArray, m_pArray, NzSize());
-                    memcpy(unCompIndex, m_unCompIndex, MajorIndexSize());
-                    memcpy(compIndex, m_compIndex, SecondaryIndexSize());
-                }
-
-                if (m_pArray != NULL)
-                    delete[] m_pArray;
-                if (m_unCompIndex != NULL)
-                    delete[] m_unCompIndex;
-                if (m_compIndex != NULL)
-                    delete[] m_compIndex;
-
-                m_pArray = pArray;
-                m_unCompIndex = unCompIndex;
-                m_compIndex = compIndex;
-            }
-            else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-            {
-                ElemType *blockVal = new ElemType[numNZElemToReserve];
-                size_t *blockIds = new size_t[newCompIndexSize];
-
-                if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize))
-                    throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize");
-
-                if (keepExistingValues && m_elemSizeAllocated > 0)
-                {
-                    assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve);
-                    memcpy(blockVal, m_pArray, NzSize());
-                    memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
-                }
-
-                if (m_pArray != NULL)
-                    delete[] m_pArray;
-                if(m_blockIds != NULL) 
-                    delete[] m_blockIds;
-
-                m_pArray = blockVal;
-                m_blockIds = blockIds;
-            }
-
-            m_elemSizeAllocated = numNZElemToReserve;
-            m_compIndexSize = newCompIndexSize;
-        }
-    }
-
-    //Reset matrix so it can be reused
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::Reset()
-    {                
-        m_nz = 0;
-        m_colIdx = -1;
-        m_blockSize = 0;
-    }
-
-    //c = alpha*op(lhs) * op(rhs) + beta*c
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA, 
-        const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c)
-
-    {
-        if (lhs.IsEmpty() || rhs.IsEmpty())
-            throw std::logic_error("MultiplyAndWeightedAdd:  one of the input matrix is empty.");
-
-        int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows();
-        int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols();
-        int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows();
-        int n = transposeB? (int)rhs.GetNumRows(): (int)rhs.GetNumCols();
-
-        assert (m>0 && k>0 && l>0 && n>0);  //converting from size_t to int may cause overflow
-        assert (k == l);
-        if (k != l) 
-        {
-            throw std::invalid_argument("CPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match.");
-        }
-
-        if (c.GetNumRows() != m || c.GetNumCols() != n) 
-        {
-            c.Resize(m,n);
-        }         
-
-        if (beta == 0)
-        {
-            memset(c.GetArray(), 0, sizeof(ElemType) * c.GetNumElements());
-        }
-        else if (beta != 1)
-        {
-#pragma omp parallel for
-            foreach_coord(i,j,c)
-            {
-                c(i,j) = beta * c(i,j); 
-            } 
-        }
-
-        if (rhs.GetFormat() != matrixFormatSparseCSC)
-            NOT_IMPLEMENTED;
-
-        if (!transposeA && !transposeB)
-        {
-            for(size_t j = 0; j < rhs.GetNumCols(); j++) 
-            {
-                size_t start = rhs.m_compIndex[j];  //ColLocation
-                size_t end = rhs.m_compIndex[j+1];
-                for(size_t p = start; p < end; p++)
-                { 
-                    size_t i = rhs.m_unCompIndex[p]; //RowLocation
-                    ElemType val = rhs.m_pArray[p];
-
-                    for(size_t h = 0; h < lhs.GetNumRows(); h++)
-                    {
-                        c(h,j) += alpha * lhs(h, i)*val; 
-                    }
-                }
-            }
-        }
-        else if (!transposeA && transposeB)
-        {           
-            for(size_t j = 0; j < rhs.GetNumCols(); j++)
-            { 
-                size_t start = rhs.m_compIndex[j];
-                size_t end = rhs.m_compIndex[j + 1];
-
-                for(size_t p = start; p < end; p++)
-                { 
-                    size_t i = rhs.m_unCompIndex[p];
-                    ElemType val = rhs.m_pArray[p];
-                    for(size_t h = 0; h < lhs.GetNumRows(); h++)
-                    {                     
-                        c(h, i) += alpha * lhs(h, j)*val;
-                    }
-                }
-            }           
-        }
-        else if (transposeA && !transposeB)
-        {
-            NOT_IMPLEMENTED;
-        }
-        else 
-        {
-            NOT_IMPLEMENTED;
-        }
-    }
-
-    //c = alpha * op(lhs) * op(rhs)
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA, 
-        const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c)
-    {
-        if (lhs.IsEmpty() || rhs.IsEmpty())
-            throw std::logic_error("LeftMultiplyAndAdd:  one of the input matrix is empty.");
-
-        int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows();
-        int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols();
-        int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows();
-        int n = transposeB? (int)rhs.GetNumRows(): (int)rhs.GetNumCols();
-
-        assert (m>0 && k>0 && l>0 && n>0); m; n;  //converting from size_t to int may cause overflow
-        assert (k == l);
-        if (k != l) 
-        {
-            throw std::invalid_argument("CPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match.");
-        }
-
-        c.Reset();
-
-        if (!transposeA && !transposeB)
-        {
-            NOT_IMPLEMENTED;
-        }
-        else if (!transposeA && transposeB)
-        {           
-            if (rhs.GetFormat() != matrixFormatSparseCSC)
-                NOT_IMPLEMENTED;
-
-            //allocate enough memory
-            c.SetFormat(matrixFormatSparseBlockCol);
-            c.Resize(m, n, m*min(n, rhs.m_nz));
-
-            map<size_t, size_t> w2Id;
-            for(size_t j = 0; j < rhs.GetNumCols(); j++)
-            { // j ranges over batches
-                size_t start = rhs.m_compIndex[j];
-                size_t end = rhs.m_compIndex[j+1];
-
-                for(size_t p = start; p < end; p++) 
-                { 
-                    size_t i = rhs.m_unCompIndex[p]; //i ranges over words
-                    ElemType val = rhs.m_pArray[p]; //1 for(i, j)
-
-                    bool first = true;
-                    if(w2Id.find(i) == w2Id.end()) 
-                    {
-                        w2Id[i] = w2Id.size();
-                        c.m_blockIds[c.m_blockSize]=i;
-                        c.m_blockSize++;
-                    } 
-                    else 
-                    {
-                        first = false;
-                    }
-                    size_t pos = w2Id[i] * lhs.GetNumRows();
-                    for(size_t h = 0; h < lhs.GetNumRows(); h++) 
-                    { // h range over hidden layer 
-                        if(first == true) 
-                        {
-                            c.m_pArray[pos] = alpha*lhs(h, j)*val;
-                        } else 
-                        {
-                            c.m_pArray[pos] += alpha*lhs(h, j)*val;
-                        }
-                        pos++;
-                    }
-                }
-            }   
-            c.m_nz = c.m_blockSize * m;
-            if(c.m_nz > c.GetSizeAllocated()) 
-            {
-                throw std::logic_error("sparse matrix out of range.");
-            }
-            //c.SetFormat(matrixFormatSparseBlockCol);
-        }
-        else if (transposeA && !transposeB)
-        {
-            NOT_IMPLEMENTED;
-        }
-        else 
-        {
-            NOT_IMPLEMENTED;
-        }
-    }
-
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
-    {
-        if (lhs.IsEmpty() || rhs.IsEmpty()) 
-        {
-            throw std::logic_error("ScaleAndAdd:  one of the input matrix is empty.");
-        }
-
-        if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols()) 
-        {
-            throw std::invalid_argument("CPUSparseMatrix::ScaleAndAdd: The dimensions of a and b must match.");
-        }
-
-        if(lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSC || lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSR) 
-        {
-            size_t col_num = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? lhs.GetNumCols(): lhs.GetNumRows();
-            for(size_t j = 0; j < col_num; j++) 
-            {
-                size_t start = lhs.m_compIndex[j];
-                size_t end = lhs.m_compIndex[j + 1];
-                for(size_t p = start; p < end; p++) 
-                {
-                    size_t i = lhs.m_unCompIndex[p];
-                    ElemType val = lhs.m_pArray[p];
-                    size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j;
-                    size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i;
-                    rhs(r, c) += alpha * val; 
-                }
-            }
-        } 
-        else if (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol || lhs.m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            for(size_t j = 0; j < lhs.m_blockSize; j++) 
-            {
-                size_t i = lhs.m_blockIds[j];
-                size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols();
-                size_t start = j * len;
-                for(size_t p = start; p < start+len; p++) 
-                {
-                    ElemType val = lhs.m_pArray[p];
-
-                    size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
-                    size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
-                    rhs(r, c) += alpha * val; 
-                }
-            }
-        } 
-        else 
-        {
-            throw std::runtime_error("CPUSparseMatrix:: ScaleAndAdd() Not implemented");
-        }
-    }
-
-
-    template<class ElemType>
-    bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
-    {
-        if (a.IsEmpty() || b.IsEmpty())
-            throw std::logic_error("AreEqual: one of the input matrices is empty.");
-
-        if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
-            return false;
-
-        bool result = true;
-
-        #pragma omp parallel for
-        foreach_coord(i, j, a)
-        {
-            if (abs(a(i, j) - b(i, j)) > threshold)
-            {
-                result = false;
-                break;
-            }
-        }
-
-        return result;
-    }
-
-    // a: H x No: H is hidden layer size and No is mini-batch size
-    // weight: V x H, V is vocab size
-    // label: V x No
-    // cls: 2 x Nc, Nc is number of classes, each col is start and end word ids of a class
-    // idx2cls: V x 1, mapping from word to class id
-    // etp: V x No, stores predicted values
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ClassEntropy(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& weight,
-        const CPUSparseMatrix<ElemType> & label, const CPUMatrix<ElemType>& cls, 
-        const CPUMatrix<ElemType>& idx2cls, CPUSparseMatrix<ElemType>& etp, CPUMatrix<ElemType>& entropyScore)
-    {
-        if (a.IsEmpty() || cls.IsEmpty() || label.IsEmpty() || idx2cls.IsEmpty())
-            throw std::logic_error("AssignSoftmaxOf: Matrix a, class, idx2cls or label is empty.");
-
-        if(etp.GetFormat() != MatrixFormat::matrixFormatSparseCSC)
-            throw std::runtime_error("CPUSparseMatrix:: ClassEntropy() only support CSC");  
-
-        size_t nC = cls.GetNumCols();
-        size_t nV = label.GetNumRows() - nC;
-
-        if (nV != idx2cls.GetNumRows() || idx2cls.GetNumCols() != 1 || cls.GetNumCols() + idx2cls.GetNumRows() != label.GetNumRows())
-            throw std::logic_error("ClassEntropy: check matrix dimension");
-        
-        //allocate enough memory
-        if(etp.m_elemSizeAllocated < etp.GetNumElements()) 
-        {
-            etp.Resize(etp.GetNumRows(), etp.GetNumCols(), etp.GetNumElements(), true, false);
-        }
-        etp.Reset();
-
-        entropyScore(0, 0) = 0;
-        for(size_t j = 0; j < label.GetNumCols(); j++)
-        {
-            size_t start = label.m_compIndex[j];
-            size_t end = label.m_compIndex[j + 1];
-            for (size_t p = start; p < end; p++)
-            {
-                size_t i = label.m_unCompIndex[p];
-                size_t iStt, iEnd;
-                if (i < nV)
-                {
-                    size_t clsid = (size_t)idx2cls(i, 0);
-                    iStt = (size_t) cls(0, clsid); //class start word id
-                    iEnd = (size_t) cls(1, clsid); //class end word id
-                }
-                else
-                {
-                    iStt = nV;
-                    iEnd = nV + nC;
-                }
-
-                size_t b = etp.m_nz;
-                for(size_t ii = iStt; ii < iEnd; ii++) //ii ranges over sub-vocab or class ids
-                {
-                    ElemType val = 0.0;
-                    foreach_row(rw, a) //rw ranges over hidden units
-                    {
-                        val += weight(ii,rw) * a(rw,j); 
-                    }
-                    etp.SetValue(ii, j, val); 
-                }
-                ElemType maxV = LZERO;
-                for(size_t ii = b; ii < etp.m_nz; ii++)
-                {
-                    maxV = (ElemType) logadd(maxV, etp.m_pArray[ii]);
-                }
-
-                for(size_t ii = b; ii < etp.m_nz; ii++)
-                {
-                    etp.m_pArray[ii] = etp.m_pArray[ii] - maxV;
-                }
-
-                entropyScore(0, 0) -= etp.m_pArray[b+i-iStt];
-                //negate positive data points
-                etp.m_pArray[b+i-iStt] *=-1;
-            }
-        }
-    }
-
-
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ClassEntropyError(CPUSparseMatrix<ElemType>& a)
-    {        
-        for(int i = 0; i < a.m_nz; i++) 
-        {
-            if(a.m_pArray[i] < 0) 
-            {
-                a.m_pArray[i] = exp(a.m_pArray[i]); //negative;
-            } 
-            else 
-            { 
-                a.m_pArray[i] = exp(-a.m_pArray[i])-1; //positive
-            }
-        }       
-    }
-
-
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfInput(
-        const CPUSparseMatrix<ElemType>& error, 
-        const CPUMatrix<ElemType>& weight,
-        CPUMatrix<ElemType>& grd) 
-    {
-        grd.SetValue(0);
-
-        for(size_t j = 0; j < error.GetNumCols(); j++) 
-        {
-            size_t start = error.m_compIndex[j];
-            size_t end = error.m_compIndex[j+1];
-            for(size_t p = start; p < end; p++)
-            {
-                size_t i = error.m_unCompIndex[p];
-                for(size_t h = 0; h < grd.GetNumRows(); h++)
-                { // h ranges over hidden units
-                    grd(h,j) += weight(i, h) * error.m_pArray[p];
-                }
-            }
-        }
-    }
-
-
-
-    template<class ElemType>
-    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfWeight(
-        const CPUSparseMatrix<ElemType>& error, 
-        const CPUMatrix<ElemType>& input,
-        const CPUSparseMatrix<ElemType> & /*label*/,
-        const CPUMatrix<ElemType>& /*cls*/, 
-        const CPUMatrix<ElemType>& /*idx2cls*/,
-        CPUSparseMatrix<ElemType>& grd) 
-    {   
-        grd.SetFormat(matrixFormatSparseBlockRow);
-        //allocate enough memory
-        grd.Resize(grd.GetNumRows(), grd.GetNumCols(), error.m_nz*input.GetNumRows(), true, false);
-
-        grd.Reset();
-        map<size_t, size_t> w2Id;
-        for(size_t j = 0; j < error.GetNumCols(); j++)
-        {
-            size_t start = error.m_compIndex[j];
-            size_t end = error.m_compIndex[j+1];
-
-            for(size_t p = start; p < end; p++)
-            {
-                size_t i = error.m_unCompIndex[p]; // i ranges over words
-                bool first = true;
-                if(w2Id.find(i) == w2Id.end()) 
-                {
-                    w2Id[i] = w2Id.size();
-                    grd.m_blockIds[grd.m_blockSize]=i;
-                    grd.m_blockSize++;
-                } 
-                else 
-                {
-                    first = false;
-                }
-                size_t pos = w2Id[i]*input.GetNumRows();
-                for(size_t h = 0; h < input.GetNumRows(); h++)
-                { // h range over hidden layer 
-                    if(first == true) 
-                    {
-                        grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
-                    } 
-                    else 
-                    {
-                        grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
-                    }
-                    pos++;
-                }
-            }
-        }
-        grd.m_nz = grd.m_blockSize * input.GetNumRows();
-        if(grd.m_nz > grd.GetSizeAllocated()) 
-        {
-            throw std::logic_error("sparse matrix out of range.");
-        }
-        //grd.SetFormat(matrixFormatSparseBlockRow);
-    }
-
-    // normal update for smoothed gradients c and current gradients (this)
-    template<class ElemType> 
-    void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum)
-    {
-        if (c.IsEmpty())
-        {
-            c.Resize(GetNumRows(), GetNumCols());
-            c.SetValue(0.0);
-        }
-
-        if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            for(size_t j = 0; j < m_blockSize; j++) 
-            {
-                size_t i = m_blockIds[j];
-                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
-                size_t start = j* len;
-                for(size_t p = start; p < start+len; p++) 
-                {
-                    ElemType val = m_pArray[p];
-                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
-                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
-                    c(row, col) = (1-momentum)*val + momentum*c(row, col);
-                    m_pArray[p] = c(row, col);
-                }
-            }
-        } 
-        else 
-        {
-            throw std::runtime_error("CPUSparseMatrix:: NormalGrad() only support block sparse format");
-        }
-    }
-
-    // update smoothed gradients c and current gradients (this)
-    template<class ElemType> 
-    void CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c)
-    {
-        if (c.IsEmpty())
-        {
-            c.Resize(GetNumRows(), GetNumCols());
-            c.SetValue(0.0);
-        }
-
-        const ElemType floor = 1e-16f;
-        if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) 
-        {
-            size_t col_num = (m_format == MatrixFormat::matrixFormatSparseCSC) ? GetNumCols() : GetNumRows();
-            for(size_t j = 0; j < col_num; j++) 
-            {
-                size_t start = m_compIndex[j];
-                size_t end = m_compIndex[j+1];
-                for(size_t p = start; p < end; p++) 
-                {
-                    size_t i = m_unCompIndex[p];
-                    ElemType val = m_pArray[p];
-
-                    size_t row = (m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j;
-                    size_t col = (m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i;
-                    ElemType adenorm = c(row, col); 
-                    adenorm += val * val; 
-                    val = val / (floor + sqrt(adenorm)); 
-                    m_pArray[p] = val;
-                    c(row, col) = adenorm; 
-                }
-            }
-        } else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            for(size_t j = 0; j < m_blockSize; j++)
-            {
-                size_t i = m_blockIds[j];
-                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
-                size_t start = j* len;
-                for(size_t p = start; p < start+len; p++) 
-                {
-                    ElemType val = m_pArray[p];
-
-                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
-                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
-                    ElemType adenorm = c(row, col); 
-                    adenorm += val * val; 
-                    val = val / (floor + sqrt(adenorm)); 
-                    m_pArray[p] = val;
-                    c(row, col) = adenorm; 
-                }
-            }
-        } 
-    }
-
-    template<class ElemType>
-    CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncate (const ElemType threshold)
-    {
-        if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) 
-        {
-            ElemType locThresholdPos = abs(threshold);
-            ElemType locTHresholdNeg = -locThresholdPos; 
-
-            for(size_t j = 0; j < m_blockSize; j++) 
-            {
-                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
-                size_t start = j* len;
-                for (size_t p = start; p < start+len; p++)
-                {
-                    if (m_pArray[p] > locThresholdPos)
-                    {
-                        m_pArray[p] = locThresholdPos;
-                    }
-                    else if (m_pArray[p] < locTHresholdNeg)
-                    {
-                        m_pArray[p] = locTHresholdNeg;
-                    }
-                }
-            }
-        } 
-        else 
-        {
-            throw std::runtime_error("CPUSparseMatrix:: InplaceTruncate() only support block based sparse matrix");
-        }
-        return *this;
-    }    
-
-    template <class ElemType>
-    MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
-    {
-        stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
-        size_t elsize;
-        stream >> elsize;
-        if (sizeof(ElemType) != elsize)
-            throw std::runtime_error("Template argument size doesn't match those in file");
-        std::wstring matrixName;
-
-        // now prepare this header to receive the data being read
-        size_t nz, colnum, rownum;
-        int format;
-
-        // read in the header information
-        stream >> matrixName >> format >> nz >> colnum >> rownum;
-
-        us.SetFormat((MatrixFormat)format);
-        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
-            NOT_IMPLEMENTED;
-
-        us.Resize(rownum, colnum, nz);
-
-        if (nz > 0)
-        {
-            size_t compressedSize = (us.GetFormat() == matrixFormatSparseCSC) ? colnum + 1 : rownum + 1;
-            ElemType* dataBuffer = us.NzValues();
-            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
-            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
-
-            // read in the sparse matrix info
-            for (size_t i = 0; i < nz; ++i)
-            {
-                stream >> dataBuffer[i];
-            }
-            for (size_t i = 0; i < nz; ++i)
-            {
-                stream >> unCompressedIndex[i];
-            }
-            for (size_t i = 0; i < compressedSize; ++i)
-            {
-                stream >> compressedIndex[i];
-            }
-        }
-        stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
-
-        us.SetMatrixName(matrixName.c_str());
-
-        return stream;
-    }
-
-    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
-    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
-
-    template <class ElemType>
-    MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
-    {
-        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
-            NOT_IMPLEMENTED;
-
-        stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
-        stream << sizeof(ElemType);
-        if (us.GetMatrixName() == nullptr)
-        {
-            std::wstring s(L"nnmatrix");
-            stream << s;
-        }
-        else
-        {
-            stream << us.GetMatrixName();
-        }
-
-        size_t nz, numRows, numCols;
-        size_t compressedSize = us.SecondaryIndexCount();
-        int format = us.GetFormat();
-
-        stream << format << nz << numCols << numRows;
-
-        if (nz > 0)
-        {
-            ElemType* dataBuffer = us.NzValues();
-            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
-            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
-
-            for (size_t i = 0; i < nz; ++i)
-            {
-                stream << dataBuffer[i];
-            }
-            for (size_t i = 0; i < nz; ++i)
-            {
-                stream << unCompressedIndex[i];
-            }
-            for (size_t i = 0; i < compressedSize; ++i)
-            {
-                stream << compressedIndex[i];
-            }
-        }
-        stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
-
-        return stream;
-    }
-
-    template class CPUSparseMatrix<float>;
-    template class CPUSparseMatrix<double>;
-
-}}}
->>>>>>> origin/master
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
--- a/Math/Math/GPUSparseMatrix.h
+++ b/Math/Math/GPUSparseMatrix.h
@ -1,5 +1,4 @@
-<<<<<<< HEAD
-//
+//
 // <copyright file="GPUSparseMatrix.h" company="Microsoft">
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
 // </copyright>
@ -24,8 +23,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
    {
-        typedef BaseMatrix<ElemType> B; using B::m_numRows; using B::m_numCols; using B::m_pArray; using B::m_elemSizeAllocated; using B::m_nz; using B::m_format;   // without this, base members would require to use thi-> in GCC
-
+	public:
+        typedef BaseMatrix<ElemType> B; 
+		using B::m_numRows; 
+		using B::m_numCols; 
+		using B::m_pArray; 
+		using B::m_elemSizeAllocated; 
+		using B::m_nz; 
+		using B::m_format;   
+		using B::m_computeDevice;
+		using B::m_externalBuffer;
+		using B::m_matrixName;
+		using B::OwnBuffer;
+		using B::GetFormat;
+		using B::SetFormat;
+		using B::GetNumRows;
+		using B::GetNumCols;
+		using B::IsEmpty;
+		using B::SetComputeDeviceId;
+		using B::SetMatrixName;
+		using B::SetNzCount;
+		// without the using-declarations above, base members would have to be accessed via this-> in GCC
    public:
        GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR, const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);

@ -264,270 +282,3 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    };
 }}}    

-=======
-//
-// <copyright file="GPUSparseMatrix.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-
-#pragma once
-
-#include "GPUMatrix.h"
-#include "CPUSparseMatrix.h"
-#include <functional>
-
-namespace Microsoft { namespace MSR { namespace CNTK {    
-
-    //GPU Sparse Matrix, using cuSPARSE library.
-    //By default we are assuming CSR representation
-    // NOTE m_elemSizeAllocated (in base matrix) means the number of non-zero elements we have allocated space
-    // We are packing the CSR format (pointed to by m_pArray) as follows:
-    // ElemType elements[m_elemSizeAllocated]
-    // int colIdx[m_elemSizeAllocated]
-    // int rowIdxStart[m_numRows+1]
-
-    template<class ElemType>
-    class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
-    {
-        typedef BaseMatrix<ElemType> B; using B::m_numRows; using B::m_numCols; using B::m_pArray; using B::m_elemSizeAllocated; using B::m_nz; using B::m_format;   // without this, base members would require to use thi-> in GCC
-
-    public:
-        GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR, const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
-
-        GPUSparseMatrix(const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR,
-            const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
-    
-        GPUSparseMatrix(const GPUSparseMatrix<ElemType>&);
-
-        GPUSparseMatrix(const GPUMatrix<ElemType>&, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
-
-#ifndef    LINUX
-        GPUSparseMatrix(GPUSparseMatrix<ElemType>&&);
-#endif    /* LINUX */
-
-        ~GPUSparseMatrix();
-
-    public:
-        void Reset();
-
-    public:
-        // return col pointer, which is immediately following the non-zero element
-        // in memory format is always in the following order:
-        // Non-zero data elements, Full index locations, compressed index locations
-        // In CSR row data is compressed, in CSC col data is compressed
-        inline const ElemType* NzValues() const {return m_pArray;}
-        inline ElemType* NzValues() {return m_pArray;}
-        inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
-
-        GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
-        size_t MajorIndexCount() const { return m_nz; }
-        size_t MajorIndexSize() const { return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount(); } // actual number of major index bytes in use
-
-        GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const { return MajorIndexLocation() + m_elemSizeAllocated; } //this is the compressed index, col/row in CSC/CSR format
-        size_t SecondaryIndexCount(const size_t numNZ) const 
-        {
-            if (m_format&matrixFormatCompressed)
-            {
-                size_t cnt = (m_format&matrixFormatRowMajor)?m_numRows:m_numCols;
-                if (cnt > 0) cnt++; // add an extra element on the end for the "max" value
-                return cnt;
-            }
-            else
-                return numNZ; // COO format
-        }
-
-        size_t SecondaryIndexCount() const
-        {
-            return SecondaryIndexCount(m_nz);
-        }
-
-        // get size for compressed index
-        size_t SecondaryIndexSize() const { return (SecondaryIndexCount())*sizeof(GPUSPARSE_INDEX_TYPE); }
-
-        size_t BufferSizeNeeded() const { return NzSize() + MajorIndexSize() + SecondaryIndexSize(); }
-        size_t BufferSizeNeeded(const size_t numNZ) const 
-        { return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
-
-        inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
-        inline ElemType* BufferPointer() const { return m_pArray; }
-
-        // the column and row locations will swap based on what format we are in. Full index always follows the data array
-        GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
-        size_t RowSize() const {return (m_format&matrixFormatRowMajor)?SecondaryIndexSize():MajorIndexSize();} 
-        GPUSPARSE_INDEX_TYPE* ColLocation() const { return (m_format&matrixFormatRowMajor) ? MajorIndexLocation() : SecondaryIndexLocation(); }
-        size_t ColSize() const {return (m_format&matrixFormatRowMajor)?MajorIndexSize():SecondaryIndexSize();} // actual number of bytes in use
-
-        void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
-        void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
-        void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
-        void SetValue(const GPUMatrix<ElemType>& denseMatrix);
-
-        void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);
-        void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
-        void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly = true);
-
-        GPUSparseMatrix<ElemType> Transpose() const;
-        void InplaceTranspose();
-        GPUSparseMatrix<ElemType>& AssignTransposeOf(const GPUSparseMatrix<ElemType>& a);
-
-        GPUMatrix<ElemType> CopyToDenseMatrix() const;
-        void CopyToDenseMatrix(GPUMatrix<ElemType> &denseMatrix) const;
-        void CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType> &cpuSparseMatrix) const;
-        void ChangeDeviceTo(DEVICEID_TYPE toId);
-
-        GPUSparseMatrix<ElemType>& operator=(const GPUSparseMatrix<ElemType>& deepCopy);
-#ifndef    LINUX
-        GPUSparseMatrix<ElemType>& operator=(GPUSparseMatrix<ElemType>&& moveFrom);
-#endif    /* LINUX */
-        GPUSparseMatrix<ElemType> operator+ (const GPUSparseMatrix<ElemType>& a) const;
-        GPUSparseMatrix<ElemType> operator- (const GPUSparseMatrix<ElemType>& a) const;
-        GPUSparseMatrix<ElemType>& operator^= (const ElemType alpha); //element-wise power        
-        GPUSparseMatrix<ElemType> operator^ (const ElemType alpha) const; //element-wise power
-        GPUSparseMatrix<ElemType>& operator*= (const ElemType alpha);
-        GPUSparseMatrix<ElemType> operator*(const ElemType alpha) const;
-        GPUSparseMatrix<ElemType>& AssignElementPowerOf(const GPUSparseMatrix<ElemType>& a, const ElemType power);        
-
-        bool IsEqualTo(const GPUSparseMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
-        bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
-    public:
-        virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
-        inline size_t GetNumNZElements() const {return m_nz;}
-
-        //Sets sparse matrix in CSR format. this acts as deep copy
-        void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val, 
-            const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
-        void SetMatrixFromCSCFormat(const GPUSPARSE_INDEX_TYPE *h_CSCCol, const GPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
-            const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
-        void SetMatrixFromLabelAndClass(CPUSPARSE_INDEX_TYPE *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t labelSize, size_t expandedSize, size_t blockSize);
-        //Gets sparse matrix in CSR format. this acts as deep copy. All passed pointers must be NULL. the function will allocate memory itself.
-        void GetMatrixFromCSRFormat(GPUSPARSE_INDEX_TYPE*& h_CSRRow, GPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
-
-        void GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
-
-        void ConvertToSparseFormat(MatrixFormat newFormat);
-        void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
-
-    public:
-        GPUSparseMatrix<ElemType>& ElementInverse ();
-        GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceLinearRectifierDerivative();
-        GPUSparseMatrix<ElemType>& AssignLinearRectifierDerivativeOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceSigmoid ();
-        GPUSparseMatrix<ElemType>& AssignSigmoidOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceTanh ();
-        GPUSparseMatrix<ElemType>& AssignTanhOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceSqrt ();
-        GPUSparseMatrix<ElemType>& AssignSqrtOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceExp ();
-        GPUSparseMatrix<ElemType>& AssignExpOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceLog ();
-        GPUSparseMatrix<ElemType>& AssignLogOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceAbs ();   
-        GPUSparseMatrix<ElemType>& AssignAbsOf (const GPUSparseMatrix<ElemType>& a);
-
-        GPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
-
-        GPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
-        GPUSparseMatrix<ElemType>& AssignTruncateBottomOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
-        GPUSparseMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
-        GPUSparseMatrix<ElemType>& AssignTruncateTopOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
-
-        GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
-
-        ElemType SumOfElements () const; //sum of all elements
-        ElemType SumOfAbsElements () const; //sum of all abs(elements)
-        ElemType FrobeniusNorm() const;
-        ElemType MatrixNormInf() const;
-        ElemType MatrixNorm1() const;
-        ElemType MatrixNorm0() const { return (ElemType)GetNumNZElements(); };
-    public:        
-        //Performs C = alpha ∗ op ( S ) ∗ D + beta ∗ C; Where S is sparse and D and C are dense
-        static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b, 
-            const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);
-        static void MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& S, const bool transposeS, const GPUMatrix<ElemType>& D, 
-            const bool transposeD, ElemType beta, GPUMatrix<ElemType>& C);
-        static void MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, 
-            const bool transposeB, GPUSparseMatrix<ElemType>& c);
-        static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
-        
-        static void ClassEntropy(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& weight,
-            const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls, 
-            const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& etp, GPUMatrix<ElemType>& entropyScore);
-        static void ClassEntropyError(GPUSparseMatrix<ElemType>& a);
-        static void ClassEntropyGradientOfInput(const GPUSparseMatrix<ElemType>& error, const GPUMatrix<ElemType>& weight,  GPUMatrix<ElemType>& grd);
-        static void ClassEntropyGradientOfWeight(const GPUSparseMatrix<ElemType>& error,  const GPUMatrix<ElemType>& input, const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls, 
-        const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& grd);
-
-        void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);
-        
-        static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
-        static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
-        static void Multiply(const GPUSparseMatrix<ElemType>& S1, bool transposeS1, const GPUSparseMatrix<ElemType>& S2, bool transposeS2, GPUSparseMatrix<ElemType> &C);
-        GPUSparseMatrix<ElemType>& AssignProductOf(const GPUSparseMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b, const bool transposeB);
-
-        static ElemType InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
-        static ElemType InnerProductOfMatrices(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);
-        static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);
-        static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
-        static void ScaleAndAdd(ElemType alpha,const GPUMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
-        static void Scale(ElemType alpha, GPUSparseMatrix<ElemType>& a);
-        static void ElementWisePower (ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c);
-        static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
-        static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
-        static bool AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
-
-        //For these two, I should also add a version which would return GPUSparseMatrix, since Dense.*Sparse =Sparse.*Dense=Sparse
-        static GPUMatrix<ElemType> ElementProductOf (const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
-        static GPUMatrix<ElemType> ElementProductOf (const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);     
-
-    public:
-        // See: http://stackoverflow.com/questions/4660123/overloading-friend-operator-for-template-class/4661372#4661372
-        template <class ElemTypeDummy>
-        friend MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemTypeDummy>& us);
-        template <class ElemTypeDummy>
-        friend MATH_API File& operator<<(File& stream, const GPUSparseMatrix<ElemTypeDummy>& us);
-
-     private:
-         void* ReserveTempHostBuffer(const size_t sizeInByte) const;
-         template <class OutType, class InType>
-         static void CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size);
-    private:
-        void ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE deviceId);
-
-    private:
-        void performInplaceFunction(const int kind);
-        void DeepCopy(const GPUSparseMatrix<ElemType>& deepCopyFrom);
-        void Clear();
-        void PrepareBuffer(const size_t numRows, const size_t numCols, const bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func);
-        size_t ElemCountFromBufferSize(const size_t totalBufferSize) const;
-        size_t ElemCountFromBufferSize() const;
-        DEVICEID_TYPE PrepareDevice(const DEVICEID_TYPE deviceId = -1) const;
-
-     private:
-
-        size_t m_totalBufferSizeAllocated;
-
-        size_t m_blockSize; //block size        
-        size_t *m_blockIds; //block ids
-        size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
-
-        size_t m_expandedSize; // expanded label size
-        size_t* m_block2Id; // label block id to first word location
-        size_t* m_block2UniqId; // label block id to unique first word location        
-
-        mutable void* m_tempHostBuffer; //used to copy values.
-        mutable size_t m_tempHostBufferSize;
-
-        static bool do_sync; 
-    };
-}}}    
-
->>>>>>> origin/master