CNTK/MachineLearning/cn/SimpleNetworkBuilder.h

//
// <copyright file="SimpleNetworkBuilder.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "basetypes.h"
#include "ComputationNetwork.h"
#include "IComputationNetBuilder.h"
#include <string>
#include "commandArgUtil.h"
#include "Matrix.h"
#include <stdexcept>
#include <regex>
#include "BestGpu.h"
#pragma warning (disable: 4661)
using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
#define MAX_DEPTH 20
typedef enum tpRNNType
{
    SIMPLENET = 0, /// no recurrent connections
    SIMPLERNN = 1, LSTM = 2, DEEPRNN = 4, CLASSLM = 8,
    LBLM = 16, NPLM = 32, CLASSLSTM = 64, TENSORIOLSTM = 128
} RNNTYPE;
enum class TrainingCriterion : int
{
    CrossEntropyWithSoftmax,
    CrossEntropy,
    SquareError,
    ClassCrossEntropyWithSoftmax
};
enum class EvalCriterion : int
{
    CrossEntropyWithSoftmax,
    CrossEntropy,
    SquareError,
    ErrorPrediction,
    ClassCrossEntropyWithSoftmax
};
extern TrainingCriterion ParseTrainingCriterionString(wstring s);
extern EvalCriterion ParseEvalCriterionString(wstring s);
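
// Usage sketch for the parsers above (criterion names follow the enum
// spellings used in config files; the values shown are illustrative):
//
//   TrainingCriterion tc = ParseTrainingCriterionString(L"CrossEntropyWithSoftmax");
//   EvalCriterion     ec = ParseEvalCriterionString(L"ErrorPrediction");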
template<class ElemType>
class SimpleNetworkBuilder : public IComputationNetBuilder<ElemType>
{
protected:
    typedef ComputationNode<ElemType>* ComputationNodePtr;

private:
    SimpleNetworkBuilder() // disable default constructor from being called
    {
    }

public:
    SimpleNetworkBuilder(const ConfigParameters& config) : m_net(nullptr)
    {
        Init(config);
    }
    // full-parameter Init routine
    void Init(const intargvector& layerSizes, const TrainingCriterion trainCriterion, const EvalCriterion evalCriterion,
              const stringargvector nonLinearFunctions = L"Sigmoid",
              const bool addDropoutNodes = false,
              const bool uniformInit = true, const ElemType initValueScale = 1.0f,
              const bool applyMeanVarNorm = false, bool needPrior = false, DEVICEID_TYPE deviceId = AUTOPLACEMATRIX)
    {
        m_deviceId = deviceId;
        m_net = new ComputationNetwork<ElemType>(m_deviceId);

        m_layerSizes = layerSizes;
        m_applyMeanVarNorm = applyMeanVarNorm;
        m_trainCriterion = trainCriterion;
        m_evalCriterion = evalCriterion;
        m_addDropoutNodes = addDropoutNodes;
        m_needPrior = needPrior;
        m_nonLinearFunctions = nonLinearFunctions;
        m_uniformInit = uniformInit;
        m_initValueScale = initValueScale;
        if (m_layerSizes.size() < 2)
            throw std::invalid_argument("A network should have at least two layers (one input and one output)");

        if (m_deviceId == AUTOPLACEMATRIX)
            m_deviceId = Matrix<ElemType>::GetBestGPUDeviceId();
        m_net->SetDeviceID(m_deviceId);
        if (m_deviceId < 0)
            fprintf(stderr, "SimpleNetworkBuilder Using CPU\n");
        else
            fprintf(stderr, "SimpleNetworkBuilder Using GPU %d\n", m_deviceId);
    }
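
    // Usage sketch for the full-parameter Init (hypothetical values; layer
    // sizes normally come from the "layerSizes" config entry, e.g.
    // layerSizes=784:512:512:10 for one input, two hidden, and one output layer):
    //
    //   builder.Init(layers, TrainingCriterion::CrossEntropyWithSoftmax,
    //                EvalCriterion::ErrorPrediction, L"Sigmoid");
    //   // trailing parameters keep their defaults: no dropout, uniform init,
    //   // initValueScale = 1.0, no mean/var norm, no prior, auto device placement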
    virtual void InitRecurrentConfig(const ConfigParameters& config)
    {
        ConfigArray rLayerSizes = config("recurrentLayer", "");
        intargvector recurrentLayers = rLayerSizes;
        m_recurrentLayers = recurrentLayers;
        m_defaultHiddenActivity = config("defaultHiddenActivity", "0.1");
        ConfigArray str_rnnType = config("rnnType", L"SIMPLENET");

        m_maOrder = config("maOrder", "0");
        m_lookupTableOrder = config("lookupTableOrder", "0");
        m_labelEmbeddingSize = config("labelEmbeddingSize", "10");
        m_constForgetGateValue = config("constForgetGateValue", "false");
        m_constInputGateValue = config("constInputGateValue", "false");
        m_constOutputGateValue = config("constOutputGateValue", "false");
        m_forgetGateInitVal = config("forgetGateInitVal", "-1");
        m_inputGateInitVal = config("inputGateInitVal", "-1");
        m_outputGateInitVal = config("outputGateInitVal", "-1");

        stringargvector strType = str_rnnType;
        if (std::find(strType.begin(), strType.end(), L"SIMPLERNN") != strType.end())
            m_rnnType = SIMPLERNN;
        if (std::find(strType.begin(), strType.end(), L"LSTM") != strType.end())
            m_rnnType = LSTM;
        if (std::find(strType.begin(), strType.end(), L"DEEPRNN") != strType.end())
            m_rnnType = DEEPRNN;
        if (std::find(strType.begin(), strType.end(), L"CLASSLM") != strType.end())
            m_rnnType = CLASSLM;
        if (std::find(strType.begin(), strType.end(), L"LBLM") != strType.end())
            m_rnnType = LBLM;
        if (std::find(strType.begin(), strType.end(), L"NPLM") != strType.end())
            m_rnnType = NPLM;
        if (std::find(strType.begin(), strType.end(), L"CLASSLSTM") != strType.end())
            m_rnnType = CLASSLSTM;
        if (std::find(strType.begin(), strType.end(), L"TENSORIOLSTM") != strType.end())
            m_rnnType = TENSORIOLSTM;
    }
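
    // The keys read above map to config entries like the following (values
    // are illustrative, not defaults):
    //
    //   rnnType=LSTM               # SIMPLENET|SIMPLERNN|LSTM|DEEPRNN|CLASSLM|LBLM|NPLM|CLASSLSTM|TENSORIOLSTM
    //   recurrentLayer=1:2         # hidden layers that receive recurrent connections
    //   defaultHiddenActivity=0.1  # default activity for the recurrent hidden state
    //   lookupTableOrder=3
    //   labelEmbeddingSize=10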
    // Init - Builder initialize for multiple data sets
    // config - [in] configuration parameters for the network builder
    virtual void Init(const ConfigParameters& config)
    {
        DEVICEID_TYPE deviceId = DeviceFromConfig(config);

        ElemType initValueScale = config("initValueScale", "1.0");
        ConfigArray layerTypes = config("layerTypes", L"Sigmoid");
        stringargvector nonlinearFunctions = layerTypes;
        bool uniformInit = config("uniformInit", "true");
        bool applyMeanVarNorm = config("applyMeanVarNorm", "false");
        bool needPrior = config("needPrior", "false");
        bool addDropoutNodes = config("addDropoutNodes", "false");

        ConfigArray layerSizes;
        intargvector layers;
        TrainingCriterion trainingCriterion;
        EvalCriterion evalCriterion;
        layerSizes = config("layerSizes", "100");
        layers = layerSizes;
        trainingCriterion = ParseTrainingCriterionString(config("trainingCriterion"));
        evalCriterion = ParseEvalCriterionString(config("evalCriterion"));

        ConfigArray rDirect = config("directConnect", "");
        m_directConnect = rDirect;

        Init(layers, trainingCriterion, evalCriterion, nonlinearFunctions, addDropoutNodes,
             uniformInit, initValueScale, applyMeanVarNorm, needPrior, deviceId);
        InitRecurrentConfig(config);
    }
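
    // Putting it together, a SimpleNetworkBuilder config section might look
    // like this (a sketch; values are illustrative):
    //
    //   SimpleNetworkBuilder=[
    //       layerSizes=784:512:512:10
    //       layerTypes=Sigmoid
    //       trainingCriterion=CrossEntropyWithSoftmax
    //       evalCriterion=ErrorPrediction
    //       uniformInit=true
    //       initValueScale=1.0
    //       applyMeanVarNorm=false
    //       needPrior=false
    //       addDropoutNodes=false
    //   ]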
    virtual ~SimpleNetworkBuilder()
    {
        delete m_net;
    }

    virtual ComputationNetwork<ElemType>& LoadNetworkFromFile(const wstring& modelFileName, bool forceLoad = true)
    {
        if (m_net->GetTotalNumberOfNodes() == 0 || forceLoad) // not built yet, or forced reload
        {
            bool isDBN = false;
            { // force fstream to close when it goes out of scope
                File fstream(modelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
                isDBN = CheckDbnTag(fstream, "DBN\n");
            }
            if (isDBN)
            {
                BuildNetworkFromDbnFile(modelFileName);
            }
            else
            {
                m_net->LoadFromFile(modelFileName);
            }
        }
        m_net->ResetEvalTimeStamp();
        return *m_net;
    }
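
    // Usage sketch: the leading "DBN\n" tag is sniffed to pick the loader, so
    // CNTK-native models and pretrained DBN files go through the same call
    // (the file name is hypothetical):
    //
    //   ComputationNetwork<ElemType>& net = builder.LoadNetworkFromFile(L"model.dnn");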
    ComputationNetwork<ElemType>& BuildNetworkFromDescription()
    {
        size_t mbSize = 1;
        if (m_rnnType == SIMPLERNN)
            return BuildSimpleRNN(mbSize);
        if (m_rnnType == LSTM)
            return BuildLSTMNetworkFromDescription(mbSize);
        if (m_rnnType == CLASSLSTM)
            return BuildCLASSLSTMNetworkFromDescription(mbSize);
        if (m_rnnType == CLASSLM)
            return BuildClassEntropyNetwork(mbSize);
        if (m_rnnType == LBLM)
            return BuildLogBilinearNetworkFromDescription(mbSize);
        if (m_rnnType == NPLM)
            return BuildNeuralProbNetworkFromDescription(mbSize);
        if (m_rnnType == TENSORIOLSTM)
            return BuildLSTMInputOutputTensorNetworkFromDescription(mbSize);

        if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
        {
            unsigned long randomSeed = 1;
            size_t mbSize = 3; // note: not the actual minibatch size; only used in the validation process
            size_t numHiddenLayers = m_layerSizes.size() - 2;
            ComputationNodePtr input = nullptr, w = nullptr, b = nullptr, output = nullptr, label = nullptr, prior = nullptr, scaledLogLikelihood = nullptr;

            input = m_net->Input(m_layerSizes[0], mbSize, L"features");
            m_net->FeatureNodes().push_back(input);

            if (m_applyMeanVarNorm)
            {
                w = m_net->Mean(input, L"MeanOfFeatures");
                b = m_net->InvStdDev(input, L"InvStdOfFeatures");
                output = m_net->PerDimMeanVarNormalization(input, w, b, L"MVNormalizedFeatures");
                input = output;
            }

            if (numHiddenLayers > 0)
            {
                w = m_net->Parameter(m_layerSizes[1], m_layerSizes[0], L"W0");
                m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
                b = m_net->Parameter(m_layerSizes[1], 1, L"B0");
                output = ApplyNonlinearFunction(m_net->Plus(m_net->Times(w, input, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");
                if (m_addDropoutNodes)
                    input = m_net->Dropout(output, L"DropH1");
                else
                    input = output;

                for (int i = 1; i < numHiddenLayers; i++)
                {
                    wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i);
                    wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i);
                    wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", i);
                    wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
                    wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
                    wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

                    w = m_net->Parameter(m_layerSizes[i + 1], m_layerSizes[i], nameOfW);
                    m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
                    b = m_net->Parameter(m_layerSizes[i + 1], 1, nameOfB);
                    output = ApplyNonlinearFunction(m_net->Plus(m_net->Times(w, input, nameOfTimes), b, nameOfPlus), i, nameOfH);
                    if (m_addDropoutNodes)
                        input = m_net->Dropout(output, L"Drop" + nameOfH);
                    else
                        input = output;
                }
            }

            wstring nameOfW = msra::strfun::wstrprintf(L"W%d", numHiddenLayers);
            wstring nameOfB = msra::strfun::wstrprintf(L"B%d", numHiddenLayers);
            wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", numHiddenLayers - 1);
            wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
            wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;

            w = m_net->Parameter(m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers], nameOfW);
            m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
            b = m_net->Parameter(m_layerSizes[numHiddenLayers + 1], 1, nameOfB);
            output = m_net->Plus(m_net->Times(w, input, nameOfTimes), b, nameOfPlus);
            m_net->RenameNode(output, L"HLast");

            label = m_net->Input(m_layerSizes[numHiddenLayers + 1], mbSize, L"labels");
            AddTrainAndEvalCriterionNodes(output, label);

            if (m_needPrior)
            {
                prior = m_net->Mean(label, L"Prior");
                input = m_net->Log(prior, L"LogOfPrior");
                // the following two lines are needed only if true probabilities are required
                //output = m_net->Softmax(output);
                //output = m_net->Log(output);
                scaledLogLikelihood = m_net->Minus(output, input, L"ScaledLogLikelihood");
                m_net->OutputNodes().push_back(scaledLogLikelihood);
            }
            else
            {
                m_net->OutputNodes().push_back(output);
            }

            // add a softmax layer (needed if true probabilities or KL-regularized adaptation are required)
            output = m_net->Softmax(output, L"PosteriorProb");
            //m_net->OutputNodes().push_back(output);
        }
        m_net->ResetEvalTimeStamp();
        return *m_net;
    }
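
    // For example, with layerSizes=784:512:512:10 and layerTypes=Sigmoid the
    // non-recurrent branch above builds (node names as created by the code):
    //
    //   features[784 x mbSize]
    //     -> (MeanOfFeatures / InvStdOfFeatures / MVNormalizedFeatures, if applyMeanVarNorm)
    //     -> H1 = Sigmoid(W0*features + B0)   // 512
    //     -> H2 = Sigmoid(W1*H1 + B1)         // 512
    //     -> HLast = W2*H2 + B2               // 10
    //   with criterion nodes attached to (HLast, labels) and
    //   PosteriorProb = Softmax(HLast) added for probability/KL-adaptation use.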
    RNNTYPE RnnType() { return m_rnnType; }

protected:
    ComputationNetwork<ElemType>& BuildSimpleRNN(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildClassEntropyNetwork(size_t mbSize = 1);
    ComputationNodePtr BuildLSTMComponent(unsigned long& randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, bool inputWeightSparse = false);
    ComputationNode<ElemType>* BuildDirectConnect(unsigned long& randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);
    ComputationNetwork<ElemType>& BuildLogBilinearNetworkFromDescription(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildNeuralProbNetworkFromDescription(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildLSTMNetworkFromDescription(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildCLASSLSTMNetworkFromDescription(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildLSTMInputOutputTensorNetworkFromDescription(size_t mbSize = 1);
    ComputationNetwork<ElemType>& BuildNetworkFromDbnFile(const std::wstring& dbnModelFileName)
    {
        std::string hdr, comment, name;
        int version;
        int numLayers, i;
        std::string layerType;
        unsigned long randomSeed = 1;
        ComputationNodePtr input = nullptr, w = nullptr, b = nullptr, output = nullptr, label = nullptr, prior = nullptr, scaledLogLikelihood = nullptr;
        PreComputedNode<ElemType>* pcNodePtr = nullptr;
        size_t mbSize = 3; // note: not the actual minibatch size; only used in the validation process

        File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
        if (!CheckDbnTag(fstream, "DBN\n"))
            throw std::runtime_error("Error reading DBN file - did not find expected tag DBN\n");
        fstream >> comment;
        if (!CheckDbnTag(fstream, "BDBN"))
            throw std::runtime_error("Error reading DBN file - did not find expected tag BDBN\n");
        fstream >> version >> numLayers;

        Matrix<ElemType> globalMean = ReadMatrixFromDbnFile(fstream, std::string("gmean"));
        Matrix<ElemType> globalStdDev = ReadMatrixFromDbnFile(fstream, std::string("gstddev"));
        assert(globalMean.GetNumCols() == 1);
        assert(globalStdDev.GetNumCols() == 1);

        // move to CPU since element-wise operations are expensive and can go wrong on the GPU
        int curDevId = globalStdDev.GetDeviceId();
        globalStdDev.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false);
        for (int i = 0; i < globalStdDev.GetNumRows(); i++)
            globalStdDev(i, 0) = (ElemType)1.0 / (const ElemType)globalStdDev(i, 0);
        globalStdDev.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);

        if (!CheckDbnTag(fstream, "BNET"))
            throw std::runtime_error("Error reading DBN file - did not find expected tag BNET\n");
        for (i = 0; i < numLayers; i++) // 0th index is for the input layer
        {
            fstream >> layerType;
            Matrix<ElemType> wts = ReadMatrixFromDbnFile(fstream, std::string("W"));
            Matrix<ElemType> bias = ReadMatrixFromDbnFile(fstream, std::string("a")); // remnant from pretraining, not needed
            Matrix<ElemType> A = ReadMatrixFromDbnFile(fstream, std::string("b"));
            if (wts.GetNumRows() != m_layerSizes[i + 1] || wts.GetNumCols() != m_layerSizes[i])
            {
                std::stringstream msg;
                msg << "error reading DBN file: mismatch in layer size between dbn file and config specification!" << endl;
                msg << wts.GetNumRows() << "," << wts.GetNumCols() << "!=" << m_layerSizes[i + 1] << "," << m_layerSizes[i] << endl;
                throw std::runtime_error(msg.str().c_str());
            }
            if (i == 0)
            {
                input = m_net->Input(wts.GetNumCols(), mbSize, L"features");
                m_net->FeatureNodes().push_back(input);

                size_t frameDim = globalMean.GetNumRows();
                size_t numContextFrames = wts.GetNumCols() / frameDim;
                size_t contextDim = numContextFrames * frameDim;
                Matrix<ElemType> contextMean(contextDim, 1, m_deviceId);
                Matrix<ElemType> contextStdDev(contextDim, 1, m_deviceId);

                // move to CPU since element-wise operations are expensive and can go wrong on the GPU
                contextMean.TransferFromDeviceToDevice(m_deviceId, CPUDEVICE, true, false, false);
                contextStdDev.TransferFromDeviceToDevice(m_deviceId, CPUDEVICE, true, false, false);
                for (size_t j = 0; j < frameDim; j++)
                {
                    for (size_t k = 0; k < numContextFrames; k++)
                    {
                        contextMean(j + k * frameDim, 0) = (const ElemType)globalMean(j, 0);
                        contextStdDev(j + k * frameDim, 0) = (const ElemType)globalStdDev(j, 0);
                    }
                }
                contextMean.TransferFromDeviceToDevice(CPUDEVICE, m_deviceId, true, false, false);
                contextStdDev.TransferFromDeviceToDevice(CPUDEVICE, m_deviceId, true, false, false);

                w = m_net->Mean(input, L"MeanOfFeatures");
                w->FunctionValues().SetValue(contextMean);
                w->NeedGradient() = false;
                pcNodePtr = static_cast<PreComputedNode<ElemType>*>(w);
                pcNodePtr->MarkComputed(true);

                b = m_net->InvStdDev(input, L"InvStdOfFeatures");
                b->FunctionValues().SetValue(contextStdDev);
                b->NeedGradient() = false;
                pcNodePtr = static_cast<PreComputedNode<ElemType>*>(b);
                pcNodePtr->MarkComputed(true);

                output = m_net->PerDimMeanVarNormalization(input, w, b, L"MVNormalizedFeatures");
                input = output;
            }

            wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i);
            wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i);
            wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", i);
            wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
            wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
            wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

            w = m_net->Parameter(wts.GetNumRows(), wts.GetNumCols(), nameOfW);
            w->FunctionValues().SetValue(wts);
            b = m_net->Parameter(bias.GetNumRows(), 1, nameOfB);
            b->FunctionValues().SetValue(bias);

            if (layerType != "perceptron")
            {
                output = ApplyNonlinearFunction(m_net->Plus(m_net->Times(w, input, nameOfTimes), b, nameOfPlus), i, nameOfH);
                if (m_addDropoutNodes)
                    input = m_net->Dropout(output, L"Drop" + nameOfH);
            }
            else
            {
                output = m_net->Plus(m_net->Times(w, input, nameOfTimes), b, nameOfPlus);
            }
            input = output;
        }
if (!CheckDbnTag(fstream,"ENET"))
throw std::runtime_error("Error reading DBN file - did not find expected tag ENET\n");
size_t outputLayerSize = m_layerSizes[m_layerSizes.size()-1];
label = m_net->Input(outputLayerSize, mbSize, L"labels");
if (layerType == "perceptron") // complete network
{
m_net->RenameNode(output, L"HLast");
assert(numLayers+1==m_layerSizes.size());
Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream,std::string("Pu"));
assert(priorVals.GetNumCols()==1 && priorVals.GetNumRows()==outputLayerSize);
w = m_net->Mean(label, L"Prior");
w->FunctionValues().SetValue(priorVals);
w->NeedGradient() = false;
pcNodePtr = static_cast<PreComputedNode<ElemType>*>(w);
pcNodePtr->MarkComputed(true);
}
else // pretrained network - need to add output layer, initalize
{
wstring nameOfW = msra::strfun::wstrprintf (L"W%d", i);
wstring nameOfB = msra::strfun::wstrprintf (L"B%d", i);
wstring nameOfPrevH = msra::strfun::wstrprintf (L"H%d", i);
wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf (L"H%d", i+1);
assert(numLayers+2==m_layerSizes.size());
w = m_net->Parameter(m_layerSizes[numLayers+1], m_layerSizes[numLayers], nameOfW);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = m_net->Parameter(m_layerSizes[numLayers+1], 1, nameOfB);
output = m_net->Plus(m_net->Times(w, input, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
if (m_needPrior)
{
Matrix<ElemType> zeros = Matrix<ElemType>::Zeros(m_layerSizes[numLayers+1], 1, m_deviceId);
prior = m_net->Mean(label, L"Prior");
prior->FunctionValues().SetValue(zeros);
pcNodePtr = static_cast<PreComputedNode<ElemType>*>(prior);
pcNodePtr->MarkComputed(false);
}
}
AddTrainAndEvalCriterionNodes(output, label);
if (layerType=="perceptron" || m_needPrior)
{
input = m_net->Log(pcNodePtr, L"LogOfPrior");
//following two lines is needed only if true probability is needed
//output = m_net->Softmax(output);
//output = m_net->Log(output);
scaledLogLikelihood = m_net->CreateComputationNode(MinusNode<ElemType>::TypeName(), L"ScaledLogLikelihood");
scaledLogLikelihood->AttachInputs(output, input);
m_net->OutputNodes().push_back(scaledLogLikelihood);
}
else
{
m_net->OutputNodes().push_back(output);
}
if (!CheckDbnTag(fstream,"EDBN"))
throw std::runtime_error("Error reading DBN file - did not find expected tag ENET\n");
return *m_net;
}
    // layer is 0-based
    ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L"")
    {
        ComputationNodePtr output;
        wstring nonLinearFunction = m_nonLinearFunctions[layer];
        if (nonLinearFunction == SigmoidNode<ElemType>::TypeName())
            output = m_net->Sigmoid(input, nodeName);
        else if (nonLinearFunction == RectifiedLinearNode<ElemType>::TypeName())
            output = m_net->RectifiedLinear(input, nodeName);
        else if (nonLinearFunction == TanhNode<ElemType>::TypeName())
            output = m_net->Tanh(input, nodeName);
        else if (nonLinearFunction == L"None" || nonLinearFunction == L"none" || nonLinearFunction == L"")
        {
            output = input; // linear layer
            if (nodeName != L"")
                m_net->RenameNode(output, nodeName);
        }
        else
            throw std::logic_error("Unsupported nonlinear function.");
        return output;
    }
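
    // Example: with layerTypes=Sigmoid:RectifiedLinear, layer 0 gets a Sigmoid
    // node and layer 1 a RectifiedLinear node; "None" (or an empty entry)
    // leaves the affine output linear and only renames it. A sketch
    // (plusNode is a hypothetical Plus node built by the caller):
    //
    //   output = ApplyNonlinearFunction(plusNode, /*layer=*/0, L"H1"); // -> Sigmoid node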
    void AddTrainAndEvalCriterionNodes(ComputationNodePtr input, ComputationNodePtr label, ComputationNodePtr matrix = nullptr, const std::wstring trainNodeName = L"", const std::wstring evalNodeName = L"")
    {
        m_net->LabelNodes().push_back(label);

        ComputationNodePtr output;
        ComputationNodePtr tinput = input;
        if (matrix != nullptr)
        {
            tinput = m_net->Times(matrix, input);
        }
        switch (m_trainCriterion)
        {
        case TrainingCriterion::CrossEntropyWithSoftmax:
            output = m_net->CrossEntropyWithSoftmax(label, tinput, (trainNodeName == L"") ? L"CrossEntropyWithSoftmax" : trainNodeName);
            break;
        case TrainingCriterion::SquareError:
            output = m_net->SquareError(label, tinput, (trainNodeName == L"") ? L"SquareError" : trainNodeName);
            break;
        case TrainingCriterion::ClassCrossEntropyWithSoftmax:
            output = m_net->ClassCrossEntropyWithSoftmax(label, input, matrix, (trainNodeName == L"") ? L"ClassCrossEntropyWithSoftmax" : trainNodeName);
            break;
        default:
            throw std::logic_error("Unsupported training criterion.");
        }
        m_net->FinalCriterionNodes().push_back(output);

        // add a separate evaluation node only when it differs from the training criterion
        if (!((m_evalCriterion == EvalCriterion::CrossEntropyWithSoftmax && m_trainCriterion == TrainingCriterion::CrossEntropyWithSoftmax) ||
              (m_evalCriterion == EvalCriterion::SquareError && m_trainCriterion == TrainingCriterion::SquareError) ||
              (m_evalCriterion == EvalCriterion::ClassCrossEntropyWithSoftmax && m_trainCriterion == TrainingCriterion::ClassCrossEntropyWithSoftmax)))
        {
            switch (m_evalCriterion)
            {
            case EvalCriterion::CrossEntropyWithSoftmax:
                output = m_net->CrossEntropyWithSoftmax(label, tinput, (evalNodeName == L"") ? L"CrossEntropyWithSoftmax" : evalNodeName);
                break;
            case EvalCriterion::ClassCrossEntropyWithSoftmax:
                output = m_net->ClassCrossEntropyWithSoftmax(label, input, matrix, (evalNodeName == L"") ? L"ClassCrossEntropyWithSoftmax" : evalNodeName);
                break;
            case EvalCriterion::SquareError:
                output = m_net->SquareError(label, tinput, (evalNodeName == L"") ? L"SquareError" : evalNodeName);
                break;
            case EvalCriterion::ErrorPrediction:
                output = m_net->ErrorPrediction(label, tinput, (evalNodeName == L"") ? L"ErrorPrediction" : evalNodeName);
                break;
            default:
                throw std::logic_error("Unsupported evaluation criterion.");
            }
        }
        m_net->EvaluationNodes().push_back(output);
    }
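
    // Example: trainingCriterion=CrossEntropyWithSoftmax together with
    // evalCriterion=ErrorPrediction yields two distinct nodes; if the training
    // and eval criteria match, the single training node doubles as the
    // evaluation node and no duplicate is built.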
    Matrix<ElemType> ReadMatrixFromDbnFile(File& fstream, const std::string expectedName)
    {
        int numRows, numCols;
        std::string name;
        if (!CheckDbnTag(fstream, "BMAT"))
            throw std::runtime_error("Error reading DBN file - did not find expected tag BMAT\n");
        //fstream.GetMarker(FileMarker::fileMarkerBeginSection, "BMAT");
        fstream >> name >> numRows >> numCols;
        if (name != expectedName)
        {
            throw std::invalid_argument(msra::strfun::strprintf("ERROR reading pretrained DBN file, expected name %s, found name %s\n", expectedName.c_str(), name.c_str()));
        }
        if (numCols > 1) // transpose W, because the DBN file stores it that way
        {
            int origRows = numRows;
            numRows = numCols;
            numCols = origRows;
        }
        Matrix<ElemType> mat(numRows, numCols, m_deviceId);

        // dbn operates on row vectors, not column vectors (x*W + b), so read W in as W'
        //ElemType* d_array = new ElemType[numRows*numCols];
        float tmp;
        for (long i = 0; i < numRows; i++)
            for (long j = 0; j < numCols; j++)
            {
                fstream >> tmp;
                mat(i, j) = tmp;
                //d_array[i] = (ElemType)tmp;
            }
        if (!CheckDbnTag(fstream, "EMAT"))
            throw std::runtime_error("Error reading DBN file - did not find expected tag EMAT\n");
        //fstream.GetMarker(FileMarker::fileMarkerBeginSection, "EMAT");
        return mat;
    }
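
    // Record layout consumed above (a sketch inferred from the reads):
    //
    //   "BMAT" <name> <numRows> <numCols> <numRows*numCols float values> "EMAT"
    //
    // Matrices wider than one column are stored transposed because the DBN
    // format computes x*W + b on row vectors, hence the dimension swap.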
    bool CheckDbnTag(File& fstream, const std::string expectedTag)
    {
        char tag[5];
        for (int i = 0; i < 4; i++)
            fstream >> tag[i];
        tag[4] = 0;
        return std::string(tag) == expectedTag;
    }
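
    // Overall tag sequence expected in a pretrained DBN file, reconstructed
    // from the checks in BuildNetworkFromDbnFile above:
    //
    //   "DBN\n" <comment> "BDBN" <version> <numLayers> gmean gstddev
    //   "BNET" { <layerType> W a b } x numLayers "ENET" [Pu if perceptron] "EDBN"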
protected:
    ComputationNetwork<ElemType>* m_net;

    intargvector m_layerSizes;
    bool m_applyMeanVarNorm;
    bool m_needPrior;
    DEVICEID_TYPE m_deviceId;
    bool m_uniformInit;
    ElemType m_initValueScale;
    bool m_addDropoutNodes;
    stringargvector m_nonLinearFunctions;
    TrainingCriterion m_trainCriterion;
    EvalCriterion m_evalCriterion;

    intargvector m_directConnect; /// connect those layers directly in sequence order;
                                  /// for example, 1:2:3 connects layer 1 to 2 and layer 2 to 3

    /// recurrent network
    intargvector m_recurrentLayers;
    float m_defaultHiddenActivity;
    RNNTYPE m_rnnType;
    int m_maOrder; /// MA model order

    bool m_constForgetGateValue;
    bool m_constInputGateValue;
    bool m_constOutputGateValue;
    ElemType m_forgetGateInitVal;
    ElemType m_inputGateInitVal;
    ElemType m_outputGateInitVal;

    int m_lookupTableOrder;
    int m_labelEmbeddingSize;
};
}}}