// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // #pragma once #include "Basics.h" #include "Matrix.h" #include "BestGpu.h" #include "ComputationNetwork.h" #include "Config.h" // TODO: giving up moving stuff for now, running out of time. The following #includes should not be necessary once the hard-working code in here gets moved to .cpp #include "InputAndParamNodes.h" #include #include #include #pragma warning(disable : 4661) using namespace std; // TODO: ugh! /// this is for sparse input, useful when input dimension is very large and sparse such as language modeling /// to-do: need to use it guided by argument #define SPARSE_INPUT namespace Microsoft { namespace MSR { namespace CNTK { #define MAX_DEPTH 20 // the standard network kinds that can be built with SimpleNetworkBuilder enum StandardNetworkKind { // basic FFDNNKind = 0, // basic feed-forward RNNKind = 1, // basic RNN LSTMKind = 2, // basic LSTM // class-based ClassEntropyRNNKind = 8, // class-based RNN ClassLSTMNetworkKind = 64, // class-based LSTM // advanced LogBilinearNetworkKind = 16, // log-bilinear model for language modeling DNNLMNetworkKind = 32, // DNN-based LM NCELSTMNetworkKind = 128, // NCE LSTM ConditionalLSTMNetworkKind = 256, // conditional LM for text generation CRFLSTMNetworkKind = 512, // sequential LSTM }; enum class TrainingCriterion : int // TODO: camel-case these { CrossEntropyWithSoftmax, CrossEntropy, SquareError, Logistic, ClassCrossEntropyWithSoftmax, NCECrossEntropyWithSoftmax, CRF, SequenceWithSoftmax }; enum class EvalCriterion : int { CrossEntropyWithSoftmax, CrossEntropy, SquareError, Logistic, ClassificationError, ClassCrossEntropyWithSoftmax, NCECrossEntropyWithSoftmax, CRF, SequenceWithSoftmax }; TrainingCriterion ParseTrainingCriterionString(wstring s); EvalCriterion ParseEvalCriterionString(wstring s); template class SimpleNetworkBuilder { protected: typedef shared_ptr> ComputationNodePtr; private: SimpleNetworkBuilder() // disable default constructor from being called { } public: SimpleNetworkBuilder(const ConfigParameters& config) : m_net(nullptr) { Init(config); } SimpleNetworkBuilder(const ScriptableObjects::IConfigRecord&) { NOT_IMPLEMENTED; } // full parameter Init routine void Init(const intargvector& layerSizes, const TrainingCriterion trainCriterion, const EvalCriterion evalCriterion, DEVICEID_TYPE deviceId, int outputLayerSize = -1, const stringargvector nonLinearFunctions = L"Sigmoid", const bool addDropoutNodes = false, const bool uniformInit = true, const ElemType initValueScale = 1.0f, const bool applyMeanVarNorm = false, bool needPrior = false) { m_deviceId = deviceId; m_net = make_shared(m_deviceId); if (m_deviceId < 0) fprintf(stderr, "SimpleNetworkBuilder Using CPU\n"); else fprintf(stderr, "SimpleNetworkBuilder Using GPU %d\n", m_deviceId); m_outputLayerSize = outputLayerSize; m_layerSizes = layerSizes; m_applyMeanVarNorm = applyMeanVarNorm; m_trainCriterion = trainCriterion; m_evalCriterion = evalCriterion; m_addDropoutNodes = addDropoutNodes; m_needPrior = needPrior; m_nonLinearFunctions = nonLinearFunctions; m_uniformInit = uniformInit; m_initValueScale = initValueScale; if (m_layerSizes.size() < 2) InvalidArgument("A network should have at least two layers (one input and one output)"); } void InitAttentionNetworkConfig(const ConfigParameters& config) { m_auxFeatDim = config("auxfeatdim", "20"); } virtual void InitRecurrentConfig(const ConfigParameters& config) { ConfigArray rLayerSizes = config("recurrentLayer", ""); intargvector recurrentLayers = rLayerSizes; m_recurrentLayers = recurrentLayers; m_defaultHiddenActivity = config("defaultHiddenActivity", "0.1"); // TODO: spelling, should be -Activation ConfigArray str_rnnType = config("rnnType", L"SIMPLENET"); // TODO: camelCase m_maOrder = config("maOrder", "0"); m_lookupTableOrder = config("lookupTableOrder", "0"); // TODO: What is this? ConfigArray sSizes = config("streamSizes", ""); m_streamSizes = sSizes; sSizes = config("lookupTableOrderSizes", ""); // this allows having a multiple streams of inputs with // different lookuptable order sizes. the older one lookupTableOrder is still kept to have backward // support. m_lookupTabelOrderSizes = sSizes; m_labelEmbeddingSize = config("labelEmbeddingSize", "10"); m_constForgetGateValue = config("constForgetGateValue", "false"); m_constInputGateValue = config("constInputGateValue", "false"); m_constOutputGateValue = config("constOutputGateValue", "false"); m_forgetGateInitVal = config("forgetGateInitVal", "-1"); m_inputGateInitVal = config("inputGateInitVal", "-1"); m_outputGateInitVal = config("outputGateInitVal", "-1"); m_sparse_input = config("sparseinput", "false"); // TODO: use EqualCI(), and use camelCase, e.g. classLSTM stringargvector strType = str_rnnType; if (std::find(strType.begin(), strType.end(), L"SIMPLENET") != strType.end()) // TODO: camelCase m_standardNetworkKind = FFDNNKind; else if (std::find(strType.begin(), strType.end(), L"SIMPLERNN") != strType.end()) // TODO: camelCase m_standardNetworkKind = RNNKind; else if (std::find(strType.begin(), strType.end(), L"LSTM") != strType.end()) m_standardNetworkKind = LSTMKind; else if (std::find(strType.begin(), strType.end(), L"CLASSLM") != strType.end()) // TODO: camelCase m_standardNetworkKind = ClassEntropyRNNKind; else if (std::find(strType.begin(), strType.end(), L"LBLM") != strType.end()) m_standardNetworkKind = LogBilinearNetworkKind; else if (std::find(strType.begin(), strType.end(), L"NPLM") != strType.end()) m_standardNetworkKind = DNNLMNetworkKind; else if (std::find(strType.begin(), strType.end(), L"CLASSLSTM") != strType.end()) // TODO: camelCase m_standardNetworkKind = ClassLSTMNetworkKind; else if (std::find(strType.begin(), strType.end(), L"NCELSTM") != strType.end()) m_standardNetworkKind = NCELSTMNetworkKind; else if (std::find(strType.begin(), strType.end(), L"CLSTM") != strType.end()) m_standardNetworkKind = ConditionalLSTMNetworkKind; else if (std::find(strType.begin(), strType.end(), L"CRF") != strType.end()) m_standardNetworkKind = CRFLSTMNetworkKind; else InvalidArgument("InitRecurrentConfig: unknown value for rnnType parameter '%ls'", strType[0].c_str()); } // Init - Builder Initialize for multiple data sets // config - [in] configuration parameters for the network builder virtual void Init(const ConfigParameters& config) { DEVICEID_TYPE deviceId = DeviceFromConfig(config); ElemType initValueScale = config("initValueScale", "1.0"); ConfigArray layerTypes = config("layerTypes", L"Sigmoid"); // TODO: camelCase stringargvector nonlinearFunctions = layerTypes; bool uniformInit = config("uniformInit", "true"); bool applyMeanVarNorm = config("applyMeanVarNorm", "false"); bool needPrior = config("needPrior", "false"); bool addDropoutNodes = config("addDropoutNodes", "false"); int outputLayerSize; ConfigArray layerSizes; intargvector layers; TrainingCriterion trainingCriterion; EvalCriterion evalCriterion; outputLayerSize = config("outputLayerSize", "-1"); layerSizes = config("layerSizes", "100"); layers = layerSizes; trainingCriterion = ParseTrainingCriterionString(config("trainingCriterion")); evalCriterion = ParseEvalCriterionString(config("evalCriterion")); ConfigArray rDirect = config("directConnect", ""); m_directConnect = rDirect; m_word2class = config("word2cls", ""); m_cls2index = config("cls2index", ""); m_vocabSize = (int) config("vocabSize", "-1"); m_nbrCls = (int) config("nbrClass", "-1"); nce_noises = (int) config("noise_number", "-1"); // nce noise Init(layers, trainingCriterion, evalCriterion, deviceId, outputLayerSize, nonlinearFunctions, addDropoutNodes, uniformInit, initValueScale, applyMeanVarNorm, needPrior); InitRecurrentConfig(config); InitAttentionNetworkConfig(config); } ComputationNetworkPtr BuildNetworkFromDescription(); ComputationNetworkPtr BuildNetworkFromDbnFile(const std::wstring& dbnModelFileName); // legacy support for fseide's Microsoft-internal tool "DBN.exe" protected: ComputationNetworkPtr BuildFFDNNFromDescription(); ComputationNetworkPtr BuildRNNFromDescription(); ComputationNetworkPtr BuildClassEntropyRNNFromDescription(); ComputationNetworkPtr BuildLogBilinearNetworkFromDescription(); ComputationNetworkPtr BuildDNNLMNetworkFromDescription(); ComputationNetworkPtr BuildLSTMNetworkFromDescription(); #ifdef COMING_SOON ComputationNetworkPtr BuildCRFLSTMNetworkFromDescription(); #endif ComputationNetworkPtr BuildClassLSTMNetworkFromDescription(); ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription(); ComputationNetworkPtr BuildNCELSTMNetworkFromDescription(); // mulitply used components ComputationNodePtr BuildLSTMComponent(unsigned long& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input); ComputationNodePtr BuildLSTMNodeComponent(ULONG& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input); ComputationNodePtr BuildDirectConnect(unsigned long& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode); // layer is 0 based ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L""); ComputationNodePtr AddTrainAndEvalCriterionNodes(ComputationNodePtr input, ComputationNodePtr label, ComputationNodePtr matrix = nullptr, const std::wstring trainNodeName = L"", const std::wstring evalNodeName = L"", ComputationNodePtr clspostprob = nullptr, ComputationNodePtr trans = nullptr); static bool CheckDbnTag(File& fstream, const std::string expectedTag) { char tag[5]; for (int i = 0; i < 4; i++) fstream >> tag[i]; tag[4] = 0; return std::string(tag) == expectedTag; } Matrix ReadMatrixFromDbnFile(File& fstream, const std::string expectedName) { int numRows, numCols; std::string name; if (!CheckDbnTag(fstream, "BMAT")) RuntimeError("Error reading DBN file - did not find expected tag BMAT\n"); // fstream.GetMarker(FileMarker::fileMarkerBeginSection, "BMAT"); fstream >> name >> numRows >> numCols; if (name != expectedName) { InvalidArgument("ERROR reading pretrained DBN file, expected name %s, found name %s\n", expectedName.c_str(), name.c_str()); } if (numCols > 1) // transpose W because dbn stores that way apparently { int origRows = numRows; numRows = numCols; numCols = origRows; } Matrix mat(numRows, numCols, m_deviceId); // dbn operates on row vectors not column vectors. x*W + b, so need to read in as W' // ElemType* d_array = new ElemType[numRows*numCols]; float tmp; for (long i = 0; i < numRows; i++) for (long j = 0; j < numCols; j++) { fstream >> tmp; mat(i, j) = tmp; // d_array[i] = (ElemType)tmp; } if (!CheckDbnTag(fstream, "EMAT")) RuntimeError("Error reading DBN file - did not find expected tag EMAT\n"); // fstream.GetMarker(FileMarker::fileMarkerBeginSection, "EMAT"); return mat; } protected: ComputationNetworkPtr m_net; int m_outputLayerSize; intargvector m_layerSizes; bool m_applyMeanVarNorm; bool m_needPrior; DEVICEID_TYPE m_deviceId; bool m_uniformInit; ElemType m_initValueScale; bool m_addDropoutNodes; stringargvector m_nonLinearFunctions; TrainingCriterion m_trainCriterion; EvalCriterion m_evalCriterion; intargvector m_directConnect; // connect those layers directly in a sequence order // for example: 1:2:3 will connect 1 to 2 and then 2 to 3 // recurrent network intargvector m_recurrentLayers; float m_defaultHiddenActivity; StandardNetworkKind m_standardNetworkKind; int m_maOrder; // MA model order bool m_constForgetGateValue; bool m_constInputGateValue; bool m_constOutputGateValue; ElemType m_forgetGateInitVal; ElemType m_inputGateInitVal; ElemType m_outputGateInitVal; intargvector m_streamSizes; // for multiple stream data intargvector m_lookupTabelOrderSizes; // each stream has its own projection, so need to provide with the lookup table order size for each stream int m_lookupTableOrder; int m_labelEmbeddingSize; // these are the file names for word 2 class mapping and class to word index mapping // these are used for class-based language modeling string m_cls2index; string m_word2class; int m_nbrCls; // number of classes int m_vocabSize; // vocabulary size int nce_noises; bool m_sparse_input; /** for attention network development */ size_t m_auxFeatDim; }; } } }