diff --git a/Makefile b/Makefile index f1f95c9b3..a730f41dc 100644 --- a/Makefile +++ b/Makefile @@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug") CXXFLAGS += -g LDFLAGS += -rdynamic CPPFLAGS += -D_DEBUG - CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS) + CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS) endif ifeq ("$(BUILDTYPE)","release") diff --git a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp index a0a3bf769..2a707b831 100644 --- a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp +++ b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp @@ -47,7 +47,7 @@ using namespace std; L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n" L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n" // TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation. - L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, numSteps=1, insertedDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n" + L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n" L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n" L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n" L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n" diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c7..600f0ae06 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -345,6 +345,9 @@ void PrintBuiltInfo() #ifdef _CUB_PATH_ fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_); #endif +#ifdef _CUDNN_PATH_ + fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_); +#endif #ifdef _GIT_EXIST fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); @@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i RedirectStdErr(logpath); } - PrintBuiltInfo(); + PrintBuiltInfo(); // this one goes to log file std::string timestamp = TimeDateStamp(); // dump config info @@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i // main wrapper that catches C++ exceptions and prints them // --------------------------------------------------------------------------- -int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions +int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions { try { + PrintBuiltInfo(); // print build info directly in case the user provides no arguments (convenient for checking build type) if (argc <= 1) InvalidArgument("No command-line argument given."); // detect legacy CNTK configuration @@ -684,6 +688,8 @@ void 
terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr); abort(); } int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions { set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating + _set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing + // Note: this does not seem to work--processes with this seem to just hang instead of terminating __try { diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index 9b1ec6fa1..612b96e63 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -100,7 +100,7 @@ template void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigParamList& params) { std::string name = p_name; - if (EqualInsensitive(name, "CreateModel")) //create a blank model + if (EqualInsensitive(name, "CreateModel")) // create a blank model { size_t numFixedParams = 0, numOptionalParams = 0; if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams) @@ -109,7 +109,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa auto cn = make_shared<ComputationNetwork>(CPUDEVICE); OverrideModelNameAndSetDefaultModel(cn); } - if (EqualInsensitive(name, "CreateModelWithName")) //create a blank model + if (EqualInsensitive(name, "CreateModelWithName")) // create a blank model { size_t numFixedParams = 1, numOptionalParams = 0; if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams) @@ -139,6 +139,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams); auto cn = make_shared<ComputationNetwork>(CPUDEVICE); +#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them + if (modelFormat == L"cntk_legacy_no_tensorlib") + { + cn->Read<ElemType>(params[1]); + for (auto node : cn->FeatureNodes()) + node->SetDims(TensorShape(node->GetNumRows()), 0); // pre-tensorlib InputValues had incorrect tensor dimensions + cn->CompileNetwork(); + } + else +#endif cn->Load<ElemType>(params[1]); OverrideModelNameAndSetDefaultModel(cn, params[0]); } @@ -189,8 +199,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa // validate the network before we save it out ProcessNDLScript(m_netNdlDefault, ndlPassAll, true); - - cn->Save(fileName); + cn->SaveEdited(fileName); } else if (EqualInsensitive(name, "SaveModel")) { @@ -209,7 +218,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa // validate and finish the second pass through NDL if any in-line NDL was defined ProcessNDLScript(netNdl, ndlPassAll, true); - netNdl->cn->Save(fileName); + netNdl->cn->SaveEdited(fileName); } else if (EqualInsensitive(name, "SetDefaultModel")) { diff --git a/Source/CNTK/ModelEditLanguage.h b/Source/CNTK/ModelEditLanguage.h index 117470b80..538922692 100644 --- a/Source/CNTK/ModelEditLanguage.h +++ b/Source/CNTK/ModelEditLanguage.h @@ -443,6 +443,10 @@ public: { modelFormat = L"cntk"; } + else if (EqualInsensitive(value, "cntk_legacy_no_tensorlib")) // model of late 2015 which had a bug in setting InputValue's tensor dimensions + { + modelFormat = L"cntk_legacy_no_tensorlib"; + } else { RuntimeError("Invalid optional parameter value %s, valid values are: format=(cntk)", value.c_str()); diff --git a/Source/CNTK/SimpleNetworkBuilder.cpp b/Source/CNTK/SimpleNetworkBuilder.cpp index 6ab9a5204..bf76efded 100644 --- 
a/Source/CNTK/SimpleNetworkBuilder.cpp +++ b/Source/CNTK/SimpleNetworkBuilder.cpp @@ -2423,9 +2423,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu")); assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize); - w = builder.Mean(label, L"Prior"); - static_pointer_cast<PreComputedNode<ElemType>>(w)->SideLoadFromMatrix(priorVals); - w->SetParameterUpdateRequired(false); + prior = builder.Mean(label, L"Prior"); + static_pointer_cast<PreComputedNode<ElemType>>(prior)->SideLoadFromMatrix(priorVals); + prior->SetParameterUpdateRequired(false); } else // pretrained network - need to add output layer, initialize { @@ -2465,7 +2465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (layerType == "perceptron" || m_needPrior) { - input = builder.Log(pcNodePtr, L"LogOfPrior"); + input = builder.Log(prior, L"LogOfPrior"); // the following two lines are needed only if the true probability is needed //output = builder.Softmax(output); diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat index 9f841d104..12631cf52 100644 --- a/Source/CNTK/prebuild.bat +++ b/Source/CNTK/prebuild.bat @@ -33,6 +33,16 @@ if "%cuda_path%" == "" ( echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$ ) +if not "%cudnn_path%" == "" ( + echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$ + ) + +if not "%cub_path%" == "" ( + echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$ + ) + + + echo #endif >> buildinfo.h$$ ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index c70862369..f3af694fb 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -84,6 +84,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ptrdiff_t tBegin; // first time index in this minibatch. Note that this may be negative if the sequence started before this MB. size_t tEnd; // end = first frame index after final frame. May be beyond the minibatch if the real sequence is longer than the MB. bool operator==(const SequenceInfo & other) const { return seqId == other.seqId && s == other.s && tBegin == other.tBegin && tEnd == other.tEnd; } + size_t GetNumTimeSteps() const { return (size_t)(tEnd - tBegin); } }; // ------------------------------------------------------------------- @@ -270,6 +271,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { // I'd love to start with all-gaps, but that would require setting flags upfront, and then clearing them. 
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); } + // find a sequence by its id + const SequenceInfo & FindSequence(UniqueSequenceId seqId) const + { + for (const auto & seqInfo : m_sequences) + if (seqInfo.seqId == seqId) + return seqInfo; + LogicError("FindSequence: Requested sequence (id %u) not found.", (unsigned int) seqId); + } + // ------------------------------------------------------------------- // inquire about gaps or boundaries // ------------------------------------------------------------------- @@ -427,6 +437,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: // TODO: make private (currently used from masking and DataFor) ; TODO: rename all members with m_ prefix size_t timeIdxInSeq; // start frame; SIZE_MAX = all frames in MB ptrdiff_t m_timeOffset; // this is added to timeIdxInSeq wherever it is used + size_t m_timeRange; // use this to describe a custom range > 1 frame size_t seqIndex; // parallel-sequence index; SIZE_MAX = all sequences in MB (most common case) --TODO: Bad name, 'sequence' and 'parallel sequence' are two different things MBLayoutPtr m_pMBLayout; // layout associated with this bool m_broadcastAllowed; // frame range may be broadcast from outer layout (e.g. a matrix with NULL layout and 1 column is acceptable to this frame range) @@ -434,7 +445,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: // can construct from a single size_t -> a single-frame range - FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {} + FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), m_timeRange(1), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {} // or without arguments -> entire minibatch / no frame-range FrameRange(MBLayoutPtr pMBLayout) : FrameRange(pMBLayout, SIZE_MAX) {} @@ -471,7 +482,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // create a FrameRange with a time offset - // Note: This currently does not work in conjunction with IsAllFrames(). This would be a nice-to have, but tricky w.r.t. out-of-bounds accesses. + // If IsAllFrames() then this will cause out-of-bounds slices. 
FrameRange WithTimeOffset(ptrdiff_t offset) const { FrameRange ret = *this; @@ -479,6 +490,24 @@ return ret; } + // create a FrameRange with a time range > 1 + FrameRange WithTimeRange(size_t range) const + { + FrameRange ret = *this; + if (!IsAllFrames()) + ret.m_timeRange = range; + return ret; + } + + // dimension we are iterating over; -1 means time dimension; 0 means no layout + int GetIterationDimension() const + { + if (!m_pMBLayout) + return 0; + else + return -1; // TODO: allow user to specify other dimensions + } + class IndexIteration // range for range-based for over sequences { size_t m_beginIndex, m_endIndex; @@ -753,7 +782,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (startColumn >= numCols) LogicError("DataFor: FrameRange specifies a time index that is out of range."); if (fr.seqIndex == SIZE_MAX) - return std::pair<size_t, size_t>(startColumn, numParallelSequences); + return std::pair<size_t, size_t>(startColumn, numParallelSequences * fr.m_timeRange); + else if (fr.m_timeRange != 1) + LogicError("DataFor: FrameRange only supports per-sequence time ranges with tensor slices, not matrix slices."); else return std::pair<size_t, size_t>(startColumn + fr.seqIndex, 1); } @@ -778,7 +809,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TensorSliceWithMBLayoutFor() -- Return tensor slice for a FrameRange with specified number of columns with a given MBLayout // This implements the logic of interpreting the FrameRange object. // Unlike the matrix version above, this supports iteration indices other than time. - // TODO: This ^^. Still missing is a field to identify the index. + // TODO: This ^^. FrameRange is still missing a field to identify the index. + // This function happily returns tensor bounds that are out of bounds, assuming the caller will do the right thing. // ----------------------------------------------------------------------- template <class DimensionVector> // e.g. std::vector<size_t> or SmallVector<ptrdiff_t> @@ -787,6 +819,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const MBLayoutPtr & pMBLayout/*the MB layout of 'data'*/) { std::pair<DimensionVector, DimensionVector> result; + typedef decltype(result.first[0]) ElemType; // this creates a slice for the entire matrix, which we will then narrow down result.first.resize(shape.size(), 0); @@ -795,8 +828,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { // get position of time and sequence index // These are only valid if we have a layout. // In the future, the 'timeDim' will be identified by the FrameRange. + int iterDimParam = fr.GetIterationDimension(); + size_t iterDim = iterDimParam > 0 ? iterDimParam - 1/*regular dimensions are specified as 1-based*/ : shape.size() + iterDimParam/*-1 for time dimension*/; size_t sequenceDim = shape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. - size_t timeDim = sequenceDim + 1; // TODO: Get this from the FrameRange object. // MBLayout of data and of FrameRange must be identical pointers, // or in case of broadcasting, respective parent pointers. @@ -819,28 +853,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { // but as a reference (e.g. it cannot be resized) else if (!pMBLayout || fr.IsAllFrames()) { - if (fr.m_timeOffset != 0) // entire minibatch with non-zero offset exceeds bounds on at least one side - LogicError("DataFor: Iteration offset must not be specified for FrameRanges that reference the entire minibatch."); - // TODO: Can we allow this? Semantics would be different, it would crop frames outside. 
+ if (fr.m_timeOffset) + { + if (iterDim >= result.first.size()) + LogicError("DataFor: Time offset cannot be applied to tensors that have no time dimension."); + result.first[iterDim] += (ElemType)fr.m_timeOffset; // Note: If we have an offset, this is guaranteed to yield a slice that is out of bounds. + result.second[iterDim] += (ElemType)fr.m_timeOffset; + if (result.first[iterDim] > result.second[iterDim]) + LogicError("DataFor: Numeric wraparound. You used a size_t vector where an int vector would be needed."); + } } // FrameRange refers to a time slice -> return that - else if (result.second[timeDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index) + else if (result.second[iterDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index) { - size_t t = fr.timeIdxInSeq + fr.m_timeOffset; - if (t >= result.second[timeDim]) - LogicError("DataFor: FrameRange specifies an iteration index that is out of range."); - result.first[timeDim] = t; - result.second[timeDim] = t + 1; + size_t ts = fr.timeIdxInSeq + fr.m_timeOffset; + size_t te = ts + fr.m_timeRange; + result.first[iterDim] = (ElemType)ts; + result.second[iterDim] = (ElemType)te; } - + // sequence index if (fr.seqIndex != SIZE_MAX/*sequence requested*/ && pMBLayout/*have sequences*/ && result.second[sequenceDim] > 1/*>1 sequence (not broadcasting)*/) { size_t s = fr.seqIndex; if (s >= result.second[sequenceDim]) LogicError("DataFor: FrameRange specifies a parallel-sequence index that is out of range."); - result.first[sequenceDim] = s; - result.second[sequenceDim] = s + 1; + result.first[sequenceDim] = (ElemType)s; + result.second[sequenceDim] = (ElemType)s + 1; } return result; diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h index 49e563c82..bedecbce3 100644 --- a/Source/Common/Include/TensorShape.h +++ b/Source/Common/Include/TensorShape.h @@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void resize(size_t sz, const T & val) { if (sz < m_size) m_size = sz; else while (m_size < sz) push_back(val); } void assign(size_t sz, const T & val) { clear(); resize(sz, val); } template <typename ITER> - void append(ITER beg, const ITER & end) { while (beg != end) push_back(*beg++); } + void append(ITER beg, const ITER & end) { while (beg != end) push_back((T)*beg++); } // typecast allows signed/unsigned conversions template <typename ITER> void assign(ITER beg, const ITER & end) { clear(); append(beg,end); } void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); } @@ -180,8 +180,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // boilerplate bool operator==(const TensorShape & other) const { return m_dims == other.m_dims; } - void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable. 
- // verify that this refers to a dense matrix (no strides) void VerifyIsDense() const { @@ -374,7 +372,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (size() != bounds.first.size() || size() != bounds.second.size()) LogicError("NarrowedTo: Bounds parameter must have same rank as tensor."); for (size_t k = 0; k < size(); k++) - if (bounds.second[k] <= bounds.first[k] || bounds.second[k] > m_dims[k]) + if (bounds.second[k] <= bounds.first[k] || (size_t)bounds.second[k] > m_dims[k]) LogicError("NarrowedTo: Invalid bounds parameter, dimensions must be at least one."); for (size_t k = 0; k < size(); k++) { diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h index d1411396d..ca489ad5c 100644 --- a/Source/Common/Include/latticearchive.h +++ b/Source/Common/Include/latticearchive.h @@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b // =========================================================================== class lattice { + mutable int verbosity; struct header_v1_v2 { size_t numnodes : 32; @@ -567,11 +568,13 @@ private: std::vector backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time std::vector backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer size_t numofstates; // per sil hmm + int verbosity; public: - backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0) + backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0) { size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/) size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it + backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer const size_t silUnitId = hset.gethmmid ("sil"); numofstates = hset.gethmm (silUnitId).getnumstates(); @@ -595,15 +598,18 @@ private: #if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is) if (numsilunits > 1) { - fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); - fprintf (stderr, "alignments: :"); - foreach_index (a, aligntokens) + if (verbosity) { - const auto & unit = aligntokens[a]; - const auto & hmm = hset.gethmm (unit.unit); - fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); + fprintf(stderr, "alignments: :"); + foreach_index(a, aligntokens) + { + const auto & unit = aligntokens[a]; + const auto & hmm = hset.gethmm(unit.unit); + fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + } + fprintf(stderr, "\n"); } - fprintf (stderr, "\n"); } #endif if (numsilunits > 0) @@ -611,7 +617,8 @@ private: backptrbufsize += maxsilframes * numofstates; } backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed) - fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); + if (verbosity) + fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); } // CUDA support const std::vector & getbackptroffsets() const { return 
backptroffsets; } @@ -1002,6 +1009,10 @@ public: std::wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages) const wchar_t * getkey() const { return key.c_str(); } + + void setverbosity(int veb) const { + verbosity = veb; + } }; // =========================================================================== @@ -1016,6 +1027,8 @@ class archive // set of lattice archive files referenced // Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files. std::vector<std::wstring> archivepaths; // [archiveindex] -> archive path + std::wstring prefixPathInToc; // prefix path in a TOC; used to avoid having to pushd to some path before starting training + mutable int verbosity; size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed { auto iter = std::find (archivepaths.begin(), archivepaths.end(), path); @@ -1042,7 +1055,8 @@ class archive { // need to read the map and establish the mapping // get the symlist file const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist"; - fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); + if (verbosity > 0) + fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); std::vector<char> textbuffer; auto lines = msra::files::fgetfilelines (symlistpath, textbuffer); // establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found @@ -1092,19 +1106,25 @@ class archive public: // construct = open the archive //archive() : currentarchiveindex (SIZE_MAX) {} - + void setverbosity(int veb) const + { + verbosity = veb; + } // test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode) bool empty() const { return archivepaths.empty(); } // construct from a list of TOC files - archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string, size_t> & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap) + archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string, size_t> & modelsymmap, const std::wstring prefixPath=L"") + : currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0) { if (tocpaths.empty()) // nothing to read--keep silent return; fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str()); + size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size() / 100 : 1; foreach_index (i, tocpaths) { - fprintf (stderr, "."); + if ((i % onepercentage) == 0) + fprintf (stderr, "."); open (tocpaths[i]); } fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size()); @@ -1135,7 +1155,11 @@ public: RuntimeError("open: invalid TOC line (no [): %s", line); if (q != p) { - const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + if (!prefixPathInToc.empty()) + { + archivepath = prefixPathInToc + L"/" + archivepath; + } // TODO: should we allow paths relative to TOC file? 
archiveindex = getarchiveindex (archivepath); } @@ -1207,6 +1231,7 @@ public: fsetpos (f, offset); // get it L.fread (f, idmap, spunit); + L.setverbosity(verbosity); #ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice const size_t silunit = getid (modelsymmap, "sil"); const bool addsp = true; diff --git a/Source/Common/Include/latticesource.h b/Source/Common/Include/latticesource.h index fcf046b68..0ec12508e 100644 --- a/Source/Common/Include/latticesource.h +++ b/Source/Common/Include/latticesource.h @@ -23,10 +23,11 @@ public: class latticesource { const msra::lattices::archive numlattices, denlattices; + int verbosity; public: typedef msra::dbn::latticepair latticepair; - latticesource (std::pair<std::vector<std::wstring>, std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string, size_t> & modelsymmap) - : numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {} + latticesource (std::pair<std::vector<std::wstring>, std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string, size_t> & modelsymmap, std::wstring RootPathInToc) + : numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {} bool empty() const { @@ -52,6 +53,12 @@ public: denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object L = LP; } + + void setverbosity(int veb) + { + verbosity = veb; + numlattices.setverbosity(veb); denlattices.setverbosity(veb); + } }; }} \ No newline at end of file diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index f8f79dc21..8400407b7 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -296,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; + SetDims(TensorShape(value.GetNumRows()), value.GetNumCols()); } public: bool m_hasComputed; diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8da7ba6c7..75c989bc5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -62,6 +62,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // break cycles // BUGBUG: This only works if nodes are not shared across networks. // Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard. + // Or just use weak ptrs. for (auto & iter : m_nameToNodeMap) iter.second->DetachInputs(); @@ -74,8 +75,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { // serialization // ----------------------------------------------------------------------- + // for saving a model after editing--the network is possibly not validated/compiled + void ComputationNetwork::SaveEdited(const wstring& fileName, const FileOptions fileFormat) + { + if (!IsCompiled()) + CompileNetwork(); + Save(fileName, fileFormat); + } + void ComputationNetwork::Save(const wstring& fileName, const FileOptions fileFormat) const { + VerifyIsCompiled("Save"); // In case of parallel training only the main node should be saving the model to prevent // the parallel training nodes from colliding to write the same file // TODO: This does not belong here. 
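// ============================================================================
// [Editor's note] Illustrative sketch, not part of the patch. The Save()/
// SaveEdited() split above changes the calling contract: Save() now requires a
// compiled network (VerifyIsCompiled), while SaveEdited() compiles on demand.
// Under that reading, an edit-then-save sequence would look like this
// (hypothetical driver code; all calls shown are introduced or kept by this patch):
//
//     ComputationNetwork net(CPUDEVICE);
//     net.Read<float>(L"in.dnn");      // deserialize only; no CompileNetwork()
//     net.RenameNode(L"W", L"W_old");  // editing invalidates the compiled state
//     net.SaveEdited(L"out.dnn");      // recompiles if needed, then calls Save()
//
// Calling net.Save(L"out.dnn") directly after the edit would instead fail with
// LogicError("Save: A compiled network was expected.").
// ============================================================================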
@@ -182,7 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // load the section of nodes that contain persistable parameters // This is used for reloading a model without recreating it, e.g. during training. // TODO: Why not just reload it? Because SGD::Train() holds pointers to the parameters directly? That should be fixed. - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create) + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create) { fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN"); @@ -221,47 +231,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); } - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork) + // deserialize the model + // This does not post-process the model (CompileNetwork()). Use Load() instead. + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork) { ClearNetwork(); File fstream(fileName, fileFormat | FileOptions::fileOptionsRead); -#if 1 - LoadPersistableParameters(fstream, true); -#else - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN"); - - // model version - size_t modelVersion = CNTK_MODEL_VERSION_1; //if version info is not there it is version 1 - if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BVersion")) - { - fstream >> modelVersion; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion"); - } - - size_t numNodes; - fstream >> numNodes; - - // get all node info first - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BNodeList"); - for (size_t i = 0; i < numNodes; i++) - { - wstring opName, nodeName; - fstream >> opName >> nodeName; - - auto newNode = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); - - if (!newNode) - { - fprintf(stderr, "Unknown ComputationNode type %ls (node name %ls)\n", opName.c_str(), nodeName.c_str()); - InvalidArgument("Invalid node type."); - } - newNode->Load(fstream, modelVersion); - AddNodeToNet(newNode); - } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); -#endif + ReadPersistableParameters(fstream, true); size_t numNodes = m_nameToNodeMap.size(); @@ -277,9 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector childrenNames; childrenNames.resize(numChildren); for (size_t j = 0; j < numChildren; j++) - { fstream >> childrenNames[j]; - } // TODO: how does the file distinguish float from double? ComputationNodeBasePtr nodePtr = GetNodeFromName(nodeName); @@ -288,42 +264,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int j = 0; j < numChildren; j++) childrenNodes[j] = GetNodeFromName(childrenNames[j], anotherNetwork); - //if (nodePtr->OperationName() == OperationNameOf(RowStackNode)) - //{ - // allow for variable input nodes - nodePtr->AttachInputs(childrenNodes); - //} - //else - //{ - // // fixed input nodes - // // TODO: Use the variable-length AttachInputs() as well. This is a refactoring left-over. 
- // switch (numChildren) - // { - // case 1: - // nodePtr->AttachInputs(childrenNodes[0]); - // break; - // case 2: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]); - // break; - // case 3: - // nodePtr->AttachInputs(childrenNodes[0],childrenNodes[1], childrenNodes[2]); - // break; - // case 4: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]); - // break; - // case 5: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]); - // break; - // case 6: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]); - // break; - // default: - // LogicError("Invalid number of children."); - // } - //} + nodePtr->AttachInputs(childrenNodes); } } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERelation"); fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BRootNodes"); @@ -340,7 +283,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream >> nodeName; m_features.push_back(GetNodeFromName(nodeName)); } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EFeatureNodes"); } @@ -353,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_labels.push_back(GetNodeFromName(nodeName)); } } - + // BUGBUG: Should this be inside the block? fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELabelNodes"); if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BCriterionNodes") || @@ -372,13 +314,13 @@ } } - // TODO: this section is defunct + // TODO: this section is defunct, skip over if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BNodesReqMultiSeqHandling")) { fprintf(stderr, "WARNING: Ignoring defunct 'BNodesReqMultiSeqHandling' section in input file.\n"); fstream >> num; for (size_t i = 0; i < num; i++) - fstream >> nodeName; + fstream >> nodeName; // dummy fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling"); } @@ -415,13 +357,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EPairNodes"); } } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERootNodes"); fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECN"); - - // perform all further post-processing, caching, etc. - CompileNetwork(); } // ----------------------------------------------------------------------- @@ -622,9 +560,21 @@ // set sequence training parameters, e.g. smoothing weight, frame drop threshold template <class ElemType> - void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign) + void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf /*= 14.0f*/, + const double& lmf /*= 14.0f*/, + const double& wp /*= 0.0f*/, + const double& bMMIfactor /*= 0.0f*/, + const bool& sMBR /*= false*/ + ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshold to %.8g\n", hsmoothingWeight, frameDropThresh); + fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n", + amf, lmf, wp, bMMIfactor, sMBR ? 
"true" : "false"); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); if (seqNodes.size() == 0) { @@ -638,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->SetSmoothWeight(hsmoothingWeight); node->SetFrameDropThresh(frameDropThresh); node->SetReferenceAlign(doreferencealign); + node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR); } } } @@ -1114,18 +1065,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly); - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly); - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& 
bMMIfactor, const bool& sMBR); // register ComputationNetwork with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerComputationNetwork(L"ComputationNetwork"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 0a9b3bf8a..426dea48f 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -78,24 +78,33 @@ public: // ----------------------------------------------------------------------- void Save(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary) const; + void SaveEdited(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary); private: void SaveToFileImpl(const std::wstring& fileName, const FileOptions fileFormat) const; public: template - void LoadPersistableParameters(File & fstream, bool create); + void ReadPersistableParameters(File & fstream, bool create); // reload node content only, e.g. used by SGD::Train() when going back to an older model that had better training objective template - void ReloadPersistableParameters(const std::wstring& fileName) + void RereadPersistableParameters(const std::wstring& fileName) { File fstream(fileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); - LoadPersistableParameters(fstream, false); + ReadPersistableParameters(fstream, false); } // design BUGBUG: binary files do not know whether they are float or double. // TODO: modify file format to know this; then eliminate the dependency (and in some future, allow nodes to be different) template + void Read(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary, + const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr); + template void Load(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary, - const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr); + const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr) + { + Read(fileName, fileFormat, bAllowNoCriterionNode, anotherNetwork); + // perform all further post-processing, caching, etc. 
+ CompileNetwork(); + } // static helper to instantiate a network from a file template @@ -159,9 +168,11 @@ public: private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); + bool IsCompiled() const { return m_isCompiled; } void VerifyIsCompiled(const char * where) const; //bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode); public: @@ -411,8 +422,20 @@ public: template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); + + + template - static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + static void SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf=14.0f, + const double& lmf=14.0f, + const double& wp=0.0f, + const double& bMMIfactor=0.0f, + const bool& sMBR=false); static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index 6dddc73a3..2f408b797 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -30,6 +30,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::wstring toName, const CopyNodeFlags flags) { + InvalidateCompiledNetwork(); + if (toName == L"") toName = fromName; @@ -50,11 +52,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - //node already exists - + // node already exists pToNode = GetNodeFromName(toName); - //same node. no copy needed + // same node. 
no copy needed if (pFromNode == pToNode) LogicError("CopyNode: You are copying the node to the same network with same node name."); else @@ -69,6 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::wstring fromName, std::wstring toNamePrefix, const CopyNodeFlags flags) { + InvalidateCompiledNetwork(); + if (!(flags & CopyNodeFlags::copyNodeValue)) LogicError("CopySubTree: you cannot copy a tree without copying the node values."); @@ -103,7 +106,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // nodeNameNew - new node name void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew) { - // so that renamed node will not be referenced InvalidateCompiledNetwork(); ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig); @@ -128,7 +130,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::DeleteNode(const std::wstring & nodeName) { - // so that deleted node will not be referenced InvalidateCompiledNetwork(); ComputationNodeBasePtr nodeToDelete = GetNodeFromName(nodeName); @@ -172,6 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // need to update all the mappings as well childrens void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName); if (oldNode->OperationName() != newNode->OperationName()) InvalidArgument("newNode must have the same type as the old node."); @@ -204,6 +207,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // need to update those nodes who use oldNode as their child void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName); // change the input of those nodes whose child is oldNode @@ -223,6 +228,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + // Checks if the node is a criterion node. int index = -1; for (int i = 0; i < m_finalCriteria.size(); ++i) @@ -251,6 +258,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode) { + InvalidateCompiledNetwork(); + wstring nodeName = featureNode->NodeName(); if (NodeNameExists(nodeName)) RuntimeError("AddFeatureNode: feature node already exists."); @@ -261,12 +270,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We only remove the node, not delete it. void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) { + InvalidateCompiledNetwork(); + wstring nodeName = featureNode->NodeName(); if (!NodeNameExists(nodeName)) RuntimeError("RemoveFeatureNode: feature node does not exist."); - InvalidateCompiledNetwork(); - // Removes links. 
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter) { diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 084fe9ce9..9a297d5c7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,11 +10,13 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include #include #include +#include using namespace std; @@ -365,7 +367,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // verify that network has undergone CompileNetwork() void ComputationNetwork::VerifyIsCompiled(const char * where) const { - if (!m_isCompiled) + if (!IsCompiled()) LogicError("%s: A compiled network was expected.", where); } @@ -712,6 +714,63 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- // memory allocation // ----------------------------------------------------------------------- + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes() + { + const auto & nodes = GetEvalOrder(nullptr); + std::map<wstring, bool> allLeafDescendentsAreParameters; + std::list<ComputationNodeBasePtr> allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note: we cannot use m_learnableParameters because we need all parameter nodes, regardless of whether they require updates or not + + for (auto& node : nodes) + { + auto children = node->GetInputs(); + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf nodes, because all the possible leaf nodes (input/parameter/precompute nodes) are marked as non-sharable already + { + for (auto child : children) + { + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, which means it is a leaf node (we are traversing in eval order) + assert(child->IsLeaf() || child->IsPartOfLoop()); + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child) != allLearnableParameters.end()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); + } + } + } + + } + // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. 
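// ============================================================================
// [Editor's note] Minimal self-contained sketch, not part of the patch: the
// rule MarkValueNonSharableNodes() implements above is "a node's value is
// non-sharable iff every leaf reachable from it is a learnable parameter".
// The toy types below are hypothetical; this restatement is recursive with
// memoization, whereas the code above iterates in evaluation order.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct ToyNode
{
    std::string name;
    bool isParameter;             // leaf playing the role of LearnableParameter
    std::vector<ToyNode*> inputs; // empty => leaf (parameter or input/feature)
};

static bool AllLeavesAreParameters(ToyNode* n, std::map<ToyNode*, bool>& memo)
{
    auto it = memo.find(n);
    if (it != memo.end())
        return it->second;
    bool all = n->inputs.empty() ? n->isParameter : true;
    for (ToyNode* child : n->inputs)
        all = all && AllLeavesAreParameters(child, memo);
    return memo[n] = all;
}

int main()
{
    ToyNode W1{ "W1", true, {} }, W2{ "W2", true, {} }, X{ "X", false, {} };
    ToyNode t1{ "Times(W1,W2)", false, { &W1, &W2 } }; // induced purely by parameters
    ToyNode t2{ "Times(t1,X)", false, { &t1, &X } };   // depends on an input
    std::map<ToyNode*, bool> memo;
    for (ToyNode* n : { &t1, &t2 })
        printf("%-13s -> %s\n", n->name.c_str(),
               AllLeavesAreParameters(n, memo) ? "non-sharable" : "sharable");
    // Times(W1,W2) -> non-sharable (its value is kept out of the matrix pool)
    // Times(t1,X)  -> sharable     (its value may be released after forward prop)
}
// ============================================================================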
@@ -726,9 +785,12 @@ VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector<ComputationNodeBasePtr> forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4e0725bd..af9851bd0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -136,7 +136,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { typedef std::shared_ptr NodeStatePtr; virtual NodeStatePtr ExportState() = 0; - virtual void ImportState(NodeStatePtr && state) = 0; + virtual void ImportState(const NodeStatePtr & state) = 0; }; typedef IStatefulNode::NodeStatePtr NodeStatePtr; @@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable() { m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() const { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // flag needed for memory sharing. + // If it is false (e.g., LearnableParameters/InputValues and nodes solely induced by LearnableParameters), + // it will never be released to the memory pool private: bool m_isPartOfLoop; // true if this node is part of a recurrent loop @@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? 
CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -348,9 +355,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const TensorShape & GetSampleLayout() const { return m_sampleLayout; } bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; } // meaning does it have a layout that is not just a vector + TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object protected: size_t DetermineElementwiseTensorRank() const; // determine tensor rank when considering all inputs with padding - TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object TensorShape GetTensorSliceFor(size_t rank, const FrameRange & fr) const; // form tensor shape of the slice referenced by FrameRange public: // access to element(0,0) without having to type-cast @@ -455,6 +462,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -537,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } const size_t GetNumInputs() const { return m_inputs.size(); } @@ -769,6 +777,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. 
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -902,7 +911,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } @@ -931,7 +940,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } @@ -1317,6 +1326,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_gradient); } + void MarkValueNonSharable() override + { + m_valueSharable = false; + CreateMatrixIfNull(m_value); + } + + protected: // this function is used to create matrices for those needed before matrix pool is available @@ -1532,7 +1548,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { #define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment it out of date. */ \ protected: \ typedef shared_ptr> ComputationNodePtr; \ - using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \ + using Base::m_deviceId; using Base::shared_from_this; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; \ + using Base::GetNumRows; using Base::GetNumCols; using Base::GetTensorShape; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \ using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \ using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \ using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \ diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 3d2a7a343..6d36a33a9 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -813,9 +813,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetEvalMode(bool bnEvalMode) { - m_eval = bnEvalMode; + m_eval = bnEvalMode; } - private: struct VersionInfo { diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 898d35f9f..f53ef7a38 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { 
m_parameterUpdateRequired = true; + this->m_valueSharable = false; SetDims(TensorShape(), 0); } LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) : @@ -48,6 +49,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { m_parameterUpdateRequired = true; CreateMatrixIfNull(m_value); + this->m_valueSharable = false; // for now we split off the trailing dimension into the matrix column dimension // TODO: This is for compat, but it is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops. auto dims = shape.GetDims(); @@ -197,6 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + }; #if 0 @@ -261,6 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { SetDims(sampleLayout, 0); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; + this->m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) : diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 52dd27935..07ceba027 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override { - static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); } + //static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); } size_t rank = DetermineElementwiseTensorRank(); auto result = ValueTensorFor(rank, fr); diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index 080f43d44..997f7b209 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -9,6 +9,7 @@ #include "Matrix.h" #include "TensorShape.h" #include "ComputationNode.h" +#include "Sequences.h" #include #include @@ -26,7 +27,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- - // ShiftNode (input, fromOffset, boundaryValue, dim=-1, numSteps=1, insertDim=0) -- delay and rolling window + // ShiftNode (input, fromOffset, boundaryValue, dim=-1) -- delay and rolling window // // This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset). // E.g. for fromOffset=-1, this gives the past value. @@ -34,36 +35,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // // This node can be used in a recurrent loop. This requires special handling by the ComputationNetwork, // for both execution (sequential execution) and creation (avoiding circular references). - // TODO: When outside a recurrent loop and used with frame randomization, this will communicate to the reader - // that additional frames are needed, which will then return a frame range. TODO: This will not match - // the labels, which are still 1 frame. Think through which dimension this should go in. // // Values shifted in from beyond sequence boundaries will be copied from boundaryValue. // Normally, this is a scalar Constant(). However, it can be any node, which will be indexed from the end - // (e.g.
for fromOffset=-1, the last frame of boundaryValue will be used). This can implement - // sequence-to-sequence models. Broadcasting is supported, so it can be e.g. a single output-dimension vector + // (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement the basic + // sequence-to-sequence model. Broadcasting is supported, so it can be e.g. a single output-dimension vector // applied to all sequences. // // To delay (past value), use negative fromOffset. To access future value, use positive fromOffset. // - // To pull in multiple offsets, use offsetRange>1. This will pull in offsetRange consecutive offsets starting - // with fromOffset. This implements a rolling window. A new dimension will be inserted at multiOffsetDim - // (default 0 means after the last sample dimension). Special considerations: - // - If the boundaryValue is not wide enough, the sequence will be dropped (e.g. if you pull in 5 history frames, - // but the sequence in boundaryValue only has 4 samples). - // - If you feed back such an expanded output into this node in a loop, you get an inconsistency - // and will eventually fail. You must pull the dimensions apart. - // - If the current time step (offset 0) is included in the range (e.g. fromOffset=-1, offsetRange=3) then - // this node cannot participate in a recurrence. - // // By default, this shifts over the time dimension, but you can choose to shift over any // sample tensor dimension instead using 'dim' (-1 stands for time). This will only work, however, // when all involved nodes are implemented using the tensor library. Nodes implemented using // Matrix slices can only support iterating over time. - // - // If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary - // are dropped). This will initially not be implemented for the time dimension (as it would require - // change of MBLayout). // ----------------------------------------------------------------------- template @@ -74,24 +58,24 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: enum BoundaryMode : int // how to fill frames at boundaries { - reachAcross = -1, // go across the boundary: use boundaryValue. This is for recurrence. - duplicate = 0, // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only. - trim = 1 // drop frames. Non-recurrent mode only. + reachAcross = -1, // go across the boundary: use boundaryValue + duplicate = 0 // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only. 
}; - ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimension, size_t numSteps, int insertedDimParam) : - Base(deviceId, name), m_fromOffset(fromOffset), m_numSteps(numSteps), + ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimParam) : + Base(deviceId, name), m_fromOffset(fromOffset), m_boundaryMode(boundaryMode), - m_shiftDimension(shiftDimension), m_insertedDimParam(insertedDimParam), - m_insertExpandShapeAt(SIZE_MAX/*uninitialized at this point*/) + m_shiftDimParam(shiftDimParam), + m_shiftDim(SIZE_MAX), + m_state(deviceId) { CreateMatrixIfNull(m_value); SetDims(TensorShape(), 0); // empty for now } ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) : - ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1, 1, 0) + ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1) { } ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) : - ShiftNode(configp->Get(L"deviceId"), L"", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"), configp->Get(L"numSteps"), configp->Get(L"insertedDim")) + ShiftNode(configp->Get(L"deviceId"), L"", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim")) { // We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference. // Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below. @@ -111,19 +95,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Save(File& fstream) const { Base::Save(fstream); - fstream << m_fromOffset << m_numSteps << m_boundaryMode << m_shiftDimension << m_insertedDimParam; + fstream << m_fromOffset << m_boundaryMode << m_shiftDimParam; } virtual void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); - fstream >> m_fromOffset >> m_numSteps >> m_boundaryMode >> m_shiftDimension >> m_insertedDimParam; - } - - virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override - { - assert(inputIndex == 0); inputIndex; - fr; + fstream >> m_fromOffset >> m_boundaryMode >> m_shiftDimParam; } virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } @@ -133,6 +111,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Base::BeginForwardProp(); + // TODO: If we have a truncated-BPTT state then verify that the sequence indices match with m_state->m_sequences, and the tensor dimensions. + // in case of trimming, narrow the layout // We actually do not drop content, only reduce the range of sequences. // This is meant to optimize for the case where we have multiple sequences concatenated while trimming a small amount only. @@ -142,34 +122,216 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Base::EndForwardProp(); - // In BPTT, we carry over left-to-right state across minibatches. + // In truncated BPTT, we carry over left-to-right state across minibatches. // The necessary frames are stored in m_state->m_delayedValue. - // Only if layout has anything exceeding the MB. 
+ if (GetMBLayout()->HasSequenceBeyondEnd()) // only if layout has any sequence that extends beyond this minibatch + { + } + else + m_state.clear(); + } + private: + typedef std::pair, SmallVector> SliceBounds; // slice bounds for dimension k are [first[k], second[k]) (think STL begin/end) + + TensorView DataTensorFor(Matrix & data, TensorShape shape/*original shape of 'data'*/, SliceBounds slice) + { + shape.NarrowTo(slice); + return TensorView(data, shape); } - // This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop. + // helper to shift dimension 'm_shiftDim' of SliceBounds by an offset (a common operation below) + SliceBounds ShiftDim(const SliceBounds & in, int shiftBy) + { + SliceBounds result = in; + result.first [m_shiftDim] += shiftBy; + result.second[m_shiftDim] += shiftBy; + return result; + } + + static SmallVector ToIntDims(const TensorShape & shape) + { + SmallVector dimsSigned; + dimsSigned.append(shape.GetDims().begin(), shape.GetDims().end()); // we need the bounds as signed integers as they may shift into negative ranges + return dimsSigned; + } + + // determine shapes and slices to move + // This is used for both forward and backprop. + // 'In' below refers to Input(0), whereas 'Out' refers to the output of *this. + void DetermineSlices(size_t rank, const FrameRange & fr, + TensorShape & inShape, TensorShape & outShape, // our MB's shape + SliceBounds & inSliceLogical, SliceBounds & outSliceLogical) // the logical ranges to shift + { + // get the slice bounds for the given FrameRange + outShape = GetTensorShape(rank); // describes the full tensor including sequence and time dimensions + inShape = Input(0)->GetTensorShape(rank); + + // determine the logical in and out slices + // This may now have bounds that fall outside, which we need to split off next. + outSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr, GetMBLayout()); + inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset + } + + // determine stripes to move w.r.t. main storage and from/to state + // For efficiency: + // - this function assumes that the return values have been freshly constructed (it won't reset them) + // - it may return a slice with end < begin which indicates an empty slice + void PartitionSlices(const SliceBounds & inSliceLogical, const SliceBounds & outSliceLogical, // the move we want to make + int T, // our actual size + SliceBounds & inSliceMain, SliceBounds & outSliceMain, // the part that goes main-to-main + SliceBounds & inSliceState, SliceBounds & outSliceState) // the part that goes from/to state + { + inSliceMain = inSliceLogical; + outSliceMain = outSliceLogical; + if (inSliceMain.first[m_shiftDim] < 0) + { + assert(inSliceMain.second[m_shiftDim] < T); + if (!m_state.empty()) // truncated BPTT case + { + // determine range that lives in state + SliceBounds inSliceOutside = inSliceMain; // beginning falls to the left of the MB + if (inSliceOutside.second[m_shiftDim] > 0) + inSliceOutside.second[m_shiftDim] = 0; // trim end; e.g. [-2,97) -> [-2,0), but [-2,-1) remains + // now inSliceOutside represents only the region that falls outside + + // map to dimensions of our saved state + inSliceState = ShiftDim(inSliceOutside, m_state.m_shape[m_shiftDim]); // assign the output parameter (a local declaration here would shadow it) + // E.g.
for offset = -4, m_state will be 4 elements, so [-2,0) -> [2,4), and [-2,-1) -> [2,3) + + // map to target dimensions + outSliceState = ShiftDim(inSliceOutside, -m_fromOffset); + assert(inSliceState == outSliceState); // (when we fall out on the left, both must be the same) + } + // else: no truncated BPTT means we must have a proper boundary. So don't write those values here, they will be initialized with boundary values below. + + // and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end) + outSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim]; + inSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim]; + assert(inSliceMain.first[m_shiftDim] == 0); + } + else if (inSliceMain.second[m_shiftDim] > T) + { + if (!m_state.empty()) + { + // determine range to get from state + SliceBounds inSliceOutside = inSliceMain; + if (inSliceOutside.first[m_shiftDim] < T) + inSliceOutside.first[m_shiftDim] = T; // trim begin; e.g. [2,102) -> [100,102), but [101,102) remains + // now inSliceOutside is where we should copy from, with indices completely out of bounds + + // map to dimensions of our saved state + inSliceState = ShiftDim(inSliceOutside, -T); + // E.g. for offset = 4, m_state will be 4 elements, so [100,102) -> [0,2), and [101,102) -> [1,2) + + // map to target dimensions + outSliceState = ShiftDim(inSliceOutside, T - m_fromOffset); + // E.g. [0,2) -> [96,98), and [1,2) -> [97,98) + } + // and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end) + outSliceMain.first[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T); + inSliceMain.second[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T); + assert(inSliceMain.second[m_shiftDim] == T); + } + } + public: virtual void ForwardProp(const FrameRange & fr) override { + if (fr.GetIterationDimension() != m_shiftDimParam) + LogicError("ShiftNode::ForwardProp(): FrameRange not iterating over user-specified dimension."); + + // for debugging, invalidate the output region, so we will catch if we missed updating something +#ifdef _DEBUG + ValueFor(fr).Invalidate(); +#endif + // STEP 1: wholesale copy a shifted version of the input to the output + // - consider the saved parts from the last minibatch as part of the input at dimensions beyond the bounds - // - ignore boundary conditions for now + // - ignore boundary conditions at this point (will be fixed subsequently) + // This will copy a little too much in case of multiple concatenated sequences within a single parallel sequence. - // get the tensors without shift + // get the logical ranges we want to shift + TensorShape inShape, outShape; // expanded tensor shapes of input and output + SliceBounds inSliceLogical, outSliceLogical; // the logical ranges to shift size_t rank = DetermineElementwiseTensorRank(); - auto result = ValueTensorFor(rank, fr); - auto input = Input(0)->ValueTensorFor(rank, fr); + DetermineSlices(rank, fr, inShape, outShape, inSliceLogical, outSliceLogical); - // shift the dimension in the input + // now copy the two stripes--one that is main-to-main, and one that pulls in data from previous state (truncated BPTT only) + // This correctly handles the case where the input is a tensor with strides. This is currently not the case, but may be if we support in-place.
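// Worked example of the partitioning above (illustrative numbers, not part of the patch; it assumes
// m_state carries |m_fromOffset| saved frames): with T = 100 time steps and m_fromOffset = -2 under
// truncated BPTT, output(t) = input(t-2) decomposes into
//   logical input slice = [-2, 98)   // the output range [0,100) shifted by fromOffset
//   state-to-main:  state columns [0, 2) -> output [0, 2)    // from the previous minibatch
//   main-to-main:   input [0, 98)        -> output [2, 100)
// i.e. the first two output frames come from the saved activation, the rest from Input(0).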
+ + SliceBounds inSliceMain, outSliceMain; // main-to-main + SliceBounds inSliceState, outSliceState; // from state + PartitionSlices(inSliceLogical, outSliceLogical, outShape[m_shiftDim], inSliceMain, outSliceMain, inSliceState, outSliceState); + + if (!inSliceState.first.empty() && inSliceState.second[m_shiftDim] > inSliceState.first[m_shiftDim]) + { + // Note: If all sequences begin at the start of the range, this would copy invalid values which would be overwritten below. + // This is prevented in that m_state will be set to empty in the previous MB if all sequences ended, which will in turn return an empty slice. + auto from = DataTensorFor(m_state.m_delayedValue, m_state.m_shape, inSliceState); + auto to = DataTensorFor(Value(), outShape, outSliceState); + to.AssignCopyOf(from); + } + if (inSliceMain.second[m_shiftDim] > inSliceMain.first[m_shiftDim]) + { + auto from = DataTensorFor(Input(0)->Value(), inShape, inSliceMain); + auto to = DataTensorFor( Value(), outShape, outSliceMain); + to.AssignCopyOf(from); + } + // We have now pulled in everything that lies within the logical bounds. + // Any frame that pulls from outside contains invalid values (either not initialized or copied from an incorrect source), which must be fixed next. // STEP 2: fix up the boundary conditions - // - fill in xxx + // - fill in all frames that are too close to a boundary and must be filled from context (recurrent) or by replication (non-recurrent only) - // turn selected frame and shifted frame into a tensor + if (fr.IsAllFrames() || GetMBLayout()->IsBeyondStartOrEnd(fr.WithTimeOffset(m_fromOffset))) // short-cut test whether there is anything to do + { + auto ts = outSliceLogical.first[m_shiftDim]; + auto te = outSliceLogical.second[m_shiftDim]; + //size_t sequenceDim = outShape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. Code dup from TensorSliceWithMBLayoutFor(). Encapsulate this. + // iterate over all sequences in this batch and handle all that overlap with the target region + for (const auto & seq : GetMBLayout()->GetAllSequences()) + { + if (seq.tEnd <= ts || seq.tBegin >= te) // no overlap--skip + continue; - // copy all that's in range + // get tensor to fill in. This may be out of bounds, and may only partially overlap with [ts,te) + auto seqLen = abs(m_fromOffset); + auto seqBegin = m_fromOffset < 0 ? seq.tBegin : seq.tBegin + seq.GetNumTimeSteps() - seqLen; // e.g. m_fromOffset = -4 -> [0,4) , +4 -> [Len-4,Len) + auto outSliceFill = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr.WithTimeOffset(seqBegin).WithTimeRange(seqLen).Sequence(seq.s), GetMBLayout()); - // fix up all that is not + // get tensor to fill from + // We fill either from the provided boundary node or from ourselves (BoundaryMode::duplicate = clamp). + bool clamp = m_boundaryMode == BoundaryMode::duplicate; + ComputationNodeBasePtr boundaryNode = clamp ? shared_from_this() : Input(0); + auto boundaryShape = boundaryNode->GetTensorShape(rank); + auto fromSeq = clamp ? + seq.s : + boundaryNode->HasMBLayout() ?
+ boundaryNode->GetMBLayout()->FindSequence(seq.seqId).seqId : + SIZE_MAX; + auto fromBegin = 0; + auto boundarySliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(boundaryShape), fr.WithTimeOffset(fromBegin).WithTimeRange(seqLen).Sequence(fromSeq), GetMBLayout()); + + boundarySliceLogical; + + //inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset + + + + // clip against [ts,te) + // copy + sin(1); + } + } + } + + virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override + { + // To allow for bulk gradient computation, we will clear out any gradient that should not be propagated. + // We do that directly to our incoming output gradient. This is OK because we own this, and it is no longer used after this operation + // (it is invalid to call BackpropTo() multiple times since it adds to the outgoing Input() gradient). + assert(inputIndex == 0); inputIndex; + fr; } virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override @@ -177,46 +339,29 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_inputs.size() == 2); ComputationNodeBase::Validate(isFinalValidationPass); - if (isFinalValidationPass) - sin(1.0f); - // MBLayout is just inherited m_pMBLayout = Input(0)->GetMBLayout(); if (isFinalValidationPass && !m_pMBLayout) InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str()); - // determine final sample layout - auto inputSampleLayout = Input(0)->GetSampleLayout(); - auto inputDims = inputSampleLayout.GetDims(); - if (m_insertedDimParam < 0) - InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.", - NodeName().c_str(), OperationName().c_str(), m_insertedDimParam); - m_insertExpandShapeAt = m_numSteps > 1 ? 0 : (m_insertedDimParam > 0 ? m_insertedDimParam - 1 : inputDims.size()); - if (m_insertExpandShapeAt > inputDims.size()) - if (isFinalValidationPass) - InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].", - NodeName().c_str(), OperationName().c_str(), m_insertedDimParam, string(inputSampleLayout).c_str()); - else - m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass - SmallVector dims; - if (m_numSteps > 1 && inputDims.size() + 1 > dims.capacity()) - InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?", - NodeName().c_str(), OperationName().c_str()); - dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt); - if (m_numSteps > 1) // insert the new dimension if we expand into more than one step - dims.push_back(m_numSteps); - dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end()); - auto sampleLayout = TensorShape(dims); + // as is the sample layout + SetDims(Input(0)); - SetDims(sampleLayout, 0); + // determine the dimension that is to be shifted (convert user-specified as a zero-based index) + if (isFinalValidationPass) + { + size_t rank = DetermineElementwiseTensorRank(); + auto valueShape = GetTensorShape(rank); // bounds of the Value() + m_shiftDim = m_shiftDimParam > 0 ? 
m_shiftDimParam - 1/*regular dimensions are specified as 1-based*/ : valueShape.size() + m_shiftDimParam/*-1 for time dimension*/; + } } // special interface for use by loop detection virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override { - if (m_boundaryMode != BoundaryMode::reachAcross) + if (m_boundaryMode != BoundaryMode::reachAcross) // duplicating boundary frames cannot be done with recurrence return 0; - else if (m_fromOffset + (int)m_numSteps <= 0) + else if (m_fromOffset < 0) return +1; else if (m_fromOffset > 0) return -1; @@ -231,48 +376,61 @@ namespace Microsoft { namespace MSR { namespace CNTK { { auto node = dynamic_pointer_cast>(nodeP); node->m_fromOffset = m_fromOffset; - node->m_numSteps = m_numSteps; node->m_boundaryMode = m_boundaryMode; - node->m_shiftDimension = m_shiftDimension; - node->m_insertedDimParam = m_insertedDimParam; - node->m_insertExpandShapeAt = m_insertExpandShapeAt; + node->m_shiftDimParam = m_shiftDimParam; + node->m_shiftDim = m_shiftDim; node->m_state = m_state; } } class ShiftNodeState : public INodeState { - Matrix m_delayedValue; // saves the activation of the previous step that this node points to - vector m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match). + public: + Matrix m_delayedValue; // saves the activation of the previous step that this node points to + TensorShape m_shape; // tensor shape that describes m_delayedValue + vector m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match). ShiftNodeState(DEVICEID_TYPE deviceId) : m_delayedValue(deviceId) { } + bool empty() const { return m_delayedSequences.empty(); } + void clear() { m_delayedValue.Resize(0, 0); m_shape = TensorShape(); m_delayedSequences.clear(); } }; typedef std::shared_ptr ShiftNodeStatePtr; // state export/import - // This is done with a shared_ptr. The moment state is exported, the internal state is cleared; ownership is transferred to the exporting entity. - // This way, the next invocation does not overwrite the exported state, but is required to create a new one if needed. - // On the other hand, once imported, the state object is owned by the node and will be overwritten with the next state. - virtual NodeStatePtr ExportState() { return std::move(m_state); } - virtual void ImportState(NodeStatePtr && state) override + // This is done with a shared_ptr. The current state is exported, the internal state is cleared. + // Ownership of members is logically transferred to the exporting entity. + // Physically, however, since we often transfer between CPU and GPU, activation data is merely copied, + // and the GPU or CPU object resized to (0,0) without giving up the memory. + virtual NodeStatePtr ExportState() // TODO: can we instead pass the shared_ptr object in? So we don't need to create a new one all the time? Or should we still take ownership of the ptr? 
{ - m_state = dynamic_pointer_cast(state); - if (state && !m_state) + auto state = make_shared(CPUDEVICE); + state->m_delayedValue.SetValue(m_state.m_delayedValue); // note: this will transfer from GPU to CPU + m_state.m_delayedValue.Resize(0, 0); + state->m_shape = std::move(m_state.m_shape); + state->m_delayedSequences = std::move(m_state.m_delayedSequences); + return state; + } + virtual void ImportState(const NodeStatePtr & statep) override + { + ShiftNodeStatePtr state = dynamic_pointer_cast(statep); + if (!state) LogicError("ImportState: Wrong state object passed (wrong type)."); + m_state.m_delayedValue.SetValue(state->m_delayedValue); // note: this will transfer from CPU to GPU + state->m_delayedValue.Resize(0, 0); + m_state.m_shape = std::move(state->m_shape); + m_state.m_delayedSequences = std::move(state->m_delayedSequences); } protected: // parameters remembered from construction - int m_fromOffset; // offset to pull from - int m_numSteps; // offset range - BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across, duplicate, or trim) - int m_shiftDimension; // dimension to shift (default: time) - int m_insertedDimParam; // in case of multiple steps, this is where a new dimension will be inserted + int m_fromOffset; // offset to pull from + BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across or duplicate) + int m_shiftDimParam; // dimension to shift (default: time) - // derived params set up in Validate() - size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index) + size_t m_shiftDim; // m_shiftDimParam matched to the real tensor index - ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to + ShiftNodeState m_state; // state that is carried over across evaluations + // Note: The version held by this node lives in the GPU, whereas the versions being exported carry CPU-side copies - function m_attachInputsFn; // for late expansion of inputs (scripting) + function m_attachInputsFn; // for late expansion of inputs (scripting) }; // ----------------------------------------------------------------------- @@ -333,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window) // - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window) // - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed) - // - support for Yongqiang’s sub-minibatching with BPTT (export/import state) + // - support for Yongqiang’s sub-minibatching with truncated BPTT (export/import state) // - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch) // ----------------------------------------------------------------------- @@ -486,7 +644,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EndForwardProp() override // called after last iteration step of ForwardProp() { - // In BPTT, we carry over left-to-right state across minibatches. + // In truncated BPTT, we carry over left-to-right state across minibatches. // It is kept in m_delayedValue, m_delayedActivationMBLayout. 
// This could be optimized as follows: // - only keep the required number of frames (m_timeStep) @@ -620,27 +778,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (dir == -1) // we look into past { -#if 0 - bool allAtBoundary = true; - // if the current last frames are all sentence end or no feature , there is no need to carry on state info - if (m_pMBLayout->Is(FrameRange(nT-1), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) - { - for (size_t u = 0; u < nU; u++) - { - if (!m_pMBLayout->Is(FrameRange(nT - 1).Sequence(u), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) - { - allAtBoundary = false; - break; - } - } - } - else - { - allAtBoundary = false; - } - - if (allAtBoundary) -#endif if (!m_pMBLayout->HasSequenceBeyondEnd()) // only need to export state if anything crosses the MB boundary { auto pState = make_shared>(m_deviceId); @@ -655,26 +792,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { pExportedState = pState; } } - if (dir == 1) // we look into future + else if (dir == 1) // we look into future { -#if 0 - // TODO: check whether all at boundary and don't carry state if it is the case - size_t nT = m_pMBLayout->GetNumTimeSteps(); - size_t nU = m_pMBLayout->GetNumParallelSequences(); - bool allAtBoundary = true; - if (m_pMBLayout->Is(FrameRange(nullptr, 0), MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart)) - { - for (size_t u = 0; u < nU; u++) - { - if (!m_pMBLayout->Is(FrameRange(nullptr, 0).Sequence(u), MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature)) - { - allAtBoundary = false; - break; - } - } - } - if (allAtBoundary) -#endif if (!m_pMBLayout->HasSequenceBeyondBegin()) // only need to export state if anything crosses the MB boundary { auto pState = make_shared>(m_deviceId); @@ -689,19 +808,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { pExportedState = pState; } } - if (dir != -1 && dir != 1) + else { - RuntimeError("Unrecognized direction in DelayedValueNodeBase"); + LogicError("Unrecognized direction in DelayedValueNodeBase"); } return pExportedState; } - virtual void /*IStatefulNode::*/ImportState(NodeStatePtr && pImportedState) override + virtual void /*IStatefulNode::*/ImportState(const NodeStatePtr & pImportedState) override { DelayedNodeStatePtr pState = dynamic_pointer_cast> (pImportedState); if (!pState) - RuntimeError("Expecting DelayValueNodeState after down casting"); + LogicError("Expecting DelayValueNodeState after downcasting"); pState->ExportDelayedMBLayout(m_delayedActivationMBLayout); // pstate copy to m_delayedActivationMBLayout if (pState->IsEmpty()) @@ -715,18 +834,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { int dir = direction; if (dir == -1) // looking backward - { m_delayedValue.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU); - } - if (dir == 1) - { - //m_delayedValue.CopyColumnsStrided(delayedActivation, nU, 1, nT); + else if (dir == 1) m_delayedValue.SetColumnSlice(delayedActivation, 0, nU); - } - if (dir != -1 && dir == 1) - {// it is really a compile error ? 
- RuntimeError("Unrecognized direction in DelayedValueNodeBase"); - } + else + LogicError("Unrecognized direction in DelayedValueNodeBase"); } protected: diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index 0b73d69b7..a0f00586c 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1234,8 +1234,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else if (inputIndex == 1) { - BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), - Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); + FrameRange fr(Input(0)->GetMBLayout()); + BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), + Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); + MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr); + #ifdef _DEBUG Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout())); #endif @@ -1368,14 +1371,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { RequestMatrixFromPool(m_gammaFromLattice, matrixPool); } - // Release gradient and temp matrices that are no longer needed after all the children's gradients are computed. - virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) - { - Base::ReleaseMatricesAfterBackprop(matrixPool); - ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); - ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); - ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); - } + //request matrices needed to do node function value evaluation + virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) + { + Base::ReleaseMatricesAfterBackprop(matrixPool); + ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); + ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); + ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); + } // TODO: method names should be CamelCase std::vector> * getLatticePtr() @@ -1415,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_doReferenceAlignment = doreferencealign; } + void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR) + { + msra::lattices::SeqGammarCalParam param; + param.amf = amf; + param.lmf = lmf; + param.wp = wp; + param.bMMIfactor = bMMIfactor; + param.sMBRmode = sMBR; + m_gammaCalculator.SetGammarCalculationParams(param); + } + void gettime(unsigned long long &gammatime, unsigned long long &partialtime) { gammatime = m_gammatime; @@ -1427,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { shared_ptr> m_gammaFromLattice; double m_frameDropThreshold; double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside? 
+ double m_seqGammarAMF; + double m_seqGammarLMF; + double m_seqGammarWP; + double m_seqGammarbMMIFactor; + double m_seqGammarUsesMBR; bool m_doReferenceAlignment; std::vector> m_lattices; msra::asr::simplesenonehmm m_hmm; diff --git a/Source/EvalDll/EvalDll.vcxproj b/Source/EvalDll/EvalDll.vcxproj index 71e515bc8..a535ca3ff 100644 --- a/Source/EvalDll/EvalDll.vcxproj +++ b/Source/EvalDll/EvalDll.vcxproj @@ -74,7 +74,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" @@ -102,7 +102,7 @@ Speed - Windows + Console true true true diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 65decd1b4..153dff585 100644 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -137,7 +137,7 @@ struct GridDim std::vector props(numDevices); for (int i = 0; i < numDevices; i++) CUDA_CALL(cudaGetDeviceProperties(&props[i], i)); -#if 1 // on Linux, maxGridSize[0] gets reported as 0 +#if 0 // on Linux, maxGridSize[0] gets reported as 0 for (int i = 0; i < numDevices; i++) fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name); #endif diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 7e4f7a1c6..3d4635020 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2246,7 +2246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { slice.m_computeDevice = m_computeDevice; slice.m_numRows = m_numRows; slice.m_numCols = numCols; - slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); + slice.m_nz = ( numCols == m_numCols ) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); slice.m_elemSizeAllocated = m_elemSizeAllocated; slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated; slice.m_pArray = m_pArray; diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 63234dabe..3f125330a 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -87,9 +87,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0)); } + // TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated. 
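+ // (Presumed rationale, inferred from the fix below: for CSC/CSR the major index array holds one entry per stored nonzero, so the number of meaningful entries is m_nz; m_elemSizeAllocated is only the reserved capacity and may be larger.)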
size_t MajorIndexCount() const { - return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); + return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format); } size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const { @@ -113,6 +114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { return MajorIndexLocation() + m_numRows; else return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset; + //return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset; } size_t SecondaryIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const { diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index f33a6328b..d79abfb28 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -79,7 +79,7 @@ true - Windows + Console true libacml_mp_dll.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ @@ -127,7 +127,7 @@ MultiThreadedDLL - Windows + Console true true true diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 7fcb5807a..ad29f39a4 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies) true diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 3650db859..52a937579 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template - void Matrix::NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum) + void Matrix::NormalGrad(Matrix& gradients, + Matrix& functionValues, + const ElemType learnRatePerSample, + const ElemType momentum, + const bool useNesterovMomentum + ) { DecideAndMoveToRightDevice(*this, gradients, functionValues); - - DISPATCH_MATRIX_ON_FLAG(&gradients, + + if (!useNesterovMomentum) + { + DISPATCH_MATRIX_ON_FLAG(&gradients, nullptr, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) ); + } + else + { + DISPATCH_MATRIX_ON_FLAG(&gradients, + nullptr, + {/* CPU dense */ + ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues); + // w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient, where v_t (= *this) is the smoothed gradient just updated above + }, + {/* GPU dense */ + ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues); + }, + { /* CPU sparse */ + if (momentum != 0) + { + Matrix gradientCache(gradients.GetDeviceId()); + gradientCache.SetValue(gradients); + gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache,
functionValues); + } + }, + { /* GPU sparse */ + if (momentum != 0) + { + Matrix gradientCache(gradients.GetDeviceId()); + gradientCache.SetValue(gradients); + gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues); + } + } + ); + } } //both this and gradients will be changed diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 379169529..94eb0dd53 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ShiftBy(int numShift); // TODO: all these scalars should be passed as doubles and cast down inside - void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); + void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNesterovMomentum); ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier); void FSAdagrad(size_t mbSize, Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e032f2299..9d343eeea 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -237,8 +237,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); // prepare all tensor descriptor information as needed for execution array offsets; @@ -257,8 +257,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str()); array offsets; array, 3> regularStrides, reducingStrides; @@ -275,8 +275,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str()); array offsets; array, 4> regularStrides, reducingStrides;
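To make the Nesterov branch above easier to audit, here is the same update written out in scalar form (a sketch, not part of the patch; v is the smoothed gradient carried in *this, w is functionValues, g the raw gradient, lr = learnRatePerSample, m = momentum):

// Scalar restatement of the three ScaleAndAdd calls in the dense Nesterov branch above.
static void NesterovStep(float& v, float& w, float g, float lr, float m)
{
    v = m * v + (1 - m) * lr * g;  // ScaleAndAdd((1 - m) * lr, g, m, v)
    w -= m * v;                    // ScaleAndAdd(-m, v, w)
    w -= (1 - m) * lr * g;         // ScaleAndAdd(-(1 - m) * lr, g, w)
    // net effect: w_t = w_{t-1} - m * v_t - (1 - m) * lr * g
}

diff --git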
a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 876e3c6a8..b2b7d4b08 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -356,26 +356,39 @@ struct latticefunctionskernels const size_t te = ts + numframes; // end time of current unit size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 + //size_t state1stepm1to1 = te; size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 + //size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 + size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil - float pathscore0 = fwscore ; // log pp in state 0 - float pathscore1 = LOGZERO; // log pp in state 1 - float pathscore2 = LOGZERO; // log pp in state 2 - if(isSil) - pathscore2 = fwscore; + float pathscore0 = fwscore; // log pp in state 0 + float pathscore1 = fwscore; // log pp in state 1 + float pathscore2 = fwscore; // log pp in state 2 + + // first frame if (ts != te) // for t = ts, initialization { - if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted + /* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted { pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts); pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts); } - else //for others, only -1 to 0 is permitted - pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway + else //for others, only -1 to 0 is permitted + { + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + + }*/ + pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + //state1stepm1to1 = ts; + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + + } @@ -399,17 +412,22 @@ struct latticefunctionskernels { pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point + //state2stepm1to1 = state1stepm1to1; state2step1to2 = t; // record the inflection point + state2step0to2 = te; if (isSil) backptrmatrix (2, t-ts-1) = 1; } - if (isSil) // only silence have path from 0 to 2 + //if (isSil) // only silence have path from 0 to 2 { const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2 if (pathscore02 >= pathscore2) // if state 0->2 { pathscore2 = pathscore02; - backptrmatrix (2, t-ts-1) = 0; + if (isSil) + backptrmatrix (2, t-ts-1) = 0; + state2step0to2 = t; + state2step1to2 = te; } } @@ -422,9 +440,11 @@ struct latticefunctionskernels { pathscore1 = pathscore01; state1step0to1 = t; // record the inflection point + //state1stepm1to1 = te; if (isSil) backptrmatrix (1, t-ts-1) = 0; } + if (isSil) // only silence have path from 2 to 1 { const float pathscore21 = pathscore2last + getlogtransp(transP,2,1); @@ -495,19 +515,35 @@ struct latticefunctionskernels if (!isSil) { - state2step0to1 += alignindex - ts; // convert to align measure - state2step1to2 += alignindex - ts; - for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment - { - size_t senoneid; - if (t < state2step0to1) // in state 0 - senoneid = senoneid0; - else if(t < state2step1to2) // in state 1 - senoneid = senoneid1; - else // in state 2 - senoneid = senoneid2; - 
alignresult[t] = (unsigned short) senoneid; - } + if (state2step0to2 < te) //from 0 to 2 + { + state2step0to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (t < state2step0to2) // in state 0 + senoneid = senoneid0; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short)senoneid; + } + } + else //from 1 to 2 + { + state2step0to1 += alignindex - ts; // convert to align measure + state2step1to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (state2step0to1 true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DSSMReader/DSSMReader.vcxproj b/Source/Readers/DSSMReader/DSSMReader.vcxproj index 1412fac38..d607a7c9f 100644 --- a/Source/Readers/DSSMReader/DSSMReader.vcxproj +++ b/Source/Readers/DSSMReader/DSSMReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj index 8a422f187..438c7daed 100644 --- a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj +++ b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj @@ -100,7 +100,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) @@ -115,7 +115,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -133,7 +133,7 @@ true - Windows + Console true true true @@ -152,7 +152,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 0db717a99..ecc6283f6 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { vector scriptpaths; vector RootPathInScripts; + wstring RootPathInLatticeTocs; vector mlfpaths; vector>mlfpathsmulti; size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing @@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - + RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!_wcsicmp(readMethod.c_str(), L"blockRandomize")) { // construct all the parameters we don't need, but need to be passed to the 
constructor... - m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap())); + + m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs)); + m_lattices->setverbosity(m_verbosity); // now get the frame source. This has better randomization and doesn't create temp files m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode)); @@ -941,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (!skip) { + // a stopgap + if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]) + { + // BUGBUG: we just found that (due to some bugs yet to be tracked down), + // the filled number of frames is inconsistent with the number of frames in the lattices (though it rarely occurs) + // This is just a stopgap, to be removed after the bugs are found and fixed + bool needRenew = true; + while (needRenew) + { + size_t framenum = m_numFramesToProcess[i]; + fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", + (int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); + ReNewBufferForMultiIO(i); + needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]; + } + + } m_numValidFrames[i] = m_numFramesToProcess[i]; if (m_numValidFrames[i] > 0) { @@ -972,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_extraNumSeqs = 0; if (!m_frameMode) { - // insert extra utterances to parallel sequences that have enough space left - // As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in. - size_t nextMinibatchUttnum = 0; - bool inserted; - // The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB. - // For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
- for (size_t i = 0; i < m_numSeqsPerMB; i++) + for (size_t src = 0; src < m_numSeqsPerMB; ) { - while (nextMinibatchUttnum <= i) + size_t framenum = m_numFramesToProcess[src]; + if (framenum == 0) { - size_t framenum = m_numFramesToProcess[i]; - inserted = false; - if (framenum > 0) // non-empty entry: see were it fits - { - // greedily search for a parallel sequence with enough space at the end to insert this utterance - for (size_t j = 0; j < m_numSeqsPerMB; j++) - { - if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps) - { - // enough space: insert it as parallel sequence [j] (instead of [i] in the next MB) - m_extraSeqsPerMB.push_back(j); - if (m_latticeBufferMultiUtt[i] != nullptr) - { - m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]); - m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]); - m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]); - } - fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i); - m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum); + src++; + continue; + } + if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum) + { + // BUGBUG: we just found that (due to some bugs yet to be tracked down), + // the filled number of frames is inconsistent with the number of frames in the lattices (though it rarely occurs) + // This is just a stopgap, to be removed after the bugs are found and fixed + fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", + (int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); + src++; + continue; + } - // consume it - ReNewBufferForMultiIO(i); // replace current [i] with a new one; then try again with this new one at [i] - m_numValidFrames[j] += framenum; - m_extraNumSeqs++; - inserted = true; - break; - } + bool slotFound = false; + for (size_t des = 0; des < m_numSeqsPerMB; des++) // try to find a slot + { + if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps) + { // found!
+ m_extraSeqsPerMB.push_back(des); + if (m_latticeBufferMultiUtt[src] != nullptr) + { + m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]); + m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]); + m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]); } + fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum); + + ReNewBufferForMultiIO(src); + m_numValidFrames[des] += framenum; + m_extraNumSeqs++; + slotFound = true; + break; } - if (!inserted) - { - nextMinibatchUttnum++; // didn't fit anywhere: done with entry [i] - } + } + if (!slotFound) + { + src++; // done with this source; try the next one } } diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.h b/Source/Readers/HTKMLFReader/HTKMLFReader.h index fd6015c28..7e64ee3e8 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.h +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.h @@ -32,6 +32,9 @@ private: intargvector m_numSeqsPerMBForAllEpochs; size_t m_numSeqsPerMB; // requested number of parallel sequences size_t m_mbNumTimeSteps; // number of time steps to fill/filled (note: for frame randomization, this is the #frames, and not 1 as later reported) + size_t m_mbMaxNumTimeSteps; // max time steps we take in a MB layout; any sentence longer than this max will be discarded (and a warning will be issued) + // this is used to prevent CUDA out-of-memory errors + vector m_numFramesToProcess; // [seq index] number of frames available (left to return) in each parallel sequence vector m_switchFrame; /// TODO: something like the position where a new sequence starts; still supported? vector m_numValidFrames; // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.
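The rewritten src/des loop above is a greedy first-fit packer: each pending utterance goes into the first parallel sequence that still has room, and a source that fits nowhere waits for the next minibatch. A condensed sketch of just that policy (illustrative; the names are made up, and buffer renewal, lattice checks, and MBLayout bookkeeping are omitted):

#include <cstddef>
#include <vector>

// Greedy first-fit packing of pending utterance lengths into parallel sequences.
// Mirrors the strict '<' capacity test used in the reader loop above.
static void PackUtterances(const std::vector<size_t>& pendingLengths, // frames per pending utterance
                           std::vector<size_t>& usedSteps,            // frames already used per parallel sequence
                           size_t mbNumTimeSteps)                     // time-step capacity of each parallel sequence
{
    for (size_t src = 0; src < pendingLengths.size(); src++)
    {
        const size_t framenum = pendingLengths[src];
        if (framenum == 0)
            continue;                       // nothing pending in this slot
        for (size_t des = 0; des < usedSteps.size(); des++)
        {
            if (framenum + usedSteps[des] < mbNumTimeSteps) // found a slot
            {
                usedSteps[des] += framenum;
                break;                      // placed; move on to the next source
            }
        }
    }
}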
diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj index de7772889..fd8f9c343 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj @@ -69,7 +69,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -87,7 +87,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/ImageReader/ImageReader.vcxproj b/Source/Readers/ImageReader/ImageReader.vcxproj index b5061adaf..7d3a3b01c 100644 --- a/Source/Readers/ImageReader/ImageReader.vcxproj +++ b/Source/Readers/ImageReader/ImageReader.vcxproj @@ -75,7 +75,7 @@ true - Windows + Console true Math.lib;$(OpenCVLib);%(AdditionalDependencies) diff --git a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj index 24a8a1112..93b527173 100644 --- a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj +++ b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj index bb68dd89d..a73d0af74 100644 --- a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj +++ b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj index e3a10c534..e5d8ac1fb 100644 --- a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj +++ b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj index 72d18defe..db66c6d31 100644 --- a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj +++ b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 
$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj index fc0e03ffa..e30dc6b90 100644 --- a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj +++ b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIReader/UCIReader.vcxproj b/Source/Readers/UCIReader/UCIReader.vcxproj index 2e25c2b57..08cce8205 100644 --- a/Source/Readers/UCIReader/UCIReader.vcxproj +++ b/Source/Readers/UCIReader/UCIReader.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -107,7 +107,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -124,7 +124,7 @@ true - Windows + Console true true true @@ -144,7 +144,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true true true diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 19f3f2025..a4851fdf8 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { using SGDBase::m_L2RegWeight; using SGDBase::m_L1RegWeight; using SGDBase::m_needAveMultiplier; + using SGDBase::m_useNesterovMomentum; using SGDBase::m_traceLevel; using SGDBase::m_numMBsToShowResult; using SGDBase::m_gradientCheckSigDigit; @@ -392,8 +393,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (m_loadBestModel) { - encoderNet->ReloadPersistableParameters(GetEncoderModelNameForEpoch(i - 1)); - decoderNet->ReloadPersistableParameters(GetDecoderModelNameForEpoch(i - 1)); + encoderNet->RereadPersistableParameters(GetEncoderModelNameForEpoch(i - 1)); + decoderNet->RereadPersistableParameters(GetDecoderModelNameForEpoch(i - 1)); size_t dummyMinibatchSize = 0; this->LoadCheckPointInfo(i - 1, @@ -721,7 +722,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //persist model and check-point info for (size_t k = 0; k < iNumNetworks; k++) { - nets[k]->ReloadPersistableParameters(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k))); + nets[k]->RereadPersistableParameters(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k))); nets[k]->ResetEvalTimeStamps(); } @@ -930,7 +931,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Matrix& smoothedGradient = (*smoothedGradientIter); - UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, 
dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier); + UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum); } } } diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 8fe60474f..6665815a7 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -310,7 +310,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // likewise for sequence training parameters if (isSequenceTrainingCriterion) { - ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign); + ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign, + m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR ); } // --- MAIN EPOCH LOOP @@ -519,6 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1)) { g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank()); + g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank()); } bool loadedPrevModel = false; @@ -543,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval); fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str()); - net->ReloadPersistableParameters(bestModelPath); + net->RereadPersistableParameters(bestModelPath); LoadCheckPointInfo(i - m_learnRateAdjustInterval, /*out*/ totalSamplesSeen, /*out*/ learnRatePerSample, @@ -771,13 +773,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. 
+ size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -807,7 +816,10 @@ } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatches", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -998,7 +1010,7 @@ UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples, m_L2RegWeight, m_L1RegWeight, - m_needAveMultiplier); + m_needAveMultiplier, m_useNesterovMomentum); #ifdef _DEBUG if (dynamic_pointer_cast>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): ")) LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str()); @@ -1438,7 +1450,7 @@ } int baseModelEpoch = epochNumber - 1; - net->ReloadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); + net->RereadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); double learnRate = learnRatePerSample; size_t dummyMinibatchSize = 0; @@ -1598,7 +1610,7 @@ } int baseModelEpoch = epochNumber - 1; - net->ReloadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); + net->RereadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); double dummyLearnRate; double dummtPrevCriterion; @@ -2029,7 +2041,9 @@ size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) + const bool needAveMultiplier, + const bool useNesterovMomentum + ) { // we use simple linear (instead of log linear) scaling here const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); @@ -2070,7 +2084,7 @@ if (adpType == GradientsUpdateType::None) { smoothedGradient.NormalGrad(gradientValues, functionValues, - (ElemType)learnRatePerSample, (ElemType)momentum); + (ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum); } else if (adpType == GradientsUpdateType::AdaGrad || (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) || @@ -2120,7 +2134,9 @@ const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const + const bool needAveMultiplier, + const bool useNesterovMomentum + ) const { #if DUMPOUTPUT fprintf(stderr, "Update_%ls\n", node->NodeName().c_str()); @@ -2131,7 +2147,7 @@ UpdateWeightsS(this, dynamic_pointer_cast>(node)->Value(), dynamic_pointer_cast>(node)->Gradient(), smoothedGradient, learnRatePerSample, momentumPerSample,
actualMBSize, L2RegWeight, L1RegWeight, - needAveMultiplier); + needAveMultiplier, m_useNesterovMomentum); node->BumpEvalTimeStamp(); } @@ -2501,6 +2517,7 @@ m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). m_epochSize = configSGD(L"epochSize", (size_t)0); @@ -2520,6 +2537,8 @@ floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector())); floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector())); floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector())); + bool useNesterovMomentum = configSGD(L"useNAG", false); + m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0); @@ -2534,6 +2553,11 @@ m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95); m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10); m_doReferenceAlign = configSGD(L"doReferenceAlign", false); + m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false); + m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0); + m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0); + m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); + m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0); m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector{ 0.0f }))); @@ -2639,6 +2663,8 @@ m_momentumParam = floatargvector(L"0.9"); m_momentumSpecifiedForMBSize = m_mbSize; } + m_useNesterovMomentum = useNesterovMomentum; + for (int i = 0; i < m_momentumParam.size(); i++) { if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0) diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 15143dfa0..a014ec1d2 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -111,6 +111,7 @@ protected: intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB floatargvector m_momentumParam; intargvector m_momentumSpecifiedForMBSize; + bool m_useNesterovMomentum; // Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value. // MB size is the number of samples across all time steps and parallel sequences. @@ -157,7 +158,11 @@ protected: // To mitigate this issue, we adopt the sub-minibatch implementation, where // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM // a forward-backward is performed for each sub-minibatch; a model update is performed after each minibatch - + size_t m_numSubminiBatches; + // alternative method to specify how to split minibatches into subminibatches + // default is 1, which means no subminibatches are used + // if m_maxSamplesInRAM == SIZE_MAX (i.e., the user did not specify that option) and m_numSubminiBatches > 1, + // we divide one minibatch into m_numSubminiBatches subminibatches // the number of samples in each epoch (0 means, use all the samples in each epoch).
size_t m_epochSize; @@ -245,6 +250,11 @@ protected: double m_hSmoothingWeight; double m_frameDropThresh; bool m_doReferenceAlign; + double m_seqGammarCalcAMF; + double m_seqGammarCalcLMF; + double m_seqGammarCalcWP; + double m_seqGammarCalcbMMIFactor; + bool m_seqGammarCalcUsesMBR; }; template class IDistGradAggregator; @@ -436,7 +446,9 @@ public: size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier); + const bool needAveMultiplier, + const bool useNesterovMomentum + ); protected: // UpdateWeights - update the weights in @@ -446,7 +458,8 @@ protected: const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const; + const bool needAveMultiplier, + const bool useNesterovMomentum) const; void ClipGradient(Matrix& gradient, const size_t actualMBSize) const; diff --git a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj index c7c9d4073..b37973541 100644 --- a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj +++ b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj @@ -44,7 +44,7 @@ true - Windows + Console true diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index 4ad7d8f46..f63c50487 100644 --- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -11,6 +11,23 @@ #pragma warning (disable: 4127) // conditional expression is constant namespace msra { namespace lattices { + + struct SeqGammarCalParam{ + double amf; + double lmf; + double wp; + double bMMIfactor; + bool sMBRmode; + SeqGammarCalParam() + { + amf = 14.0; + lmf = 14.0; + wp = 0.0; + bMMIfactor = 0.0; + sMBRmode = false; + } + }; + template class GammaCalculation { @@ -19,9 +36,9 @@ namespace msra { namespace lattices { GammaCalculation() : cpumode(false) { initialmark = false; - lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable + lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable wp = 0.0f; - amf = 14.0f; + amf = 7.0f; boostmmifactor = 0.0f; seqsMBRmode = false; } @@ -30,6 +47,9 @@ namespace msra { namespace lattices { } + //======================================== + // Sec. 1 init functions + //======================================== void init(msra::asr::simplesenonehmm hset, int DeviceId) { m_deviceid = DeviceId; @@ -47,7 +67,21 @@ namespace msra { namespace lattices { } } - + //======================================== + // Sec. 2 set functions + //======================================== + void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam) + { + lmf = (float)gammarParam.lmf; + amf = (float)gammarParam.amf; + wp = (float)gammarParam.wp; + seqsMBRmode = gammarParam.sMBRmode; + boostmmifactor = (float)gammarParam.bMMIfactor; + } + + //======================================== + // Sec. 
3 calculation functions + //======================================== void calgammaformb( Microsoft::MSR::CNTK::Matrix& functionValues, std::vector> &lattices, const Microsoft::MSR::CNTK::Matrix& loglikelihood, diff --git a/Source/SequenceTrainingLib/latticeforwardbackward.cpp b/Source/SequenceTrainingLib/latticeforwardbackward.cpp index 4f43bc718..4abb50d3c 100644 --- a/Source/SequenceTrainingLib/latticeforwardbackward.cpp +++ b/Source/SequenceTrainingLib/latticeforwardbackward.cpp @@ -442,6 +442,7 @@ template static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) + //thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 0.0f : LOGZERO; diff --git a/Source/SequenceTrainingLib/parallelforwardbackward.cpp b/Source/SequenceTrainingLib/parallelforwardbackward.cpp index 3fb27b59f..bc4baaad9 100644 --- a/Source/SequenceTrainingLib/parallelforwardbackward.cpp +++ b/Source/SequenceTrainingLib/parallelforwardbackward.cpp @@ -743,8 +743,8 @@ namespace msra { namespace lattices { double totalfwscore = 0.0f; if (!parallelstate->emulation) { - - fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size()); + if (verbosity>=2) + fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size()); const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f); const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f); diff --git a/Tests/EndToEndTests/Speech/LSTM/cntk.config b/Tests/EndToEndTests/Speech/LSTM/cntk.config index 292bbe8c1..de01d3d79 100644 --- a/Tests/EndToEndTests/Speech/LSTM/cntk.config +++ b/Tests/EndToEndTests/Speech/LSTM/cntk.config @@ -67,7 +67,7 @@ speechTrain = [ // LSTM cell # TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over) - PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, numSteps=1, insertedDim=2) + PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1) PastValue1 = PastValue #PastValue1 = PastValueShift dh = PastValue1(outputDim, output); // hidden state(t-1) diff --git a/Tools/generate_build_info b/Tools/generate_build_info index a155fc84e..62686222e 100755 --- a/Tools/generate_build_info +++ b/Tools/generate_build_info @@ -56,6 +56,9 @@ makebuildinfo() if [ ! -z "$CUB_PATH" ]; then printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target fi + if [ ! -z "$CUDNN_PATH" ]; then + printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target + fi printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target printf "#endif\n" >> $target }
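A closing note on the new useNAG option: the flag read by configSGD above is stored as m_useNesterovMomentum and threaded through UpdateWeights/UpdateWeightsS into Matrix::NormalGrad. As a rough per-element illustration of what such a switch typically selects (the common reformulation of Nesterov's accelerated gradient; a hypothetical helper, not CNTK's actual Matrix implementation):

#include <vector>

// Classic vs. Nesterov momentum; 'v' plays the role of the smoothed gradient,
// with the learning rate folded in, analogous to the NormalGrad call above.
void MomentumSGDUpdate(std::vector<double>& w,       // model parameters
                       const std::vector<double>& g, // minibatch gradient
                       std::vector<double>& v,       // smoothed gradient (velocity)
                       double lr, double m, bool useNesterovMomentum)
{
    for (size_t i = 0; i < w.size(); i++)
    {
        v[i] = m * v[i] + lr * g[i];         // identical accumulation in both modes
        if (!useNesterovMomentum)
            w[i] -= v[i];                    // classic momentum: step along the velocity
        else
            w[i] -= m * v[i] + lr * g[i];    // NAG: take the step one momentum look-ahead further
    }
}

With the flag plumbed through this way, enabling it from a config is just useNAG = true inside the SGD block; it defaults to false.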