Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/separate1bitDataParallelSGD

This commit is contained in:
Amit Agarwal 2016-01-13 22:43:27 -08:00
Родитель b66ea69666 40cbeac00b
Коммит 18528f15b4
54 изменённых файлов: 946 добавлений и 464 удалений

Просмотреть файл

@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug")
CXXFLAGS += -g
LDFLAGS += -rdynamic
CPPFLAGS += -D_DEBUG
CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS)
CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
endif
ifeq ("$(BUILDTYPE)","release")

Просмотреть файл

@ -47,7 +47,7 @@ using namespace std;
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, numSteps=1, insertedDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"

Просмотреть файл

@ -345,6 +345,9 @@ void PrintBuiltInfo()
#ifdef _CUB_PATH_
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i
RedirectStdErr(logpath);
}
PrintBuiltInfo();
PrintBuiltInfo(); // this one goes to log file
std::string timestamp = TimeDateStamp();
//dump config info
@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i
// main wrapper that catches C++ exceptions and prints them
// ---------------------------------------------------------------------------
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
try
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
if (argc <= 1)
InvalidArgument("No command-line argument given.");
// detect legacy CNTK configuration
@ -684,6 +688,8 @@ void terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(st
int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions
{
set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating
_set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing
// Note: this does not seem to work--processes with this seem to just hang instead of terminating
__try
{

Просмотреть файл

@ -100,7 +100,7 @@ template <typename ElemType>
void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigParamList& params)
{
std::string name = p_name;
if (EqualInsensitive(name, "CreateModel")) //create a blank model
if (EqualInsensitive(name, "CreateModel")) // create a blank model
{
size_t numFixedParams = 0, numOptionalParams = 0;
if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams)
@ -109,7 +109,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
auto cn = make_shared<ComputationNetwork>(CPUDEVICE);
OverrideModelNameAndSetDefaultModel(cn);
}
if (EqualInsensitive(name, "CreateModelWithName")) //create a blank model
if (EqualInsensitive(name, "CreateModelWithName")) // create a blank model
{
size_t numFixedParams = 1, numOptionalParams = 0;
if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams)
@ -139,6 +139,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams);
auto cn = make_shared<ComputationNetwork>(CPUDEVICE);
#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them
if (modelFormat == L"cntk_legacy_no_tensorlib")
{
cn->Read<ElemType>(params[1]);
for (auto node : cn->FeatureNodes())
node->SetDims(TensorShape(node->GetNumRows()), 0); // pre-tensorlib InputValues had incorrect tensor dimensions
cn->CompileNetwork();
}
else
#endif
cn->Load<ElemType>(params[1]);
OverrideModelNameAndSetDefaultModel(cn, params[0]);
}
@ -189,8 +199,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// validate the network before we save it out
ProcessNDLScript(m_netNdlDefault, ndlPassAll, true);
cn->Save(fileName);
cn->SaveEdited(fileName);
}
else if (EqualInsensitive(name, "SaveModel"))
{
@ -209,7 +218,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// validate and finish the second pass through NDL if any in-line NDL was defined
ProcessNDLScript(netNdl, ndlPassAll, true);
netNdl->cn->Save(fileName);
netNdl->cn->SaveEdited(fileName);
}
else if (EqualInsensitive(name, "SetDefaultModel"))
{

Просмотреть файл

@ -443,6 +443,10 @@ public:
{
modelFormat = L"cntk";
}
else if (EqualInsensitive(value, "cntk_legacy_no_tensorlib")) // model of late 2015 which had a bug in setting InputValue's tensor dimensions
{
modelFormat = L"cntk_legacy_no_tensorlib";
}
else
{
RuntimeError("Invalid optional parameter value %s, valid values are: format=(cntk)", value.c_str());

Просмотреть файл

@ -2423,9 +2423,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu"));
assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize);
w = builder.Mean(label, L"Prior");
static_pointer_cast<PreComputedNode<ElemType>>(w)->SideLoadFromMatrix(priorVals);
w->SetParameterUpdateRequired(false);
prior = builder.Mean(label, L"Prior");
static_pointer_cast<PreComputedNode<ElemType>>(prior)->SideLoadFromMatrix(priorVals);
prior->SetParameterUpdateRequired(false);
}
else // pretrained network - need to add output layer, initialize
{
@ -2465,7 +2465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (layerType == "perceptron" || m_needPrior)
{
input = builder.Log(pcNodePtr, L"LogOfPrior");
input = builder.Log(prior, L"LogOfPrior");
//following two lines is needed only if true probability is needed
//output = builder.Softmax(output);

Просмотреть файл

@ -33,6 +33,16 @@ if "%cuda_path%" == "" (
echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$
)
if not "%cudnn_path%" == "" (
echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
)
if not "%cub_path%" == "" (
echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
)
echo #endif >> buildinfo.h$$
::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time)

Просмотреть файл

@ -84,6 +84,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ptrdiff_t tBegin; // first time index in this minibatch. Note that this may be negative if the sequence started before this MB.
size_t tEnd; // end = first frame index after final frame. May be beyond the minibatch if the real sequence is longer than the MB.
bool operator==(const SequenceInfo & other) const { return seqId == other.seqId && s == other.s && tBegin == other.tBegin && tEnd == other.tEnd; }
size_t GetNumTimeSteps() const { return (size_t)(tEnd - tBegin); }
};
// -------------------------------------------------------------------
@ -270,6 +271,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// I'd love to start with all-gaps, but that would require setting flags upfront, and then clearing them.
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); }
// find a sequence by its id
const SequenceInfo & FindSequence(UniqueSequenceId seqId) const
{
// linear scan over the sequences of this layout; fine since a minibatch holds only a few sequences
for (const auto & seqInfo : m_sequences)
if (seqInfo.seqId == seqId)
return seqInfo;
// not found: fail hard (LogicError presumably throws, so control never falls off the end --TODO confirm)
LogicError("FindSequence: Requested sequence (id %u) not found.", (unsigned int) seqId);
}
// -------------------------------------------------------------------
// inquire about gaps or boundaries
// -------------------------------------------------------------------
@ -427,6 +437,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public: // TODO: make private (currently used from masking and DataFor) ; TODO: rename all members with m_ prefix
size_t timeIdxInSeq; // start frame; SIZE_MAX = all frames in MB
ptrdiff_t m_timeOffset; // this is added to timeIdxInSeq wherever it is used
size_t m_timeRange; // use this to describe a custom range > 1 frame
size_t seqIndex; // parallel-sequence index; SIZE_MAX = all sequences in MB (most common case) --TODO: Bad name, 'sequence' and 'parallel sequence' are two different things
MBLayoutPtr m_pMBLayout; // layout associated with this
bool m_broadcastAllowed; // frame range may be broadcast from outer layout (e.g. a matrix with NULL layout and 1 column is acceptable to this frame range)
@ -434,7 +445,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
// can construct from a single size_t -> a single-frame range
FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {}
FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), m_timeRange(1), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {}
// or without arguments -> entire minibatch / no frame-range
FrameRange(MBLayoutPtr pMBLayout) : FrameRange(pMBLayout, SIZE_MAX) {}
@ -471,7 +482,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// create a FrameRange with a time offset
// Note: This currently does not work in conjunction with IsAllFrames(). This would be a nice-to have, but tricky w.r.t. out-of-bounds accesses.
// If IsAllFrames() then this will cause out-of-bounds slices.
FrameRange WithTimeOffset(ptrdiff_t offset) const
{
FrameRange ret = *this;
@ -479,6 +490,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return ret;
}
// create a FrameRange with a time range > 1
// create a copy of this FrameRange that covers 'range' consecutive frames instead of a single one
FrameRange WithTimeRange(size_t range) const
{
    FrameRange result = *this;      // all other fields are carried over unchanged
    if (!result.IsAllFrames())
        result.m_timeRange = range; // an all-frames range already spans everything; leave it as is
    return result;
}
// dimension we are iterating over; -1 means time dimension; 0 means no layout
// dimension we are iterating over; -1 means time dimension; 0 means no layout
int GetIterationDimension() const
{
    // without a layout there is nothing to iterate over; with one, we currently always iterate over time (-1)
    return m_pMBLayout ? -1 : 0; // TODO: allow user to specify other dimensions
}
class IndexIteration // range for range-based for over sequences
{
size_t m_beginIndex, m_endIndex;
@ -753,7 +782,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (startColumn >= numCols)
LogicError("DataFor: FrameRange specifies a time index that is out of range.");
if (fr.seqIndex == SIZE_MAX)
return std::pair<size_t, size_t>(startColumn, numParallelSequences);
return std::pair<size_t, size_t>(startColumn, numParallelSequences * fr.m_timeRange);
else if (fr.m_timeRange != 1)
LogicError("DataFor: FrameRange only support per-sequence time ranges with tensor slices, not matrix slices.");
else
return std::pair<size_t, size_t>(startColumn + fr.seqIndex, 1);
}
@ -778,7 +809,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TensorSliceWithMBLayoutFor() -- Return tensor slice for a FrameRange with specified number of columns with a given MBLayout
// This implements the logic of interpreting the FrameRange object.
// Unlike the matrix version above, this supports iteration indices other than time.
// TODO: This ^^. Still missing is a field to identify the index.
// TODO: This ^^. FrameRange still missing is a field to identify the index.
// This function happily returns tensor bounds that are out of bounds, assuming caller will do the right thing.
// -----------------------------------------------------------------------
template<class DimensionVector> // e.g. std::vector<size_t> or SmallVector<size_t>
@ -787,6 +819,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const MBLayoutPtr & pMBLayout/*the MB layout of 'data'*/)
{
std::pair<DimensionVector, DimensionVector> result;
typedef decltype(result.first[0]) ElemType;
// this creates a slice for the entire matrix, which we will then narrow down
result.first.resize(shape.size(), 0);
@ -795,8 +828,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get position of time and sequence index
// These are only valid if we have a layout.
// In the future, the 'timeDim' will be identified by the FrameRange.
int iterDimParam = fr.GetIterationDimension();
size_t iterDim = iterDimParam > 0 ? iterDimParam - 1/*regular dimensions are specified as 1-based*/ : shape.size() + iterDimParam/*-1 for time dimension*/;
size_t sequenceDim = shape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted.
size_t timeDim = sequenceDim + 1; // TODO: Get this from the FrameRange object.
// MBLayout of data and of FrameRange must be identical pointers,
// or in case of broadcasting, respective parent pointers.
@ -819,28 +853,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// but as a reference (e.g. it cannot be resized)
else if (!pMBLayout || fr.IsAllFrames())
{
if (fr.m_timeOffset != 0) // entire minibatch with non-zero offset exceeds bounds on at least one side
LogicError("DataFor: Iteration offset must not be specified for FrameRanges that reference the entire minibatch.");
// TODO: Can we allow this? Semantics would be different, it would crop frames outside.
if (fr.m_timeOffset)
{
if (iterDim >= result.first.size())
LogicError("DataFor: Time offset cannot be applied to tensors that have no time dimension.");
result.first[iterDim] += (ElemType)fr.m_timeOffset; // Note: If we have an offset, this is guaranteed to yield a slice that is out of bounds.
result.second[iterDim] += (ElemType)fr.m_timeOffset;
if (result.first[iterDim] > result.second[iterDim])
LogicError("DataFor: Numeric wraparound. You used a size_t vector where an int vector would be needed.");
}
}
// FrameRange refers to a time slice -> return that
else if (result.second[timeDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index)
else if (result.second[iterDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index)
{
size_t t = fr.timeIdxInSeq + fr.m_timeOffset;
if (t >= result.second[timeDim])
LogicError("DataFor: FrameRange specifies an iteration index that is out of range.");
result.first[timeDim] = t;
result.second[timeDim] = t + 1;
size_t ts = fr.timeIdxInSeq + fr.m_timeOffset;
size_t te = ts + fr.m_timeRange;
result.first[iterDim] = (ElemType)ts;
result.second[iterDim] = (ElemType)te;
}
// sequence index
if (fr.seqIndex != SIZE_MAX/*sequence requested*/ && pMBLayout/*have sequences*/ && result.second[sequenceDim] > 1/*>1 sequence (not broadcasting)*/)
{
size_t s = fr.seqIndex;
if (s >= result.second[sequenceDim])
LogicError("DataFor: FrameRange specifies a paralllel-sequence index that is out of range.");
result.first[sequenceDim] = s;
result.second[sequenceDim] = s + 1;
result.first[sequenceDim] = (ElemType)s;
result.second[sequenceDim] = (ElemType)s + 1;
}
return result;

Просмотреть файл

@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void resize(size_t sz, const T & val) { if (sz < m_size) m_size = sz; else while (m_size < sz) push_back(val); }
void assign(size_t sz, const T & val) { clear(); resize(sz, val); }
template<class ITER>
void append(ITER beg, const ITER & end) { while (beg != end) push_back(*beg++); }
void append(ITER beg, const ITER & end) { while (beg != end) push_back((T)*beg++); } // typecast allows signed/unsigned conversions
template<class ITER>
void assign(ITER beg, const ITER & end) { clear(); append(beg,end); }
void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); }
@ -180,8 +180,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// boilerplate
bool operator==(const TensorShape & other) const { return m_dims == other.m_dims; }
void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable.
// verify that this refers to a dense matrix (no strides)
void VerifyIsDense() const
{
@ -374,7 +372,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (size() != bounds.first.size() || size() != bounds.second.size())
LogicError("NarrowedTo: Bounds parameter must have same rank as tensor.");
for (size_t k = 0; k < size(); k++)
if (bounds.second[k] <= bounds.first[k] || bounds.second[k] > m_dims[k])
if (bounds.second[k] <= bounds.first[k] || (size_t)bounds.second[k] > m_dims[k])
LogicError("NarrowedTo: Invalid bounds parameter, dimensions must be at least one.");
for (size_t k = 0; k < size(); k++)
{

Просмотреть файл

@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b
// ===========================================================================
class lattice
{
mutable int verbosity;
struct header_v1_v2
{
size_t numnodes : 32;
@ -567,11 +568,13 @@ private:
std::vector<size_t> backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time
std::vector<unsigned short> backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer
size_t numofstates; // per sil hmm
int verbosity;
public:
backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0)
backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0)
{
size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/)
size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it
backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer
const size_t silUnitId = hset.gethmmid ("sil");
numofstates = hset.gethmm (silUnitId).getnumstates();
@ -595,15 +598,18 @@ private:
#if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is)
if (numsilunits > 1)
{
fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits);
fprintf (stderr, "alignments: :");
foreach_index (a, aligntokens)
if (verbosity)
{
const auto & unit = aligntokens[a];
const auto & hmm = hset.gethmm (unit.unit);
fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f);
fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits);
fprintf(stderr, "alignments: :");
foreach_index(a, aligntokens)
{
const auto & unit = aligntokens[a];
const auto & hmm = hset.gethmm(unit.unit);
fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f);
}
fprintf(stderr, "\n");
}
fprintf (stderr, "\n");
}
#endif
if (numsilunits > 0)
@ -611,7 +617,8 @@ private:
backptrbufsize += maxsilframes * numofstates;
}
backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed)
fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size()));
if (verbosity)
fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size()));
}
// CUDA support
const std::vector<size_t> & getbackptroffsets() const { return backptroffsets; }
@ -1002,6 +1009,10 @@ public:
std::wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages)
const wchar_t * getkey() const { return key.c_str(); }
// set the diagnostics verbosity level; const because 'verbosity' is a mutable member
void setverbosity(int veb) const { verbosity = veb; }
};
// ===========================================================================
@ -1016,6 +1027,8 @@ class archive
// set of lattice archive files referenced
// Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
std::vector<std::wstring> archivepaths; // [archiveindex] -> archive path
std::wstring prefixPathInToc; // prefix path in a toc; using this to avoid pushd some path before start training
mutable int verbosity;
size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed
{
auto iter = std::find (archivepaths.begin(), archivepaths.end(), path);
@ -1042,7 +1055,8 @@ class archive
{ // need to read the map and establish the mapping
// get the symlist file
const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist";
fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
if (verbosity>0)
fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
std::vector<char> textbuffer;
auto lines = msra::files::fgetfilelines (symlistpath, textbuffer);
// establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found
@ -1092,19 +1106,25 @@ class archive
public:
// construct = open the archive
//archive() : currentarchiveindex (SIZE_MAX) {}
// set the diagnostics verbosity level; const because 'verbosity' is declared mutable
void setverbosity(int veb) const { verbosity = veb; }
// test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode)
bool empty() const { return archivepaths.empty(); }
// construct from a list of TOC files
archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string,size_t> & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap)
archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string,size_t> & modelsymmap, const std::wstring prefixPath=L"")
: currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0)
{
if (tocpaths.empty()) // nothing to read--keep silent
return;
fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str());
size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size()/100 : 1;
foreach_index (i, tocpaths)
{
fprintf (stderr, ".");
if ( (i % onepercentage) == 0)
fprintf (stderr, ".");
open (tocpaths[i]);
}
fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size());
@ -1135,7 +1155,11 @@ public:
RuntimeError("open: invalid TOC line (no [): %s", line);
if (q != p)
{
const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p));
std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p));
if (!prefixPathInToc.empty())
{
archivepath = prefixPathInToc + L"/" + archivepath;
}
// TODO: should we allow paths relative to TOC file?
archiveindex = getarchiveindex (archivepath);
}
@ -1207,6 +1231,7 @@ public:
fsetpos (f, offset);
// get it
L.fread (f, idmap, spunit);
L.setverbosity(verbosity);
#ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice
const size_t silunit = getid (modelsymmap, "sil");
const bool addsp = true;

Просмотреть файл

@ -23,10 +23,11 @@ public:
class latticesource
{
const msra::lattices::archive numlattices, denlattices;
int verbosity;
public:
typedef msra::dbn::latticepair latticepair;
latticesource (std::pair<std::vector<std::wstring>,std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string,size_t> & modelsymmap)
: numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {}
latticesource (std::pair<std::vector<std::wstring>,std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string,size_t> & modelsymmap, std::wstring RootPathInToc)
: numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {}
bool empty() const
{
@ -52,6 +53,12 @@ public:
denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object
L = LP;
}
// set the verbosity level on this source and propagate it to both lattice archives
void setverbosity(int veb)
{
    verbosity = veb;
    numlattices.setverbosity(veb); // both archives are const objects; their setverbosity() is a const member
    denlattices.setverbosity(veb);
}
};
}}

Просмотреть файл

@ -296,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_value);
m_value->SetValue(value);
m_hasComputed = true;
SetDims(TensorShape(value.GetNumRows()), value.GetNumCols());
}
public:
bool m_hasComputed;

Просмотреть файл

@ -62,6 +62,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// break cycles
// BUGBUG: This only works if nodes are not shared across networks.
// Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard.
// Or just use weak ptrs.
for (auto & iter : m_nameToNodeMap)
iter.second->DetachInputs();
@ -74,8 +75,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// serialization
// -----------------------------------------------------------------------
// save a network after editing--the network is possibly not validated/compiled yet
// Save() requires a compiled network (it calls VerifyIsCompiled()), so compile on demand first.
void ComputationNetwork::SaveEdited(const wstring& fileName, const FileOptions fileFormat)
{
if (!IsCompiled())
CompileNetwork();
Save(fileName, fileFormat);
}
void ComputationNetwork::Save(const wstring& fileName, const FileOptions fileFormat) const
{
VerifyIsCompiled("Save");
// In case of parallel training only the main node should be saving the model to prevent
// the parallel training nodes from colliding to write the same file
// TODO: This does not belong here.
@ -182,7 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// load the section of nodes that contain persistable parameters
// This is used for reloading a model without recreating it, e.g. during training.
// TODO: Why not just reload it? Because SGD::Train() holds pointers to the parameters directly? That should be fixed.
template<class ElemType> void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create)
template<class ElemType> void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create)
{
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN");
@ -221,47 +231,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
}
template<class ElemType> void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork)
// deserialize the model
// This does not post-process the model (CompileNetwork()). Use Load() instead.
template<class ElemType> void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork)
{
ClearNetwork();
File fstream(fileName, fileFormat | FileOptions::fileOptionsRead);
#if 1
LoadPersistableParameters<ElemType>(fstream, true);
#else
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN");
// model version
size_t modelVersion = CNTK_MODEL_VERSION_1; //if version info is not there it is version 1
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BVersion"))
{
fstream >> modelVersion;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
}
size_t numNodes;
fstream >> numNodes;
// get all node info first
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BNodeList");
for (size_t i = 0; i < numNodes; i++)
{
wstring opName, nodeName;
fstream >> opName >> nodeName;
auto newNode = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
if (!newNode)
{
fprintf(stderr, "Unknown ComputationNode type %ls (node name %ls)\n", opName.c_str(), nodeName.c_str());
InvalidArgument("Invalid node type.");
}
newNode->Load(fstream, modelVersion);
AddNodeToNet(newNode);
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
#endif
ReadPersistableParameters<ElemType>(fstream, true);
size_t numNodes = m_nameToNodeMap.size();
@ -277,9 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<wstring> childrenNames;
childrenNames.resize(numChildren);
for (size_t j = 0; j < numChildren; j++)
{
fstream >> childrenNames[j];
}
// TODO: how does the file distinguish float from double?
ComputationNodeBasePtr nodePtr = GetNodeFromName(nodeName);
@ -288,42 +264,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int j = 0; j < numChildren; j++)
childrenNodes[j] = GetNodeFromName(childrenNames[j], anotherNetwork);
//if (nodePtr->OperationName() == OperationNameOf(RowStackNode))
//{
// allow for variable input nodes
nodePtr->AttachInputs(childrenNodes);
//}
//else
//{
// // fixed input nodes
// // TODO: Use the variable-length AttachInputs() as well. This is a refactoring left-over.
// switch (numChildren)
// {
// case 1:
// nodePtr->AttachInputs(childrenNodes[0]);
// break;
// case 2:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]);
// break;
// case 3:
// nodePtr->AttachInputs(childrenNodes[0],childrenNodes[1], childrenNodes[2]);
// break;
// case 4:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]);
// break;
// case 5:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]);
// break;
// case 6:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]);
// break;
// default:
// LogicError("Invalid number of children.");
// }
//}
nodePtr->AttachInputs(childrenNodes);
}
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERelation");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BRootNodes");
@ -340,7 +283,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> nodeName;
m_features.push_back(GetNodeFromName(nodeName));
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EFeatureNodes");
}
@ -353,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_labels.push_back(GetNodeFromName(nodeName));
}
}
// BUGBUG: Should this be inside the block?
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELabelNodes");
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BCriterionNodes") ||
@ -372,13 +314,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// TODO: this section is defunct
// TODO: this section is defunct, skip over
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BNodesReqMultiSeqHandling"))
{
fprintf(stderr, "WARNING: Ignoring defunct 'BNodesReqMultiSeqHandling' section in input file.\n");
fstream >> num;
for (size_t i = 0; i < num; i++)
fstream >> nodeName;
fstream >> nodeName; // dummy
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling");
}
@ -415,13 +357,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EPairNodes");
}
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERootNodes");
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECN");
// perform all further post-processing, caching, etc.
CompileNetwork();
}
// -----------------------------------------------------------------------
@ -622,9 +560,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//set sequence training parameters, e.g. smoothing weight, frame drop threshhold
template<class ElemType>
void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign)
void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net,
const ComputationNodeBasePtr criterionNode,
const double& hsmoothingWeight,
const double& frameDropThresh,
const bool& doreferencealign,
const double& amf /*= 14.0f*/,
const double& lmf /*= 14.0f*/,
const double& wp /*= 0.0f*/,
const double& bMMIfactor /*= 0.0f*/,
const bool& sMBR /*= false*/
)
{
fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshhold to %.8g\n", hsmoothingWeight, frameDropThresh);
fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n",
amf, lmf, wp, bMMIfactor, sMBR ? "true" : "false");
list<ComputationNodeBasePtr> seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode);
if (seqNodes.size() == 0)
{
@ -638,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->SetSmoothWeight(hsmoothingWeight);
node->SetFrameDropThresh(frameDropThresh);
node->SetReferenceAlign(doreferencealign);
node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR);
}
}
}
@ -1114,18 +1065,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Load<float>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::LoadPersistableParameters<float>(File & fstream, bool create);
template void ComputationNetwork::Read<float>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::ReadPersistableParameters<float>(File & fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Load<double>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::LoadPersistableParameters<double>(File & fstream, bool create);
template void ComputationNetwork::Read<double>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::ReadPersistableParameters<double>(File & fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
// register ComputationNetwork with the ScriptableObject system
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetwork> registerComputationNetwork(L"ComputationNetwork");

Просмотреть файл

@ -78,24 +78,33 @@ public:
// -----------------------------------------------------------------------
void Save(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary) const;
void SaveEdited(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary);
private:
void SaveToFileImpl(const std::wstring& fileName, const FileOptions fileFormat) const;
public:
template<class ElemType>
void LoadPersistableParameters(File & fstream, bool create);
void ReadPersistableParameters(File & fstream, bool create);
// reload node content only, e.g. used by SGD::Train() when going back to an older model that had better training objective
template<class ElemType>
void ReloadPersistableParameters(const std::wstring& fileName)
void RereadPersistableParameters(const std::wstring& fileName)
{
File fstream(fileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
LoadPersistableParameters<ElemType>(fstream, false);
ReadPersistableParameters<ElemType>(fstream, false);
}
// design BUGBUG: binary files do not know whether they are float or double.
// TODO: modify file format to know this; then eliminate the <ElemType> dependency (and in some future, allow nodes to be different)
template<class ElemType>
void Read(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary,
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr);
template<class ElemType>
void Load(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary,
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr);
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr)
{
Read<ElemType>(fileName, fileFormat, bAllowNoCriterionNode, anotherNetwork);
// perform all further post-processing, caching, etc.
CompileNetwork();
}
// static helper to instantiate a network from a file
template<class ElemType>
@ -159,9 +168,11 @@ public:
private:
void ValidateNodes(list<ComputationNodeBasePtr> nodes, bool isFinalValidationPass, size_t & todo);
void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode);
void MarkValueNonSharableNodes();
private:
void DetermineSetOfAllRoots();
void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
bool IsCompiled() const { return m_isCompiled; }
void VerifyIsCompiled(const char * where) const;
//bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode);
public:
@ -411,8 +422,20 @@ public:
template<class ElemType>
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template<class ElemType>
static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
static void SetSeqParam(ComputationNetworkPtr net,
const ComputationNodeBasePtr criterionNode,
const double& hsmoothingWeight,
const double& frameDropThresh,
const bool& doreferencealign,
const double& amf=14.0f,
const double& lmf=14.0f,
const double& wp=0.0f,
const double& bMMIfactor=0.0f,
const bool& sMBR=false);
static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples);
// -----------------------------------------------------------------------

Просмотреть файл

@ -30,6 +30,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::wstring toName,
const CopyNodeFlags flags)
{
InvalidateCompiledNetwork();
if (toName == L"")
toName = fromName;
@ -50,11 +52,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
//node already exists
// node already exists
pToNode = GetNodeFromName(toName);
//same node. no copy needed
// same node. no copy needed
if (pFromNode == pToNode)
LogicError("CopyNode: You are copying the node to the same network with same node name.");
else
@ -69,6 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const std::wstring fromName, std::wstring toNamePrefix,
const CopyNodeFlags flags)
{
InvalidateCompiledNetwork();
if (!(flags & CopyNodeFlags::copyNodeValue))
LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
@ -103,7 +106,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// nodeNameNew - new node name
void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew)
{
// so that renamed node will not be referenced
InvalidateCompiledNetwork();
ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig);
@ -128,7 +130,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::DeleteNode(const std::wstring & nodeName)
{
// so that deleted node will not be referenced
InvalidateCompiledNetwork();
ComputationNodeBasePtr nodeToDelete = GetNodeFromName(nodeName);
@ -172,6 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// need to update all the mappings as well childrens
void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName);
if (oldNode->OperationName() != newNode->OperationName())
InvalidArgument("newNode must have the same type as the old node.");
@ -204,6 +207,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// need to update those nodes who use oldNode as their child
void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName);
// change the input of those nodes whose child is oldNode
@ -223,6 +228,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
// Checks if the node is a criterion node.
int index = -1;
for (int i = 0; i < m_finalCriteria.size(); ++i)
@ -251,6 +258,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode)
{
InvalidateCompiledNetwork();
wstring nodeName = featureNode->NodeName();
if (NodeNameExists(nodeName))
RuntimeError("AddFeatureNode: feature node already exists.");
@ -261,12 +270,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// We only remove the node, not delete it.
void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode)
{
InvalidateCompiledNetwork();
wstring nodeName = featureNode->NodeName();
if (!NodeNameExists(nodeName))
RuntimeError("RemoveFeatureNode: feature node does not exist.");
InvalidateCompiledNetwork();
// Removes links.
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter)
{

Просмотреть файл

@ -10,11 +10,13 @@
#include "ComputationNode.h"
#include "ComputationNetwork.h"
#include "RecurrentNodes.h"
#include "InputAndParamNodes.h"
#include <string>
#include <vector>
#include <list>
#include <set>
#include <algorithm>
#include <map>
using namespace std;
@ -365,7 +367,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// verify that network has undergone CompileNetwork()
void ComputationNetwork::VerifyIsCompiled(const char * where) const
{
if (!m_isCompiled)
if (!IsCompiled())
LogicError("%s: A compiled network was expected.", where);
}
@ -712,6 +714,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// memory allocation
// -----------------------------------------------------------------------
// mark nodes that are purely induced by parameters as non-sharable and create space for value if null
void ComputationNetwork::MarkValueNonSharableNodes()
{
const auto & nodes = GetEvalOrder(nullptr);
std::map<wstring, bool> allLeafDescendentsAreParameters;
std::list<ComputationNodeBasePtr> allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter));
// note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not
for (auto& node : nodes)
{
auto children = node->GetInputs();
wstring myname = node->NodeName();
bool allParameters = true;
if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already
{
for (auto child : children)
{
wstring ChildName = child->NodeName();
if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end())
{
// not found, means it is a leaf node (we are at eval order )
assert(child->IsLeaf() || child->IsPartOfLoop());
if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end())
{
allLeafDescendentsAreParameters[ChildName] = true;
}
else
{
allParameters = false;
allLeafDescendentsAreParameters[ChildName] = false;
break;
}
}
else
{
if (allLeafDescendentsAreParameters[ChildName] == false)
{
allParameters = false;
break;
}
}
}
allLeafDescendentsAreParameters[myname] = allParameters;
if (allParameters)
{
node->MarkValueNonSharable();
}
else
{
node->MarkValueSharable();
}
}
}
}
// this function will need to be called before actual validation and execution to
// predetermine how to share matrices to reduce memory usage.
@ -726,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
VerifyIsCompiled("AllocateAllMatrices");
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr);
// Create a composite Eval order with the specfied nodes as roots
// Create a composite Eval order with the specified nodes as roots
std::vector<ComputationNodeBasePtr> forwardPropRoots;
forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end());
forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end());

Просмотреть файл

@ -136,7 +136,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef std::shared_ptr<INodeState> NodeStatePtr;
virtual NodeStatePtr ExportState() = 0;
virtual void ImportState(NodeStatePtr && state) = 0;
virtual void ImportState(const NodeStatePtr & state) = 0;
};
typedef IStatefulNode::NodeStatePtr NodeStatePtr;
@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
friend class ComputationNetwork;
ComputationNetworkOwnedNodeState() :
m_needsGradient(false)
m_needsGradient(false), m_valueSharable(true)
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool IsPartOfLoop() const { return m_isPartOfLoop; }
virtual void MarkValueNonSharable(){ m_valueSharable = false; }
virtual void MarkValueSharable() { m_valueSharable = true; }
bool isValueSharable() const { return m_valueSharable; }
protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
// it will never be released to memory pool
private:
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_deviceId(deviceId), m_outputNeededDuringBackprop(true),
m_parameterUpdateRequired(false), m_gradientInitialized(false),
m_nodeName(name == L"" ? CreateUniqNodeName() : name),
m_numRows(0), m_numCols(0)
m_numRows(0), m_numCols(0)
{ }
virtual ~ComputationNodeBase(){}
@ -348,9 +355,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const TensorShape & GetSampleLayout() const { return m_sampleLayout; }
bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; } // meaning does it have a layout that is not just a vector
TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object
protected:
size_t DetermineElementwiseTensorRank() const; // determine tensor rank when considering all inputs with padding
TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object
TensorShape GetTensorSliceFor(size_t rank, const FrameRange & fr) const; // form tensor shape of the slice referenced by FrameRange
public:
// access to element(0,0) without having to type-cast
@ -455,6 +462,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("VerifyNumParallelSequences: value inconsistent with MB layout");
}
protected:
public: // ...the following should be protected, but nodes inquire about their children, requiring public access
@ -537,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
bool IsOutputNeededDuringBackprop() const
{
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ;
}
const size_t GetNumInputs() const { return m_inputs.size(); }
@ -769,6 +777,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves.
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0
bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop
};
typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr;
@ -902,7 +911,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//don't release matrices that need to be used in the gradient computation
virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
{
if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE))
if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable())
ReleaseMatrixToPool(m_value, matrixPool);
}
@ -931,7 +940,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Release the Value matrix only if the output value is needed during backprop
// since in the case it isn't used, we release it during forward prop itself
if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE)
if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable())
ReleaseMatrixToPool(m_value, matrixPool);
}
}
@ -1317,6 +1326,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_gradient);
}
void MarkValueNonSharable() override
{
m_valueSharable = false;
CreateMatrixIfNull(m_value);
}
protected:
// this function is used to create matrices for those needed before matrix pool is available
@ -1532,7 +1548,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment it out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::shared_from_this; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; \
using Base::GetNumRows; using Base::GetNumCols; using Base::GetTensorShape; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \

Просмотреть файл

@ -813,9 +813,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Switch batch-normalization between evaluation and training mode.
void SetEvalMode(bool bnEvalMode)
{
    m_eval = bnEvalMode;
}
private:
struct VersionInfo
{

Просмотреть файл

@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base(deviceId, name)
{
m_parameterUpdateRequired = true;
this->m_valueSharable = false;
SetDims(TensorShape(), 0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) :
@ -48,6 +49,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_parameterUpdateRequired = true;
CreateMatrixIfNull(m_value);
this->m_valueSharable = false;
// for now we split off the trailing dimension into the matrix column dimension
// TODO: This is for compat, but is is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops.
auto dims = shape.GetDims();
@ -197,6 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrintNodeValuesToFile(printValues, fstream);
}
};
#if 0
@ -261,6 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetDims(sampleLayout, 0);
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
m_parameterUpdateRequired = false;
this->m_valueSharable = false;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) :

Просмотреть файл

@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }
//static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);

Просмотреть файл

@ -9,6 +9,7 @@
#include "Matrix.h"
#include "TensorShape.h"
#include "ComputationNode.h"
#include "Sequences.h"
#include <unordered_set>
#include <map>
@ -26,7 +27,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ShiftNode (input, fromOffset, boundaryValue, dim=-1, numSteps=1, insertDim=0) -- delay and rolling window
// ShiftNode (input, fromOffset, boundaryValue, dim=-1) -- delay and rolling window
//
// This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset).
// E.g. for fromOffset=-1, this gives the past value.
@ -34,36 +35,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//
// This node can be used in a recurrent loop. This requires special handling by the ComputationNetwork,
// for both execution (sequential execution) and creation (avoiding circular references).
// TODO: When outside a recurrent loop and used with frame randomization, this will communicate to the reader
// that additional frames are needed, which will then return a frame range. TODO: This will not match
// the labels, which are still 1 frame. Think through which dimension this should go in.
//
// Values shifted in from beyond sequence boundaries will be copied from boundaryValue.
// Normally, this is a scalar Constant(). However, it can be any node, which will be indexed from the end
// (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement
// sequence-to-sequence models. Broadcasting is supported, so it can be e.g. a single output-dimension vector
// (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement the basic
// sequence-to-sequence model. Broadcasting is supported, so it can be e.g. a single output-dimension vector
// applied to all sequences.
//
// To delay (past value), use negative fromOffset. To access future value, use positive fromOffset.
//
// To pull in multiple offsets, use offsetRange>1. This will pull in offsetRange consecutive offsets starting
// with fromOffset. This implements a rolling window. A new dimension will be inserted at multiOffsetDim
// (default 0 means after the last sample dimension). Special considerations:
// - If the boundaryValue is not wide enough, the sequence will be dropped (e.g. if you pull in 5 history frames,
// but the sequence in boundaryValue only has 4 samples).
// - If you feed back such an expanded output into this node in a loop, you get an inconsistency
// and will eventually fail. You must pull the dimensions apart.
// - If the current time step (offset 0) is included in the range (e.g. fromOffset=-1, offsetRange=3) then
// this node cannot participate in a recurrence.
//
// By default, this shifts over the time dimension, but you can choose to shift over any
// sample tensor dimension instead using 'dim' (-1 stands for time). This will only work, however,
// when all involved nodes are implemented using the tensor library. Nodes implemented using
// Matrix slices can only support iterating over time.
//
// If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary
// are dropped). This will initially not be implemented for the time dimension (as it would require
// change of MBLayout).
// -----------------------------------------------------------------------
template<class ElemType>
@ -74,24 +58,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
enum BoundaryMode : int // how to fill frames at boundaries
{
reachAcross = -1, // go across the boundary: use boundaryValue. This is for recurrence.
duplicate = 0, // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only.
trim = 1 // drop frames. Non-recurrent mode only.
reachAcross = -1, // go across the boundary: use boundaryValue
duplicate = 0 // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only.
};
// Main constructor. fromOffset: shift amount (negative = past value, positive = future value);
// boundaryMode: how frames beyond sequence boundaries are filled; shiftDimParam: dimension to
// shift over as given by the user (-1 = time). The actual shift dimension (m_shiftDim) is
// resolved later (SIZE_MAX = not yet determined).
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimParam) :
    Base(deviceId, name), m_fromOffset(fromOffset),
    m_boundaryMode(boundaryMode),
    m_shiftDimParam(shiftDimParam),
    m_shiftDim(SIZE_MAX),
    m_state(deviceId)
{
    CreateMatrixIfNull(m_value);
    SetDims(TensorShape(), 0); // empty for now; real dims are inferred during validation
}
// Default-argument constructor: shift by one step across the boundary over the time dimension (dim = -1).
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) :
    ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1)
{ }
ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) :
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"), configp->Get(L"numSteps"), configp->Get(L"insertedDim"))
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"))
{
// We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.
@ -111,19 +95,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Persist construction-time parameters only; runtime/truncated-BPTT state (m_state) is not saved.
void Save(File& fstream) const
{
    Base::Save(fstream);
    fstream << m_fromOffset << m_boundaryMode << m_shiftDimParam;
}
virtual void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_fromOffset >> m_numSteps >> m_boundaryMode >> m_shiftDimension >> m_insertedDimParam;
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
fr;
fstream >> m_fromOffset >> m_boundaryMode >> m_shiftDimParam;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -133,6 +111,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::BeginForwardProp();
// TODO: If we have a truncated-BPTT state then verify that the sequence indices match with m_state->m_sequences, and the tensor dimensions.
// in case of trimming, narrow the layout
// We actually do not drop content, only reduce the range of sequences.
// This is meant to optimize for the case where we have multiple sequences concatenated while trimming a small amount only.
@ -142,34 +122,216 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::EndForwardProp();
// In BPTT, we carry over left-to-right state across minibatches.
// In truncated BPTT, we carry over left-to-right state across minibatches.
// The necessary frames are stored in m_state->m_delayedValue.
// Only if layout has anything exceeding the MB.
if (GetMBLayout()->HasSequenceBeyondEnd()) // only if layout has any sequence that has ends beyond this minibatch
{
}
else
m_state.clear();
}
private:
typedef std::pair<SmallVector<int>, SmallVector<int>> SliceBounds; // slice bounds for dimension k are [first[k], second[k]) (think STL begin/end)
// Wrap 'data' in a TensorView restricted to the given slice of its original shape.
// 'shape' is taken by value on purpose: narrowing mutates a local copy only.
TensorView<ElemType> DataTensorFor(Matrix<ElemType> & data, TensorShape shape/*original shape of 'data'*/, SliceBounds slice)
{
    TensorShape narrowed = shape;
    narrowed.NarrowTo(slice);
    return TensorView<ElemType>(data, narrowed);
}
// This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop.
// helper to shift dimension 'm_shiftDim' of SliceBounds by an offset (a common operation below)
// Return a copy of 'in' with both bounds of dimension m_shiftDim moved by 'shiftBy'.
SliceBounds ShiftDim(const SliceBounds & in, int shiftBy)
{
    auto shifted = in;
    shifted.first[m_shiftDim]  += shiftBy; // begin bound
    shifted.second[m_shiftDim] += shiftBy; // end bound
    return shifted;
}
// Convert a TensorShape's (unsigned) dimensions to signed ints.
// Signed bounds are required because shifting can move slice bounds into negative ranges.
static SmallVector<int> ToIntDims(const TensorShape & shape)
{
    const auto & dims = shape.GetDims();
    SmallVector<int> signedDims;
    signedDims.append(dims.begin(), dims.end());
    return signedDims;
}
// determine shapes and slices to move
// This is used for both forward and backprop.
// 'In' below refers to Input(0) where 'Out' refers to the output of *this.
void DetermineSlices(size_t rank, const FrameRange & fr,
TensorShape & inShape, TensorShape & outShape, // our MB's shape
SliceBounds & inSliceLogical, SliceBounds & outSliceLogical) // the logical ranges to shift
{
// get the slice bounds for the given FrameRange
outShape = GetTensorShape(rank); // describes the full tensor including sequence and time dimensions
inShape = Input(0)->GetTensorShape(rank);
// determine the logical in and out slices
// This may now have bounds that fall outside, which we need to split off next.
outSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr, GetMBLayout());
inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset
}
// determine stripes to move w.r.t. main storage and from/to state
// For efficiency:
// - this function assumes that the return values have been freshly constructed (it won't reset them)
// - it may return a slice with end < begin which indicates an empty slice
void PartitionSlices(const SliceBounds & inSliceLogical, const SliceBounds & outSliceLogical, // the move we want to make
int T, // our actual size
SliceBounds & inSliceMain, SliceBounds & outSliceMain, // the part that goes main-to-main
SliceBounds & inSliceState, SliceBounds & outSliceState) // the part that goes from/to state
{
inSliceMain = inSliceLogical;
outSliceMain = outSliceLogical;
if (inSliceMain.first[m_shiftDim] < 0)
{
assert(inSliceMain.second[m_shiftDim] < T);
if (!m_state.empty()) // truncated BPTT case
{
// determine range that lives in state
SliceBounds inSliceOutside = inSliceMain; // beginning falls to the left of the MB
if (inSliceOutside.second[m_shiftDim] > 0)
inSliceOutside.second[m_shiftDim] = 0; // trim end; e.g. [-2,97) -> [-2,0), but [-2,-1) remains
// now inSliceOutside represents only the region that falls outside
// map to dimensions of our saved state
SliceBounds inSliceState = ShiftDim(inSliceOutside, m_state.m_shape[m_shiftDim]);
// E.g. for offset = -4, m_state will be 4 elements, so [-2,0) -> [2,4), and [-2,-1) -> [2,3)
// map to target dimensions
SliceBounds outSliceState = ShiftDim(inSliceOutside, -m_fromOffset);
assert(inSliceState == outSliceState); // (when we fall out on the left, both must be the same)
}
// else: no truncated BPTT means we must have a proper boundary. So don't write those values here, they will be initialized with boundary values below.
// and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end)
outSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim];
inSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim];
assert(inSliceMain.first[m_shiftDim] == 0);
}
else if (inSliceMain.second[m_shiftDim] > T)
{
if (!m_state.empty())
{
// determine range to get from state
SliceBounds inSliceOutside = inSliceMain;
if (inSliceOutside.first[m_shiftDim] < T)
inSliceOutside.first[m_shiftDim] = T; // trim end; e.g. [2,102) -> [100,102), but [101,102) remains
// now inSliceOutside is where we should copy from, with indices completely out of bounds
// map to dimensions of our saved state
SliceBounds inSliceState = ShiftDim(inSliceOutside, -T);
// E.g. for offset = 4, m_state will be 4 elements, so [100,102) -> [0,2), and [101,102) -> [1,2)
// map to target dimensions
SliceBounds outSliceState = ShiftDim(inSliceOutside, T - m_fromOffset);
// E.g. [0,2) -> [96,98), and [1,2) -> [97,98)
}
// and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end)
outSliceMain.first[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T);
inSliceMain.second[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T);
assert(inSliceMain.second[m_shiftDim] == T);
}
}
public:
virtual void ForwardProp(const FrameRange & fr) override
{
if (fr.GetIterationDimension() != m_shiftDimParam)
LogicError("ShiftNode::ForwardProp(): FrameRange not iterating over user-specified dimension.");
// for debugging, invalidate the output region, so we will catch if we missed to update something
#ifdef _DEBUG
ValueFor(fr).Invalidate();
#endif
// STEP 1: whole-sale copy a shifted version of the input to the output
// - consider the saved parts from the last minibatch as part of the input at dimensions beyond the bounds
// - ignore boundary conditions for now
// - ignore boundary conditions at this point (will be fixed subsequently)
// This will copy a little too much in case of multiple concatenated sequences within a single parallel sequence.
// get the tensors without shift
// get the logical ranges we want to shift
TensorShape inShape, outShape; // expanded tensor shapes of input and output
SliceBounds inSliceLogical, outSliceLogical; // the logical ranges to shift
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
DetermineSlices(rank, fr, inShape, outShape, inSliceLogical, outSliceLogical);
// shift the dimension in the input
// now copy the two stripes--one that is main-to-main, and one that pulls in data from previous state (truncated BPTT only)
// This correctly handles if input is a tensor with strides. This is currently not the case, but may be if we support in-place.
SliceBounds inSliceMain, outSliceMain; // main-to-main
SliceBounds inSliceState, outSliceState; // from state
PartitionSlices(inSliceLogical, outSliceLogical, outShape[m_shiftDim], inSliceMain, outSliceMain, inSliceState, outSliceState);
if (!inSliceState.first.empty() && inSliceState.second[m_shiftDim] > inSliceState.first[m_shiftDim])
{
// Note: If all sequences begin at the start of the range, this would copy invalid values which would be overwrittten below.
// This is prevented in that m_state will be set to empty in the previous MB if all sequences ended, which will in turn return an empty slice.
auto from = DataTensorFor(m_state.m_delayedValue, m_state.m_shape, inSliceState);
auto to = DataTensorFor(Value(), outShape, outSliceState);
to.AssignCopyOf(from);
}
if (inSliceMain.second[m_shiftDim] > inSliceMain.first[m_shiftDim])
{
auto from = DataTensorFor(Input(0)->Value(), inShape, inSliceMain);
auto to = DataTensorFor( Value(), outShape, outSliceMain);
to.AssignCopyOf(from);
}
// We have now pulled anything from within the logical bounds.
// Any frame that pulls from outside contains invalid values (either not initialized or copied from incorrect source), which must be fixed next.
// STEP 2: fix up the boundary conditions
// - fill in xxx
// - fill in all frames that are too close to boundary and must be filled from context (recurrent) or by replication (non-recurrent only)
// turn selected frame and shifted frame into a tensor
if (fr.IsAllFrames() || GetMBLayout()->IsBeyondStartOrEnd(fr.WithTimeOffset(m_fromOffset))) // short-cut test whether there is anything to do
{
auto ts = outSliceLogical.first[m_shiftDim];
auto te = outSliceLogical.second[m_shiftDim];
//size_t sequenceDim = outShape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. Code dup from TensorSliceWithMBLayoutFor(). Encapsulate this.
// iterate over all sequences in this batch and handle all that overlap with the target region
for (const auto & seq : GetMBLayout()->GetAllSequences())
{
if (seq.tEnd <= ts || seq.tBegin >= te) // no overlap--skip
continue;
// copy all that's in range
// get tensor to fill in. This may be out of bounds, and may only partially overlap with [ts,te)
auto seqLen = abs(m_fromOffset);
auto seqBegin = m_fromOffset < 0 ? seq.tBegin : seq.tBegin + seq.GetNumTimeSteps() - seqLen; // e.g. m_fromOffset = -4 -> [0,4) , +4 -> [Len-4,Len)
auto outSliceFill = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr.WithTimeOffset(seqBegin).WithTimeRange(seqLen).Sequence(seq.s), GetMBLayout());
// fix up all that is not
// get tensor to fill from
// We fill either from the provided boundary node or from ourselves (BoundaryMode::duplicate = clamp).
bool clamp = m_boundaryMode == BoundaryMode::duplicate;
ComputationNodeBasePtr boundaryNode = clamp ? shared_from_this() : Input(0);
auto boundaryShape = boundaryNode->GetTensorShape(rank);
auto fromSeq = clamp ?
seq.s :
boundaryNode->HasMBLayout() ?
boundaryNode->GetMBLayout()->FindSequence(seq.seqId).seqId :
SIZE_MAX;
auto fromBegin = 0;
auto boundarySliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(boundaryShape), fr.WithTimeOffset(fromBegin).WithTimeRange(seqLen).Sequence(fromSeq), GetMBLayout());
boundarySliceLogical;
//inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset
// clip against [ts,te)
// copy
sin(1);
}
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
// To allow for bulk gradient computation, we will clear out any gradient that should not be propagated.
// We do that directly to our incoming output gradient. This is OK because we own this, and it is no longer used after this operation
// (it is invalid to call BackpropTo() multiple times since it adds to the outgoing Input() gradient).
assert(inputIndex == 0); inputIndex;
fr;
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@ -177,46 +339,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(m_inputs.size() == 2);
ComputationNodeBase::Validate(isFinalValidationPass);
if (isFinalValidationPass)
sin(1.0f);
// MBLayout is just inherited
m_pMBLayout = Input(0)->GetMBLayout();
if (isFinalValidationPass && !m_pMBLayout)
InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str());
// determine final sample layout
auto inputSampleLayout = Input(0)->GetSampleLayout();
auto inputDims = inputSampleLayout.GetDims();
if (m_insertedDimParam < 0)
InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.",
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam);
m_insertExpandShapeAt = m_numSteps > 1 ? 0 : (m_insertedDimParam > 0 ? m_insertedDimParam - 1 : inputDims.size());
if (m_insertExpandShapeAt > inputDims.size())
if (isFinalValidationPass)
InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].",
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam, string(inputSampleLayout).c_str());
else
m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass
SmallVector<size_t> dims;
if (m_numSteps > 1 && inputDims.size() + 1 > dims.capacity())
InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?",
NodeName().c_str(), OperationName().c_str());
dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt);
if (m_numSteps > 1) // insert the new dimension if we expand into more than one step
dims.push_back(m_numSteps);
dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end());
auto sampleLayout = TensorShape(dims);
// as is the sample layout
SetDims(Input(0));
SetDims(sampleLayout, 0);
// determine the dimension that is to be shifted (convert user-specified as a zero-based index)
if (isFinalValidationPass)
{
size_t rank = DetermineElementwiseTensorRank();
auto valueShape = GetTensorShape(rank); // bounds of the Value()
m_shiftDim = m_shiftDimParam > 0 ? m_shiftDimParam - 1/*regular dimensions are specified as 1-based*/ : valueShape.size() + m_shiftDimParam/*-1 for time dimension*/;
}
}
// special interface for use by loop detection
virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override
{
if (m_boundaryMode != BoundaryMode::reachAcross)
if (m_boundaryMode != BoundaryMode::reachAcross) // duplicating boundary frames cannot be done with recurrence
return 0;
else if (m_fromOffset + (int)m_numSteps <= 0)
else if (m_fromOffset < 0)
return +1;
else if (m_fromOffset > 0)
return -1;
@ -231,48 +376,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
auto node = dynamic_pointer_cast<ShiftNode<ElemType>>(nodeP);
node->m_fromOffset = m_fromOffset;
node->m_numSteps = m_numSteps;
node->m_boundaryMode = m_boundaryMode;
node->m_shiftDimension = m_shiftDimension;
node->m_insertedDimParam = m_insertedDimParam;
node->m_insertExpandShapeAt = m_insertExpandShapeAt;
node->m_shiftDimParam = m_shiftDimParam;
node->m_shiftDim = m_shiftDim;
node->m_state = m_state;
}
}
class ShiftNodeState : public INodeState
{
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
vector<MBLayout::SequenceInfo> m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match).
public:
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
TensorShape m_shape; // tensor shape that describes m_delayedValue
vector<MBLayout::SequenceInfo> m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match).
ShiftNodeState(DEVICEID_TYPE deviceId) : m_delayedValue(deviceId) { }
bool empty() const { return m_delayedSequences.empty(); }
void clear() { m_delayedValue.Resize(0, 0); m_shape = TensorShape(); m_delayedSequences.clear(); }
};
typedef std::shared_ptr<ShiftNodeState> ShiftNodeStatePtr;
// state export/import
// This is done with a shared_ptr. The moment state is exported, the internal state is cleared; ownership is transferred to the exporting entity.
// This way, the next invocation does not overwrite the exported state, but is required to create a new one if needed.
// On the other hand, once imported, the state object is owned by the node and will be overwritten with the next state.
virtual NodeStatePtr ExportState() { return std::move(m_state); }
virtual void ImportState(NodeStatePtr && state) override
// This is done with a shared_ptr. The current state is exported, the internal state is cleared.
// Ownership of members is logically transferred to the exporting entity.
// Physically, however, since we often transfer between CPU and GPU, activation data is merely copied,
// and the GPU or CPU object resized to (0,0) without giving up the memory.
virtual NodeStatePtr ExportState() // TODO: can we instead pass the shared_ptr object in? So we don't need to create a new one all the time? Or should we still take ownership of the ptr?
{
m_state = dynamic_pointer_cast<ShiftNodeState>(state);
if (state && !m_state)
auto state = make_shared<ShiftNodeState>(CPUDEVICE);
state->m_delayedValue.SetValue(m_state.m_delayedValue); // note: this will transfer from GPU to CPU
m_state.m_delayedValue.Resize(0, 0);
state->m_shape = std::move(m_state.m_shape);
state->m_delayedSequences = std::move(m_state.m_delayedSequences);
return state;
}
virtual void ImportState(const NodeStatePtr & statep) override
{
ShiftNodeStatePtr state = dynamic_pointer_cast<ShiftNodeState>(statep);
if (!state)
LogicError("ImportState: Wrong state object passed (wrong type).");
m_state.m_delayedValue.SetValue(state->m_delayedValue); // note: this will transfer from CPU to GPU
state->m_delayedValue.Resize(0, 0);
m_state.m_shape = std::move(state->m_shape);
m_state.m_delayedSequences = std::move(state->m_delayedSequences);
}
protected:
// parameters remembered from construction
int m_fromOffset; // offset to pull from
int m_numSteps; // offset range
BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across, duplicate, or trim)
int m_shiftDimension; // dimension to shift (default: time)
int m_insertedDimParam; // in case of multiple steps, this is where a new dimension will be inserted
int m_fromOffset; // offset to pull from
BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across or duplicate)
int m_shiftDimParam; // dimension to shift (default: time)
// derived params set up in Validate()
size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index)
size_t m_shiftDim; // m_shiftDimParam matched to the real tensor index
ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to
ShiftNodeState m_state; // state that is carried over across evaluations
// Note: The version held by this node lives in the GPU, whereas the versions being exported carry CPU-side copies
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
};
// -----------------------------------------------------------------------
@ -333,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window)
// - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window)
// - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed)
// - support for Yongqiangs sub-minibatching with BPTT (export/import state)
// - support for Yongqiangs sub-minibatching with truncated BPTT (export/import state)
// - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch)
// -----------------------------------------------------------------------
@ -486,7 +644,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
// In BPTT, we carry over left-to-right state across minibatches.
// In truncated BPTT, we carry over left-to-right state across minibatches.
// It is kept in m_delayedValue, m_delayedActivationMBLayout.
// This could be optimized as follows:
// - only keep the required number of frames (m_timeStep)
@ -620,27 +778,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (dir == -1) // we look into past
{
#if 0
bool allAtBoundary = true;
// if the current last frames are all sentence end or no feature , there is no need to carry on state info
if (m_pMBLayout->Is(FrameRange(nT-1), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
{
for (size_t u = 0; u < nU; u++)
{
if (!m_pMBLayout->Is(FrameRange(nT - 1).Sequence(u), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
{
allAtBoundary = false;
break;
}
}
}
else
{
allAtBoundary = false;
}
if (allAtBoundary)
#endif
if (!m_pMBLayout->HasSequenceBeyondEnd()) // only need to export state if anything crosses the MB boundary
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
@ -655,26 +792,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pExportedState = pState;
}
}
if (dir == 1) // we look into future
else if (dir == 1) // we look into future
{
#if 0
// TODO: check whether all at boundary and don't carry state if it is the case
size_t nT = m_pMBLayout->GetNumTimeSteps();
size_t nU = m_pMBLayout->GetNumParallelSequences();
bool allAtBoundary = true;
if (m_pMBLayout->Is(FrameRange(nullptr, 0), MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart))
{
for (size_t u = 0; u < nU; u++)
{
if (!m_pMBLayout->Is(FrameRange(nullptr, 0).Sequence(u), MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature))
{
allAtBoundary = false;
break;
}
}
}
if (allAtBoundary)
#endif
if (!m_pMBLayout->HasSequenceBeyondBegin()) // only need to export state if anything crosses the MB boundary
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
@ -689,19 +808,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pExportedState = pState;
}
}
if (dir != -1 && dir != 1)
else
{
RuntimeError("Unrecognized direction in DelayedValueNodeBase");
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
return pExportedState;
}
virtual void /*IStatefulNode::*/ImportState(NodeStatePtr && pImportedState) override
virtual void /*IStatefulNode::*/ImportState(const NodeStatePtr & pImportedState) override
{
DelayedNodeStatePtr pState = dynamic_pointer_cast<DelayedValueNodeState<ElemType>> (pImportedState);
if (!pState)
RuntimeError("Expecting DelayValueNodeState after down casting");
LogicError("Expecting DelayValueNodeState after downcasting");
pState->ExportDelayedMBLayout(m_delayedActivationMBLayout); // pstate copy to m_delayedActivationMBLayout
if (pState->IsEmpty())
@ -715,18 +834,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int dir = direction;
if (dir == -1) // looking backward
{
m_delayedValue.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU);
}
if (dir == 1)
{
//m_delayedValue.CopyColumnsStrided(delayedActivation, nU, 1, nT);
else if (dir == 1)
m_delayedValue.SetColumnSlice(delayedActivation, 0, nU);
}
if (dir != -1 && dir == 1)
{// it is really a compile error ?
RuntimeError("Unrecognized direction in DelayedValueNodeBase");
}
else
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
protected:

Просмотреть файл

@ -1234,8 +1234,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (inputIndex == 1)
{
BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
FrameRange fr(Input(0)->GetMBLayout());
BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr);
#ifdef _DEBUG
Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout()));
#endif
@ -1368,14 +1371,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
}
// Release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
}
//request matrices needed to do node function value evaluation
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
}
// TODO: method names should be CamelCase
std::vector<shared_ptr<const msra::dbn::latticepair>> * getLatticePtr()
@ -1415,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_doReferenceAlignment = doreferencealign;
}
void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR)
{
msra::lattices::SeqGammarCalParam param;
param.amf = amf;
param.lmf = lmf;
param.wp = wp;
param.bMMIfactor = bMMIfactor;
param.sMBRmode = sMBR;
m_gammaCalculator.SetGammarCalculationParams(param);
}
void gettime(unsigned long long &gammatime, unsigned long long &partialtime)
{
gammatime = m_gammatime;
@ -1427,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_gammaFromLattice;
double m_frameDropThreshold;
double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside?
double m_seqGammarAMF;
double m_seqGammarLMF;
double m_seqGammarWP;
double m_seqGammarbMMIFactor;
double m_seqGammarUsesMBR;
bool m_doReferenceAlignment;
std::vector<shared_ptr<const msra::dbn::latticepair>> m_lattices;
msra::asr::simplesenonehmm m_hmm;

Просмотреть файл

@ -74,7 +74,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
@ -102,7 +102,7 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -137,7 +137,7 @@ struct GridDim
std::vector<cudaDeviceProp> props(numDevices);
for (int i = 0; i < numDevices; i++)
CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1 // on Linux, maxGridSize[0] gets reported as 0
#if 0 // on Linux, maxGridSize[0] gets reported as 0
for (int i = 0; i < numDevices; i++)
fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif

Просмотреть файл

@ -2246,7 +2246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
slice.m_computeDevice = m_computeDevice;
slice.m_numRows = m_numRows;
slice.m_numCols = numCols;
slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_nz = ( numCols == m_numCols ) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_elemSizeAllocated = m_elemSizeAllocated;
slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated;
slice.m_pArray = m_pArray;

Просмотреть файл

@ -87,9 +87,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0));
}
// TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated.
size_t MajorIndexCount() const
{
return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
}
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
{
@ -113,6 +114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return MajorIndexLocation() + m_numRows;
else
return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset;
//return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset;
}
size_t SecondaryIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const
{

Просмотреть файл

@ -79,7 +79,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
@ -127,7 +127,7 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>

Просмотреть файл

@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
Matrix<ElemType>& functionValues,
const ElemType learnRatePerSample,
const ElemType momentum,
const bool useNesterovMomentum
)
{
DecideAndMoveToRightDevice(*this, gradients, functionValues);
DISPATCH_MATRIX_ON_FLAG(&gradients,
if (!useNesterovMomentum)
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues)
);
}
else
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
{/* CPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
// w_t = w_{t-1} - momentum * v_ {t-1} - (1-momentum)*learnRatePerSampele*gardient,
},
{/* GPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
},
{ /* CPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
}
},
{ /* GPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
}
}
);
}
}
//both this and gradients will be changed

Просмотреть файл

@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ShiftBy(int numShift);
// TODO: all these scalars should be passed as doubles and cast down inside
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNAG);
ElemType Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier);
void FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

Просмотреть файл

@ -237,8 +237,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
@ -257,8 +257,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 3> offsets;
array<SmallVector<ptrdiff_t>, 3> regularStrides, reducingStrides;
@ -275,8 +275,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 4> offsets;
array<SmallVector<ptrdiff_t>, 4> regularStrides, reducingStrides;

Просмотреть файл

@ -356,26 +356,39 @@ struct latticefunctionskernels
const size_t te = ts + numframes; // end time of current unit
size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1
//size_t state1stepm1to1 = te;
size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2
//size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2
size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2
size_t state2step0to2 = te;
//now we only support transition from -1 to 0 or 2 for sil
float pathscore0 = fwscore ; // log pp in state 0
float pathscore1 = LOGZERO; // log pp in state 1
float pathscore2 = LOGZERO; // log pp in state 2
if(isSil)
pathscore2 = fwscore;
float pathscore0 = fwscore; // log pp in state 0
float pathscore1 = fwscore; // log pp in state 1
float pathscore2 = fwscore; // log pp in state 2
// first frame
if (ts != te) // for t = ts, initialization
{
if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted
/* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted
{
pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts);
pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts);
}
else //for others, only -1 to 0 is permitted
pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway
else //for others, only -1 to 0 is permitted
{
pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
}*/
pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts);
pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
//state1stepm1to1 = ts;
pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
}
@ -399,17 +412,22 @@ struct latticefunctionskernels
{
pathscore2 = pathscore12;
state2step0to1 = state1step0to1; // record the inflection point
//state2stepm1to1 = state1stepm1to1;
state2step1to2 = t; // record the inflection point
state2step0to2 = te;
if (isSil)
backptrmatrix (2, t-ts-1) = 1;
}
if (isSil) // only silence have path from 0 to 2
//if (isSil) // only silence have path from 0 to 2
{
const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2
if (pathscore02 >= pathscore2) // if state 0->2
{
pathscore2 = pathscore02;
backptrmatrix (2, t-ts-1) = 0;
if (isSil)
backptrmatrix (2, t-ts-1) = 0;
state2step0to2 = t;
state2step1to2 = te;
}
}
@ -422,9 +440,11 @@ struct latticefunctionskernels
{
pathscore1 = pathscore01;
state1step0to1 = t; // record the inflection point
//state1stepm1to1 = te;
if (isSil)
backptrmatrix (1, t-ts-1) = 0;
}
if (isSil) // only silence have path from 2 to 1
{
const float pathscore21 = pathscore2last + getlogtransp(transP,2,1);
@ -495,19 +515,35 @@ struct latticefunctionskernels
if (!isSil)
{
state2step0to1 += alignindex - ts; // convert to align measure
state2step1to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (t < state2step0to1) // in state 0
senoneid = senoneid0;
else if(t < state2step1to2) // in state 1
senoneid = senoneid1;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short) senoneid;
}
if (state2step0to2 < te) //from 0 to 2
{
state2step0to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (t < state2step0to2) // in state 0
senoneid = senoneid0;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short)senoneid;
}
}
else //from 1 to 2
{
state2step0to1 += alignindex - ts; // convert to align measure
state2step1to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (state2step0to1 <alignindex - ts + te && t < state2step0to1)
senoneid = senoneid0;
else if(t < state2step1to2) // in state 1
senoneid = senoneid1;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short) senoneid;
}
}
}
else // for silence
{

Просмотреть файл

@ -70,7 +70,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -100,7 +100,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
</Link>
@ -115,7 +115,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -133,7 +133,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
@ -152,7 +152,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
vector<wstring> scriptpaths;
vector<wstring> RootPathInScripts;
wstring RootPathInLatticeTocs;
vector<wstring> mlfpaths;
vector<vector<wstring>>mlfpathsmulti;
size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing
@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
expand_wildcards(thisLattice(L"numLatTocFile"), paths);
latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end());
}
RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L"");
}
//get HMM related file names
@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!_wcsicmp(readMethod.c_str(), L"blockRandomize"))
{
// construct all the parameters we don't need, but need to be passed to the constructor...
m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap()));
m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs));
m_lattices->setverbosity(m_verbosity);
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode));
@ -941,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (!skip)
{
// a stopgap
if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i])
{
// BUGBUG: we just found that (due to some bugs yet to be tracked down),
// the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
// This is just a stopgap, to be removed after the bugs are found and fixed
bool needRenew = true;
while (needRenew)
{
size_t framenum = m_numFramesToProcess[i];
fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
(int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str());
ReNewBufferForMultiIO(i);
needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i];
}
}
m_numValidFrames[i] = m_numFramesToProcess[i];
if (m_numValidFrames[i] > 0)
{
@ -972,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_extraNumSeqs = 0;
if (!m_frameMode)
{
// insert extra utterances to parallel sequences that have enough space left
// As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in.
size_t nextMinibatchUttnum = 0;
bool inserted;
// The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB.
// For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
for (size_t i = 0; i < m_numSeqsPerMB; i++)
for (size_t src = 0; src < m_numSeqsPerMB; )
{
while (nextMinibatchUttnum <= i)
size_t framenum = m_numFramesToProcess[src];
if (framenum == 0)
{
size_t framenum = m_numFramesToProcess[i];
inserted = false;
if (framenum > 0) // non-empty entry: see were it fits
{
// greedily search for a parallel sequence with enough space at the end to insert this utterance
for (size_t j = 0; j < m_numSeqsPerMB; j++)
{
if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps)
{
// enough space: insert it as parallel sequence [j] (instead of [i] in the next MB)
m_extraSeqsPerMB.push_back(j);
if (m_latticeBufferMultiUtt[i] != nullptr)
{
m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]);
m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]);
m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]);
}
fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i);
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum);
src++;
continue;
}
if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum)
{
// BUGBUG: we just found that (due to some bugs yet to be tracked down),
// the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
// This is just a stopgap, to be removed after the bugs are found and fixed
fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
(int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str());
src++;
continue;
}
// consume it
ReNewBufferForMultiIO(i); // replace current [i] with a new one; then try again with this new one at [i]
m_numValidFrames[j] += framenum;
m_extraNumSeqs++;
inserted = true;
break;
}
bool slotFound = false;
for (size_t des = 0; des < m_numSeqsPerMB; des++) // try to found a slot
{
if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps)
{ // found !
m_extraSeqsPerMB.push_back(des);
if (m_latticeBufferMultiUtt[src] != nullptr)
{
m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]);
m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]);
m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]);
}
fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src);
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum);
ReNewBufferForMultiIO(src);
m_numValidFrames[des] += framenum;
m_extraNumSeqs++;
slotFound = true;
break;
}
if (!inserted)
{
nextMinibatchUttnum++; // didn't fit anywhere: done with entry [i]
}
}
if (!slotFound)
{
src++; // done with this source; try next source;
}
}

Просмотреть файл

@ -32,6 +32,9 @@ private:
intargvector m_numSeqsPerMBForAllEpochs;
size_t m_numSeqsPerMB; // requested number of parallel sequences
size_t m_mbNumTimeSteps; // number of time steps to fill/filled (note: for frame randomization, this the #frames, and not 1 as later reported)
size_t m_mbMaxNumTimeSteps; // max time steps we take in a MB layout; any setence longer than this max will be discarded (and a warning will be issued )
// this is used to prevent CUDA out-of memory errors
vector<size_t> m_numFramesToProcess; // [seq index] number of frames available (left to return) in each parallel sequence
vector<size_t> m_switchFrame; /// TODO: something like the position where a new sequence starts; still supported?
vector<size_t> m_numValidFrames; // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.

Просмотреть файл

@ -69,7 +69,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
@ -87,7 +87,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -75,7 +75,7 @@
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;$(OpenCVLib);%(AdditionalDependencies)</AdditionalDependencies>
</Link>

Просмотреть файл

@ -71,7 +71,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -92,7 +92,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -71,7 +71,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -92,7 +92,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -70,7 +70,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -91,7 +91,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -107,7 +107,7 @@
<AdditionalIncludeDirectories>..\..\common\include;..\..\Source\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -124,7 +124,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
@ -144,7 +144,7 @@
<AdditionalIncludeDirectories>..\..\common\include;..\..\Source\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using SGDBase::m_L2RegWeight;
using SGDBase::m_L1RegWeight;
using SGDBase::m_needAveMultiplier;
using SGDBase::m_useNesterovMomentum;
using SGDBase::m_traceLevel;
using SGDBase::m_numMBsToShowResult;
using SGDBase::m_gradientCheckSigDigit;
@ -392,8 +393,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_loadBestModel)
{
encoderNet->ReloadPersistableParameters<ElemType>(GetEncoderModelNameForEpoch(i - 1));
decoderNet->ReloadPersistableParameters<ElemType>(GetDecoderModelNameForEpoch(i - 1));
encoderNet->RereadPersistableParameters<ElemType>(GetEncoderModelNameForEpoch(i - 1));
decoderNet->RereadPersistableParameters<ElemType>(GetDecoderModelNameForEpoch(i - 1));
size_t dummyMinibatchSize = 0;
this->LoadCheckPointInfo(i - 1,
@ -721,7 +722,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//persist model and check-point info
for (size_t k = 0; k < iNumNetworks; k++)
{
nets[k]->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
nets[k]->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
nets[k]->ResetEvalTimeStamps();
}
@ -930,7 +931,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum);
}
}
}

Просмотреть файл

@ -310,7 +310,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// likewise for sequence training parameters
if (isSequenceTrainingCriterion)
{
ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign);
ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign,
m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR );
}
// --- MAIN EPOCH LOOP
@ -519,6 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
{
g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank());
g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank());
}
bool loadedPrevModel = false;
@ -543,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
net->ReloadPersistableParameters<ElemType>(bestModelPath);
net->RereadPersistableParameters<ElemType>(bestModelPath);
LoadCheckPointInfo(i - m_learnRateAdjustInterval,
/*out*/ totalSamplesSeen,
/*out*/ learnRatePerSample,
@ -771,13 +773,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
size_t numSubminibatchesNeeded = 0;
if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled
if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled
{
// into how many pieces would we need to break the minibatch?
// TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
size_t estimatedMBSize = tunedMBSize * numParallelSequences;
numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM);
if (m_maxSamplesInRAM < SIZE_MAX)
{
// into how many pieces would we need to break the minibatch?
// TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
size_t estimatedMBSize = tunedMBSize * numParallelSequences;
numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM);
}
if (m_numSubminiBatches > 1)
{
numSubminibatchesNeeded = m_numSubminiBatches;
}
}
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
@ -807,7 +816,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (numSubminibatchesNeeded > 1)
{
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
if (m_maxSamplesInRAM < SIZE_MAX)
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
else
fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded);
}
fprintf(stderr, ".\n");
@ -998,7 +1010,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UpdateWeights(node, smoothedGradient, learnRatePerSample,
GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples,
m_L2RegWeight, m_L1RegWeight,
m_needAveMultiplier);
m_needAveMultiplier, m_useNesterovMomentum);
#ifdef _DEBUG
if (dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): "))
LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str());
@ -1438,7 +1450,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
int baseModelEpoch = epochNumber - 1;
net->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
double learnRate = learnRatePerSample;
size_t dummyMinibatchSize = 0;
@ -1598,7 +1610,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
int baseModelEpoch = epochNumber - 1;
net->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
double dummyLearnRate;
double dummtPrevCriterion;
@ -2029,7 +2041,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t actualMBSize,
const double L2RegWeight,
const double L1RegWeight,
const bool needAveMultiplier)
const bool needAveMultiplier,
const bool useNesterovMomentum
)
{
// we use simple linear (instead of log linear) scaling here
const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
@ -2070,7 +2084,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (adpType == GradientsUpdateType::None)
{
smoothedGradient.NormalGrad(gradientValues, functionValues,
(ElemType)learnRatePerSample, (ElemType)momentum);
(ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum);
}
else if (adpType == GradientsUpdateType::AdaGrad ||
(adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
@ -2120,7 +2134,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const double momentumPerSample,
const size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier) const
const bool needAveMultiplier,
const bool useNesterovMomentum
) const
{
#if DUMPOUTPUT
fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
@ -2131,7 +2147,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UpdateWeightsS(this, dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(), dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Gradient(),
smoothedGradient, learnRatePerSample, momentumPerSample,
actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier);
needAveMultiplier, m_useNesterovMomentum);
node->BumpEvalTimeStamp();
}
@ -2501,6 +2517,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector<int>{ 256 })));
m_truncated = configSGD(L"truncated", false);
m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX);
m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1);
// the number of samples in each epoch (0 means, use all the samples in each epoch).
m_epochSize = configSGD(L"epochSize", (size_t)0);
@ -2520,6 +2537,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));
bool useNesterovMomentum = configSGD(L"useNAG", false);
m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0);
@ -2534,6 +2553,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95);
m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10);
m_doReferenceAlign = configSGD(L"doReferenceAlign", false);
m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false);
m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0);
m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0);
m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0);
m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0);
m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector<float>{ 0.0f })));
@ -2639,6 +2663,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_momentumParam = floatargvector(L"0.9");
m_momentumSpecifiedForMBSize = m_mbSize;
}
m_useNesterovMomentum = useNesterovMomentum;
for (int i = 0; i < m_momentumParam.size(); i++)
{
if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0)

Просмотреть файл

@ -111,6 +111,7 @@ protected:
intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB
floatargvector m_momentumParam;
intargvector m_momentumSpecifiedForMBSize;
bool m_useNesterovMomentum;
// Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value.
// MB size is the number of samples across all time steps and parallel sequences.
@ -157,7 +158,11 @@ protected:
// To mitigate this issue, we adopt a sub-minibatch implementation, where
// each m_mbSize[epoch] is divided into a few sub-minibatches, each of which is no larger than m_maxSamplesInRAM;
// a forward-backward pass is performed for each sub-minibatch, and a model update is performed after each full minibatch
size_t m_numSubminiBatches;
// Alternative way to specify how to split minibatches into sub-minibatches.
// The default is 1, which means no sub-minibatching is used.
// If m_maxSamplesInRAM == SIZE_MAX (i.e. the user did not specify that option) and m_numSubminiBatches > 1,
// each minibatch is divided into m_numSubminiBatches sub-minibatches.
// the number of samples in each epoch (0 means, use all the samples in each epoch).
size_t m_epochSize;
@ -245,6 +250,11 @@ protected:
double m_hSmoothingWeight;
double m_frameDropThresh;
bool m_doReferenceAlign;
double m_seqGammarCalcAMF;
double m_seqGammarCalcLMF;
double m_seqGammarCalcWP;
double m_seqGammarCalcbMMIFactor;
bool m_seqGammarCalcUsesMBR;
};
template<class ElemType> class IDistGradAggregator;
@ -436,7 +446,9 @@ public:
size_t actualMBSize,
const double L2RegWeight,
const double L1RegWeight,
const bool needAveMultiplier);
const bool needAveMultiplier,
const bool useNesterovMomentum
);
protected:
// UpdateWeights - update the weights in
@ -446,7 +458,8 @@ protected:
const double momentumPerSample,
const size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier) const;
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;

Просмотреть файл

@ -44,7 +44,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>

Просмотреть файл

@ -11,6 +11,23 @@
#pragma warning (disable: 4127) // conditional expression is constant
namespace msra { namespace lattices {
// Bundle of user-configurable parameters for sequence-training gamma
// (posterior) calculation. Defaults mirror the historical hard-coded values.
struct SeqGammarCalParam
{
    double amf = 14.0;        // acoustic-model scaling factor
    double lmf = 14.0;        // language-model scaling factor
    double wp = 0.0;          // word-insertion penalty
    double bMMIfactor = 0.0;  // boosted-MMI factor (0 disables boosting)
    bool sMBRmode = false;    // true selects sMBR-style criterion
};
template<class ElemType>
class GammaCalculation
{
@ -19,9 +36,9 @@ namespace msra { namespace lattices {
// Default constructor: cpumode disabled, calculator not yet initialized,
// and baseline LM/AM scaling factors set.
GammaCalculation() : cpumode(false)
{
    initialmark = false;
    // NOTE(review): lmf and amf are each assigned twice below (14.0f, then
    // 7.0f). This looks like merge/diff residue -- only the second assignment
    // of each pair takes effect. Confirm which default is intended.
    lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable
    lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable
    wp = 0.0f;   // word-insertion penalty off by default
    amf = 14.0f;
    amf = 7.0f;
    boostmmifactor = 0.0f; // boosted-MMI disabled by default
    seqsMBRmode = false;   // sMBR-style sequence training off by default
}
@ -30,6 +47,9 @@ namespace msra { namespace lattices {
}
//========================================
// Sec. 1 init functions
//========================================
void init(msra::asr::simplesenonehmm hset, int DeviceId)
{
m_deviceid = DeviceId;
@ -47,7 +67,21 @@ namespace msra { namespace lattices {
}
}
//========================================
// Sec. 2 set functions
//========================================
// Copy user-specified sequence-training settings from the config-carrying
// parameter struct into this calculator's state. The double-valued config
// fields are narrowed to the float members used during computation.
void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam)
{
    amf = static_cast<float>(gammarParam.amf);
    lmf = static_cast<float>(gammarParam.lmf);
    wp = static_cast<float>(gammarParam.wp);
    boostmmifactor = static_cast<float>(gammarParam.bMMIfactor);
    seqsMBRmode = gammarParam.sMBRmode;
}
//========================================
// Sec. 3 calculation functions
//========================================
void calgammaformb( Microsoft::MSR::CNTK::Matrix<ElemType>& functionValues,
std::vector<shared_ptr<const msra::dbn::latticepair>> &lattices,
const Microsoft::MSR::CNTK::Matrix<ElemType>& loglikelihood,

Просмотреть файл

@ -442,6 +442,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
LogicError("invalid backpointer resulting in state index out of range");
int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other)
//thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js);
if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode)
for (size_t i = js; i < je; i++)
loggammas(i,t) = ((int) i == j) ? 0.0f : LOGZERO;

Просмотреть файл

@ -743,8 +743,8 @@ namespace msra { namespace lattices {
double totalfwscore = 0.0f;
if (!parallelstate->emulation)
{
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
if (verbosity>=2)
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

Просмотреть файл

@ -67,7 +67,7 @@ speechTrain = [
// LSTM cell
# TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, numSteps=1, insertedDim=2)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1)
PastValue1 = PastValue
#PastValue1 = PastValueShift
dh = PastValue1(outputDim, output); // hidden state(t-1)

Просмотреть файл

@ -56,6 +56,9 @@ makebuildinfo()
if [ ! -z "$CUB_PATH" ]; then
printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target
fi
if [ ! -z "$CUDNN_PATH" ]; then
printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target
fi
printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target
printf "#endif\n" >> $target
}