Merge branch 'master' of https://git.codeplex.com/cntk

Conflicts:
	.gitignore
	Common/Include/fileutil.h

Commit 0443dc818d
@@ -166,3 +166,4 @@ bin/
 LOG
 *.log
 core
+*.lyx#
@@ -31,6 +31,12 @@
 #endif//__WINDOWS__
 #include <stdio.h>
 
+#ifdef MPI_SUPPORT
+#include "mpi.h"
+#endif
+extern int myRank;
+extern int numProcs;
+
 // ---------------------------------------------------------------------------
 // BestGpu class
 // ---------------------------------------------------------------------------
@@ -86,6 +92,7 @@ public:
 void Init();
 void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
 bool DeviceAllowed(int device);
+void DisallowDevice(int device) { m_allowedDevices &= ~(1 << device); }
 void AllowAll(); // reset to allow all GPUs (no allowed list)
 bool UseMultiple(); // using multiple GPUs?
 int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
@@ -120,8 +127,39 @@ DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
 }
 if (!_stricmp(val.c_str(), "Auto"))
 {
+#ifdef MPI_SUPPORT
+// make sure deviceId is unique among processes on the same machine
+g_bestGpu->AllowAll();
+std::string MyName(getenv("COMPUTERNAME"));
+for (int i = 0; i < numProcs; i++)
+{
+DEVICEID_TYPE yourDeviceId = deviceId;
+if (myRank == i)
+{
+std::vector<int> devices = g_bestGpu->GetDevices(1);
+deviceId = yourDeviceId = (DEVICEID_TYPE)devices[0];
+}
+MPI_Bcast(&yourDeviceId, 1, MPI_INT, i, MPI_COMM_WORLD);
+{
+INT32 YourSize = (INT32)MyName.length();
+MPI_Bcast(&YourSize,1,MPI_INT,i,MPI_COMM_WORLD);
+vector<char> YourName(YourSize+1);
+if (myRank == i)
+copy(MyName.begin(), MyName.end(), YourName.begin());
+MPI_Bcast(YourName.data(), YourSize + 1, MPI_CHAR, i, MPI_COMM_WORLD);
+if (myRank != i)
+{
+if (!_strcmpi(MyName.data(), YourName.data()))
+{
+g_bestGpu->DisallowDevice(yourDeviceId);
+}
+}
+}
+}
+#else
 std::vector<int> devices = g_bestGpu->GetDevices(1);
 deviceId = (DEVICEID_TYPE)devices[0];
+#endif
 }
 else if (!_stricmp(val.c_str(), "All"))
 {
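The MPI branch above makes the "Auto" device choice unique per machine: each rank in turn picks the best remaining GPU, broadcasts the chosen device id and its COMPUTERNAME, and every other rank on the same host then excludes that device. A minimal standalone sketch of the same round-robin scheme (pickBestFreeGpu and markGpuTaken are hypothetical stand-ins for BestGpu::GetDevices and BestGpu::DisallowDevice, not the CNTK API):

#include <mpi.h>
#include <cstring>
#include <string>
#include <vector>

int pickBestFreeGpu();          // hypothetical: best GPU not yet excluded on this host
void markGpuTaken(int device);  // hypothetical: exclude a device from future picks

// Sketch: give every MPI rank on the same host a distinct GPU, mirroring the loop above.
int AssignGpuPerRank(int myRank, int numProcs, const std::string& myHost)
{
    int myDevice = -1;
    for (int i = 0; i < numProcs; i++)
    {
        int theirDevice = myDevice;
        if (myRank == i)
            theirDevice = myDevice = pickBestFreeGpu(); // rank i chooses now
        MPI_Bcast(&theirDevice, 1, MPI_INT, i, MPI_COMM_WORLD);

        // broadcast rank i's host name so the others can tell whether they share it
        int len = (int)myHost.length();
        MPI_Bcast(&len, 1, MPI_INT, i, MPI_COMM_WORLD);
        std::vector<char> theirHost(len + 1, '\0');
        if (myRank == i)
            std::memcpy(theirHost.data(), myHost.c_str(), len);
        MPI_Bcast(theirHost.data(), len + 1, MPI_CHAR, i, MPI_COMM_WORLD);

        if (myRank != i && myHost == theirHost.data())
            markGpuTaken(theirDevice); // same machine: that GPU is no longer free
    }
    return myDevice;
}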
@@ -466,6 +504,9 @@ void BestGpu::QueryNvmlData()
 }
 }
 
+if (curPd == NULL)
+continue;
+
 // Get the memory usage, will only work for TCC drivers
 result = nvmlDeviceGetMemoryInfo(device, &memory);
 if (NVML_SUCCESS != result)
@@ -379,6 +379,8 @@ template <> const wchar_t* GetFormatString(float);
 template <> const wchar_t* GetFormatString(double);
+template <> const wchar_t* GetFormatString(size_t);
+template <> const wchar_t* GetFormatString(long long);
 template <> const wchar_t* GetFormatString(const char*);
 template <> const wchar_t* GetFormatString(const wchar_t*);
 
 // GetScanFormatString - get the format string for a particular type
 template <typename T>
@@ -78,8 +78,10 @@ template <> const wchar_t* GetFormatString(unsigned int) {return L" %u";}
 //template <> const wchar_t* GetFormatString(unsigned long) {return L" %lu";}
 template <> const wchar_t* GetFormatString(float) {return L" %.9g";}
 template <> const wchar_t* GetFormatString(double) {return L" %.17g";}
-template <> const wchar_t* GetFormatString(size_t) { return L" %llu"; }
+template <> const wchar_t* GetFormatString(size_t) {return L" %llu";}
+template <> const wchar_t* GetFormatString(long long) {return L" %lli";}
 template <> const wchar_t* GetFormatString(const char*) {return L" %hs";}
 template <> const wchar_t* GetFormatString(const wchar_t*) {return L" %ls";}
 
 // ----------------------------------------------------------------------------
 // fgetText() specializations for fwscanf differences: get a value from a text file
@@ -50,6 +50,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_convertLabelsToTargets = false;
 
 m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
+
+if (m_numberOfuttsPerMinibatch < 1)
+{
+LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+}
+
+if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
+{
+LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
+}
+
 m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
 m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
 m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
@@ -64,6 +64,7 @@ template<class ElemType>
 void CNTKEval<ElemType>::LoadModel(const std::wstring& modelFileName)
 {
 DEVICEID_TYPE deviceId = DeviceFromConfig(m_config);
+fprintf(stderr, "DeviceID=%d\n", (int)deviceId);
 if (m_net != NULL)
 delete m_net;
 m_net = new ComputationNetwork<ElemType>(deviceId);
@@ -118,9 +118,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 
 //dump all nodes in the network to file
-void DumpAllNodesToFile(const bool printValues, const std::wstring outputFile)
+void DumpAllNodesToFile(const bool printValues, const std::wstring outputFile, const bool validateBeforeDump = true)
 {
-ValidateNetwork(); //some internal values in the nodes are computed during validation
+if (validateBeforeDump)
+ValidateNetwork(); //some internal values in the nodes are computed during validation
 
 File fstream(outputFile, FileOptions::fileOptionsText | FileOptions::fileOptionsWrite);
 
@@ -1745,8 +1746,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 for (ComputationNodePtr node : FinalCriterionNodes())
 {
-PrintComputationTree(node, false);
+if(!allowFragment) FormRecurentLoops(node);
 PrintComputationTree(node, false);
 size_t actualMBSize = this->GetActualMBSize();
 this->SetActualMiniBatchSize(actualMBSize);
 ValidateNetwork(node);
@@ -1759,8 +1760,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // now output nodes
 if (OutputNodes().size() > 0)
 {
-for (ComputationNodePtr node : OutputNodes())
-ValidateNetwork(node);
+for (ComputationNodePtr node : OutputNodes())
+{
+if (!allowFragment) FormRecurentLoops(node);
+ValidateNetwork(node);
+}
 }
 else if (!allowFragment)
 {
@@ -1769,8 +1773,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // now evaluation nodes
 if (EvaluationNodes().size() > 0)
 {
-for (ComputationNodePtr node : EvaluationNodes())
-ValidateNetwork(node);
+for (ComputationNodePtr node : EvaluationNodes())
+{
+if (!allowFragment) FormRecurentLoops(node);
+ValidateNetwork(node);
+}
 }
 }
 
@@ -2039,6 +2046,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 std::vector<ComputationNodePtr> sourceLoopNodes;
 getStrongSCC(rootNode);
 std::list<ComputationNodePtr>& nodes = GetEvalOrder(rootNode, sourceLoopNodes);
+std::list<ComputationNodePtr> nodesForGrad;
 
 /// debug purpose
 for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++)
@@ -2080,7 +2088,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++)
 {
 // sort the recurrent nodes in their ascending name, which is the same as visiting nodes in G^R
-if ((*iter).m_recurrentNodes.size() > 1 && (*iter).m_recurrentNodesForForward.size() == 0)
+(*iter).m_recurrentNodesForForward.clear();
+if ((*iter).m_recurrentNodes.size() > 1)
 {
 std::list<ComputationNodePtr> result;
 std::unordered_set<ComputationNodePtr> visited;
@@ -2112,7 +2121,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 result.pop_front();
 }
 
-
+(*iter).m_recurrentNodes = (*iter).m_recurrentNodesForForward;
 }
 }
 
@@ -2124,12 +2133,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 std::list<ComputationNodePtr> noRecurrentNodes;
 
 noRecurrentNodes = rootNode->ReshuffleNodes(recurrentNodes);
-
-ReorderLoops(nodes, recurrentNodes, noRecurrentNodes);
+
+nodes.sort(IsSmaller);
+
+ReorderLoops(nodes, recurrentNodes, noRecurrentNodes);
+
 m_cacheEvalOrders[rootNode] = nodes;
 nodesForGrad = nodes;
 nodesForGrad.reverse();
 m_cacheGradientCalcOrders[rootNode] = nodesForGrad;
 
 #ifdef DISPLAY_DEBUG
 fprintf(stderr, "Reordered nodes\n");
@@ -2149,13 +2161,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 std::list<ComputationNodePtr> vTmp;
 std::list<ComputationNodePtr> vRecurrentTmp;
-int prevId = -1;
+//int prevId = -1;
+vector<bool> accessed;
+accessed.assign(m_recurrentInfo.size(),false);
 for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++)
 {
 int iId = FindInRecurrentLoop(*nodeIter);
 if (iId >= 0)
 {
-if (prevId != iId && vRecurrentTmp.size() > 0)
+
+if (! accessed[iId])
+{
+newList.insert(newList.end(), m_recurrentInfo[iId].m_recurrentNodes.begin(), m_recurrentInfo[iId].m_recurrentNodes.end());
+accessed[iId] = true;
+}
+
+/*if (prevId != iId && vRecurrentTmp.size() > 0)
 {
 newList.insert(newList.end(), vRecurrentTmp.begin(), vRecurrentTmp.end());
 vRecurrentTmp.clear();
@@ -2169,11 +2190,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 vRecurrentTmp.push_back(*nodeIter);
 
-prevId = iId;
+prevId = iId;*/
 }
 else
 {
-vTmp.push_back(*nodeIter);
+//vTmp.push_back(*nodeIter);
+newList.push_back(*nodeIter);
 }
 }
 
@@ -4743,14 +4743,18 @@ protected: \
 virtual void EvaluateThisNode(const size_t timeIdxInSeq)
 {
 Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
-Matrix<ElemType> sliceOutputValue = m_functionValues.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+Matrix<ElemType> sliceOutputValue = Matrix <ElemType>();
 
 Matrix<ElemType> sliceMask = Matrix<ElemType>();
 if(m_dropoutRate > 0)
 {
-m_maskOfDropout.Resize(m_functionValues.GetNumRows(), m_functionValues.GetNumCols());
+FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
+m_maskOfDropout.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
 sliceMask = m_maskOfDropout.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
 }
 
+sliceOutputValue = FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+
 EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value);
 }
@@ -45,10 +45,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 IExecutionEngine<ElemType>* executionEngine,
 const std::wstring& networkConfig,
 const std::string& configParams,
+const std::wstring& dumpFileName,
 DEVICEID_TYPE deviceId=AUTOPLACEMATRIX)
 {
 m_executionEngine=executionEngine;
 m_networkConfig=networkConfig;
+m_dumpFileName = dumpFileName;
 m_initialConfig=configParams;
 m_deviceId=deviceId;
 m_net=&(executionEngine->GetComputationNetwork());
@@ -69,6 +71,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 ConfigParameters newConfig;
 ConfigValue networkConfig = config("networkDescription","");
+ConfigValue dumpFileName = config("dumpFileName", "");
 DEVICEID_TYPE deviceId = DeviceFromConfig(config);
 unsigned long randomSeedOffset = config("randomSeedOffset","0");
 auto executionEngine = new SynchronousExecutionEngine<ElemType>(deviceId, randomSeedOffset);
@@ -142,7 +145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 }
 
-Init(executionEngine, networkConfig, newConfig, deviceId);
+Init(executionEngine, networkConfig, newConfig, dumpFileName, deviceId);
 }
 
 virtual ~NDLBuilder()
@@ -196,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_script.FileParse(fileContents);
 
 NDLUtil<ElemType> ndlUtil(m_net);
-ndlUtil.ProcessNDLScript(&m_script, ndlPassAll, nullptr, true);
+ndlUtil.ProcessNDLScript(&m_script, ndlPassAll, nullptr, true, m_dumpFileName);
 }
 
 // SetFromConfig - Set the NDL script from a configuration string value
@@ -222,6 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNetwork<ElemType>* m_net;
 IExecutionEngine<ElemType>* m_executionEngine;
 std::wstring m_networkConfig;
+std::wstring m_dumpFileName;
 std::string m_initialConfig;
 
 DEVICEID_TYPE m_deviceId;
@@ -88,7 +88,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // skipThrough - [in/out] for iterative processing, a pointer to an array of NDLNode*, one for each pass
 // the pointer will be updated to last node processed for that pass, can be NULL if all node processing is desired
 // fullValidate - validate as a complete network? (false if this might be a snippet of a full network)
-void ProcessNDLScript(NDLScript<ElemType>* script, NDLPass ndlPassUntil=ndlPassAll, NDLNode<ElemType>** skipThrough=nullptr, bool fullValidate = false)
+void ProcessNDLScript(NDLScript<ElemType>* script, NDLPass ndlPassUntil = ndlPassAll, NDLNode<ElemType>** skipThrough = nullptr, bool fullValidate = false, const std::wstring& dumpFileName = L"")
 {
 // if we don't have a script yet, don't bother
 if (script == nullptr)
@@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (NDLPass ndlPass=ndlPassInitial;ndlPass <= ndlPassUntil;++ndlPass)
 {
 NDLNode<ElemType>* skipThroughNode = skipThrough?*skipThrough:nullptr;
-lastNode = ProcessPassNDLScript(script, ndlPass, skipThroughNode, fullValidate);
+lastNode = ProcessPassNDLScript(script, ndlPass, skipThroughNode, fullValidate, dumpFileName);
 if (skipThrough)
 {
 *skipThrough = lastNode;
@@ -119,13 +119,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // skipThrough - for iterative processing, skip through this node in the script (used for in-line MEL processing)
 // fullValidate - validate as a complete network? (false if this might be a snippet of a full network)
 // returns: last NDL node processed
-NDLNode<ElemType>* ProcessPassNDLScript(NDLScript<ElemType>* script, NDLPass ndlPass, NDLNode<ElemType>* skipThrough=nullptr, bool fullValidate = false)
+NDLNode<ElemType>* ProcessPassNDLScript(NDLScript<ElemType>* script, NDLPass ndlPass, NDLNode<ElemType>* skipThrough = nullptr, bool fullValidate = false, const std::wstring& dumpFileName = L"")
 {
 if (ndlPass == ndlPassFinal)
 {
 // make sure to clear the caches so we pick up the new nodes
 m_net->ClearCaches();
 // validate the network
+if (dumpFileName != L"")
+m_net->DumpAllNodesToFile(false, dumpFileName, false);
 m_net->ValidateNetwork(!fullValidate);
 }
 SynchronousNodeEvaluator<ElemType> ndlEvaluator(*m_net);
@@ -31,15 +31,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template<class ElemType>
 void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb)
 {
+size_t rv = 0;
 if ( numProcs > 1 ) for (auto it = mb.begin(); it != mb.end(); ++it)
 {
 MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
 size_t nCols = mat.GetNumCols();
-size_t col_start = (nCols * myRank)/ numProcs;
+size_t col_start = (nCols * myRank) / numProcs;
 size_t col_end = (nCols*(myRank + 1)) / numProcs;
 if (col_end > nCols) col_end = nCols; // this shouldn't happen
-MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
-mat.SetValue(tmp);
+if (col_end == col_start)
+{
+MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
+mat.SetValue(tmp);
+}
+else
+{
+MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
+mat.SetValue(tmp);
+}
+if (0 == rv)
+{
+rv = mat.GetNumCols();
+}
+else
+{
+if (rv != mat.GetNumCols())
+throw std::logic_error("Uneven number of columns among inputs.");
+}
 }
 }
 
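In the decimation above, rank r of P processes keeps the half-open column range [nCols*r/P, nCols*(r+1)/P) of every input matrix, substitutes an empty matrix when that range is empty, and verifies that all inputs agree on the resulting column count. A small sketch of just the slice arithmetic (plain C++, independent of the Matrix class):

#include <cstddef>
#include <utility>

// Sketch: columns owned by rank `myRank` of `numProcs` for a minibatch of `nCols` columns.
std::pair<size_t, size_t> OwnedColumns(size_t nCols, int myRank, int numProcs)
{
    size_t colStart = (nCols * myRank) / numProcs;
    size_t colEnd = (nCols * (myRank + 1)) / numProcs;
    if (colEnd > nCols) colEnd = nCols; // guard, as in the original
    return { colStart, colEnd };        // may be empty for small minibatches
}
// e.g. nCols = 10, numProcs = 4 gives ranks the ranges [0,2), [2,5), [5,7), [7,10).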
@@ -537,9 +555,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 net.BuildPTaskGraph();
 }
 
-for (int i=int(startEpoch); i<int(m_maxEpochs); i++)
+for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
 {
-auto t_start_epoch = clock();
+auto t_start_epoch = clock();
 
 // set other information to inputMatrices that can contrain information
 // used for class-based LM for clustring information
@@ -547,24 +565,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 //set dropout rate
 SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
 
 
 //learning rate adjustment
 if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
-{
-learnRatePerSample = m_learningRatesPerSample[i];
+{
+learnRatePerSample = m_learningRatesPerSample[i];
+setMomentum(m_momentumInputPerMB[i]);
-}
-else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+}
+else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
 {
 ElemType largestPrevLearnRatePerSample = prevLearnRates[0];
-for (int j=1; j<m_numPrevLearnRates; j++)
+for (int j = 1; j < m_numPrevLearnRates; j++)
 {
 largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]);
 }
 
 //return a reasonable learning rate based on the initial mbsize
 learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes,
-labelNodes,criterionNodes,evaluationNodes, inputMatrices,learnableNodes,smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample);
+labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample);
 
 prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize
 }
@@ -573,18 +591,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 if (learnRatePerSample < m_minLearnRate)
 {
-fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i+1, learnRatePerSample, m_minLearnRate);
+fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
 if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
 net.SaveToFile(m_modelPath);
 break;
 }
 
-TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes,
-criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients,
+#ifdef MPI_SUPPORT
+INT32 mySamples = (INT32)
+#endif
+TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes,
+criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients,
 epochCriterion, epochEvalErrors, totalSamplesSeen);
 
 auto t_end_epoch = clock();
-ElemType epochTime = ElemType(1.0)*(t_end_epoch-t_start_epoch)/(CLOCKS_PER_SEC);
+ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
 
 fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
 if (epochEvalErrors.size() == 1)
@@ -604,21 +625,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 #ifdef MPI_SUPPORT
 // model reduction and averaging
-if ( numProcs > 0 )
-for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+if (numProcs > 0)
 {
-ComputationNodePtr node = (*nodeIter);
-Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->FunctionValues();
-ElemType *px = mat.CopyToArray();
-size_t nx = mat.GetNumElements();
-vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
-// TODO: Replace this with the reduction-shuffle-dance
-MPI_Reduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-if (myRank == 0)
-transform(py.begin(), py.end(), py.begin(), [](ElemType&val)->ElemType{return val / (ElemType)numProcs; });
-MPI_Bcast(&(py[0]), nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
-mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
-delete px;
+ElemType factor; // weight for the parameter of my model
+{
+// compute total minibatch size
+INT32 allSamples = 0;
+MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+if (allSamples == 0) allSamples = 1;
+
+factor = (ElemType)mySamples / (ElemType)allSamples;
+}
+
+for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+{
+ComputationNodePtr node = (*nodeIter);
+Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->FunctionValues();
+
+// weight model by relative size of minibatch samples (and number of processors, for averaging)
+ElemType *px = mat.CopyToArray();
+size_t nx = mat.GetNumElements();
+transform(px, px + nx, px, [factor](ElemType&val)->ElemType{return val * factor; });
+
+// TODO: Replace default Allreduce with the reduction-shuffle-dance
+vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
+MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
+delete px;
+}
 }
 #endif
 
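The replacement block above turns plain parameter averaging into a sample-weighted average: each worker pre-scales its parameters by factor = mySamples/allSamples, and a single MPI_Allreduce then sums the scaled copies, so every rank ends up with the weighted mean. A condensed sketch of the same reduction, assuming raw double buffers instead of the Matrix class:

#include <mpi.h>
#include <vector>

// Sketch: sample-weighted model averaging as in the diff above.
// weights = this rank's flattened parameters; mySamples = samples it processed.
void WeightedAverage(std::vector<double>& weights, int mySamples)
{
    int allSamples = 0;
    MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (allSamples == 0) allSamples = 1;
    double factor = (double)mySamples / (double)allSamples;

    for (double& w : weights) w *= factor; // pre-scale by my share of the samples

    std::vector<double> summed(weights.size(), 0.0);
    MPI_Allreduce(weights.data(), summed.data(), (int)weights.size(),
                  MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    weights.swap(summed); // every rank now holds the weighted average
}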
@@ -932,7 +966,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion);
 }
 
-void TrainOneEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
+size_t TrainOneEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
 const int epochNumber, const size_t epochSize,
 IDataReader<ElemType>* trainSetDataReader, const ElemType learnRatePerSample,
 const std::vector<ComputationNodePtr>& FeatureNodes,
@@ -1006,6 +1040,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 UpdateEvalTimeStamps(labelNodes);
 
 size_t actualMBSize = net.GetActualMBSize();
+if (0 == actualMBSize)
+continue;
 
 net.SetActualMiniBatchSize(actualMBSize);
 net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
@@ -1151,6 +1187,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i);
 }
 }
+return totalEpochSamples;
 }
 public:
 // UpdateWeightsS - static version of UpdateWeights()
@@ -200,12 +200,6 @@
 </CustomBuildStep>
 </ItemDefinitionGroup>
 <ItemGroup>
-<Text Include="config.txt">
-<DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
-<DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
-<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
-<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
-</Text>
 <Text Include="DefaultMacros.txt" />
 <Text Include="modelEditor.txt" />
 <Text Include="modelEditorFromScratch.txt" />
@@ -146,9 +146,6 @@
 <Text Include="modelEditorFromScratch.txt">
 <Filter>Model Editing</Filter>
 </Text>
-<Text Include="config.txt">
-<Filter>Main</Filter>
-</Text>
 <Text Include="DefaultMacros.txt">
 <Filter>Main</Filter>
 </Text>
@@ -95,6 +95,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_numRows = 0;
 m_numCols = 0;
 m_elemSizeAllocated = 0;
+m_compIndexSize = 0;
 m_externalBuffer = false;
 m_computeDevice = CPUDEVICE;
 m_nz = 0;
@@ -181,11 +182,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call.");
 }
 
-if(m_elemSizeAllocated < m_nz +1) {
-throw std::logic_error("CPUSparseMatrix: allocated size is too small.");
+if(m_elemSizeAllocated < m_nz +1) //automatic resize
+{
+Resize(m_numRows, m_numCols, m_nz + 100); //allocate 100 more elelemnts and keep existing values
 }
 
-if(rIdx < 0 || rIdx >= m_numRows) {
+if(rIdx < 0 || rIdx >= m_numRows)
+{
 throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id");
 }
 
@@ -228,43 +231,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 
 template<class ElemType>
-void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t size)
+void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
 {
-m_nz = 0;
-m_colIdx = -1;
+size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1;
+bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize);
+
 m_numRows = numRows;
 m_numCols = numCols;
 
-if(m_elemSizeAllocated < size)
+if (reallocate)
 {
-m_elemSizeAllocated = size;
-if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
 {
-if(m_pArray != NULL)
+ElemType *pArray = new ElemType[numNZElemToReserve];
+size_t *unCompIndex = new size_t[numNZElemToReserve];
+size_t *compIndex = new size_t[newCompIndexSize];
+
+if (keepExistingValues && m_nz > 0)
+{
+memcpy(pArray, m_pArray, sizeof(ElemType)*m_nz);
+memcpy(unCompIndex, m_unCompIndex, sizeof(size_t)*m_nz);
+memcpy(compIndex, m_compIndex, sizeof(size_t)*m_compIndexSize);
+}
+
+if (m_pArray != NULL)
 delete[] m_pArray;
-if(m_unCompIndex != NULL)
+if (m_unCompIndex != NULL)
 delete[] m_unCompIndex;
-if(m_compIndex != NULL)
-delete[] m_compIndex;
-
-//int len = m_format == MatrixFormat::matrixFormatSparseCSC ? numCols : numRows;
-size_t len = numCols > numRows ? numCols : numRows;
-m_pArray = new ElemType[size];
-m_unCompIndex = new size_t[size];
-m_compIndex = new size_t[len+1];
-
-}
+if (m_compIndex != NULL)
+delete[] m_compIndex;
+
+m_pArray = pArray;
+m_unCompIndex = unCompIndex;
+m_compIndex = compIndex;
+}
 else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
 {
-if(m_blockVal != NULL)
+ElemType *blockVal = new ElemType[numNZElemToReserve];
+size_t *blockIds = new size_t[newCompIndexSize];
+
+if (keepExistingValues && m_elemSizeAllocated > 0)
+{
+memcpy(blockVal, m_blockVal, sizeof(ElemType)*m_elemSizeAllocated);
+memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
+}
+
+if (m_blockVal != NULL)
 delete[] m_blockVal;
 if(m_blockIds != NULL)
 delete[] m_blockIds;
 
-size_t max = numCols > numRows ? numCols : numRows;
-m_blockVal = new ElemType[size];
-m_blockIds = new size_t[max];
+m_blockVal = blockVal;
+m_blockIds = blockIds;
 }
 
+m_elemSizeAllocated = numNZElemToReserve;
+m_compIndexSize = newCompIndexSize;
 }
 }
 
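The reworked CPUSparseMatrix::Resize above only reallocates when the request outgrows the current buffers (or shrinks them when growOnly is false), and keepExistingValues copies the old non-zero values and index arrays into the new buffers before freeing them; that is what lets SetValue trigger the automatic m_nz + 100 regrowth without losing data. A minimal sketch of the grow-and-copy pattern, on a raw array for brevity:

#include <algorithm>
#include <cstddef>

// Sketch: grow-only reallocation that preserves existing elements,
// as in the CPUSparseMatrix::Resize change above.
template <typename T>
void GrowBuffer(T*& buf, size_t& capacity, size_t used, size_t newCapacity,
                bool growOnly = true, bool keepExistingValues = true)
{
    bool reallocate = capacity < newCapacity || (capacity > newCapacity && !growOnly);
    if (!reallocate)
        return;
    T* newBuf = new T[newCapacity];
    if (keepExistingValues && used > 0)
        std::copy(buf, buf + std::min(used, newCapacity), newBuf); // keep old entries
    delete[] buf;
    buf = newBuf;
    capacity = newCapacity;
}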
@@ -274,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 m_nz = 0;
 m_colIdx = -1;
+m_compIndexSize = 0;
 m_blockSize = 0;
 }
 
@@ -86,7 +86,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 int GetComputeDeviceId() const {return -1;}
 
-void Resize(const size_t numRows, const size_t numCols, size_t size = 0);
+void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
 void Reset();
 
 public:
@@ -133,6 +133,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 private:
 int m_colIdx; //used to SetValue()
+size_t m_compIndexSize;
+
 //non-zero values are stored in m_pArray
 size_t *m_unCompIndex; //row/col ids in CSC/CSR format
 size_t *m_compIndex; //begin ids of col/row in CSC/CSR format
@@ -85,6 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 void SetOwnBuffer(bool own) {m_externalBuffer = !own;}
 wchar_t* GetMatrixName() const { return m_matrixName; }
 size_t NzCount() const {return m_nz;}
+void SetNzCount(const size_t nz) { m_nz = nz; }
 size_t GetSizeAllocated() const {return m_elemSizeAllocated; }
 void SetMatrixName(const wchar_t* s)
 {
@@ -130,6 +130,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 deepCopy.PrepareDevice();
 
 Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_nz, deepCopy.m_format);
+m_nz = deepCopy.m_nz;
 CUDACALL(cudaMemcpy(NzValues(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
 CUDACALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
 CUDACALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
@@ -199,6 +200,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
 GPUSPARSE_INDEX_TYPE *h_CSRRow, *h_Col;
 cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements());
+cpuSparseMatrix.SetNzCount(GetNumNZElements());
 
 PrepareDevice();
 h_CSRRow = new GPUSPARSE_INDEX_TYPE[m_numRows + 1];
@@ -219,6 +221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
 GPUSPARSE_INDEX_TYPE *h_CSCCol, *h_Row;
 cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements());
+cpuSparseMatrix.SetNzCount(GetNumNZElements());
 
 PrepareDevice();
 h_CSCCol = new GPUSPARSE_INDEX_TYPE[m_numCols + 1];
@@ -322,6 +325,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 outMatrix.ChangeDeviceTo(GetComputeDeviceId());
 outMatrix.Resize(m_numRows, m_numCols, m_nz,newFormat);
+outMatrix.SetNzCount(m_nz);
 
 if (oldFormat == matrixFormatSparseCSR && newFormat == matrixFormatSparseCSC)
 {
@@ -475,6 +479,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 CUDACALL(cudaEventDestroy(done));
 
 Resize(numRows, numCols, nnzTotalDevHostPtr, matrixFormat);
+SetNzCount(nnzTotalDevHostPtr);
 
 CUDACALL(cudaEventCreate(&done));
 
@@ -605,6 +610,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly /*= true*/)
 {
 Resize(a.m_numRows, a.m_numCols, a.m_nz, a.m_format, growOnly);
+SetNzCount(a.m_nz);
 
 CUDACALL(cudaMemcpy(MajorIndexLocation(), a.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
 CUDACALL(cudaMemcpy(SecondaryIndexLocation(), a.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
@@ -630,30 +636,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 
 template<class ElemType>
-void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ)
+void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
 {
-Resize(numRows, numCols, numNZ, GetFormat(), true);
+Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly);
 }
 
 //WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start.
+//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
 template<class ElemType>
-void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly /*= true*/)
+void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/)
 {
 m_numRows = numRows;
 m_numCols = numCols;
-m_nz = numNZ;
 
 if (matrixFormat == MatrixFormat::matrixFormatSparseCSC || matrixFormat == MatrixFormat::matrixFormatSparseCSR)
 {
-bool reallocate = (m_totalBufferSizeAllocated < BufferSizeNeeded() || (!growOnly && m_totalBufferSizeAllocated > BufferSizeNeeded()));
+size_t bufferSizeNeeded = BufferSizeNeeded(numNZElemToReserve);
+bool reallocate = (m_totalBufferSizeAllocated < bufferSizeNeeded || (!growOnly && m_totalBufferSizeAllocated > bufferSizeNeeded));
 
 if (reallocate)
 {
 if (!OwnBuffer())
 throw logic_error("Cannot Resize since the buffer is managed externally.");
 
-m_totalBufferSizeAllocated = BufferSizeNeeded();
-m_elemSizeAllocated = numNZ;
-
 if (m_pArray != nullptr)
 CUDACALL(cudaFree(m_pArray));
 if (m_block2Id != nullptr)
@@ -663,21 +668,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 PrepareDevice();
 
-CUDACALL(cudaMalloc((void **)&m_pArray, m_totalBufferSizeAllocated));
+CUDACALL(cudaMalloc((void **)&m_pArray, bufferSizeNeeded));
 CUDACALL(cudaMalloc((void **)&m_block2Id, sizeof(size_t)*(numCols * 2)));
 CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*(numCols * 2)));
+
+m_totalBufferSizeAllocated = bufferSizeNeeded;
+m_elemSizeAllocated = numNZElemToReserve;
 }
 }
 else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
 {
-if (m_blockVal != nullptr)
-CUDACALL(cudaFree(m_blockVal));
-if (m_blockIds != nullptr)
-CUDACALL(cudaFree(m_blockIds));
-PrepareDevice();
-CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZ));
-int max = numCols > numRows ? numCols : numRows;
-CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*max));
+if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
+{
+if (m_blockVal != nullptr)
+CUDACALL(cudaFree(m_blockVal));
+if (m_blockIds != nullptr)
+CUDACALL(cudaFree(m_blockIds));
+PrepareDevice();
+CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
+int max = numCols > numRows ? numCols : numRows;
+CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*max));
+
+m_elemSizeAllocated = numNZElemToReserve;
+}
 }
 else
 NOT_IMPLEMENTED;
@@ -701,6 +714,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 m_format = matrixFormatSparseCSR;
 Resize(numRows, numCols, nz);
+SetNzCount(nz);
 
 cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
 CUDACALL(cudaMemcpy(RowLocation(), h_CSRRow, RowSize(), kind));
@@ -741,6 +755,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 SetComputeDeviceId(devId);
 m_format = matrixFormatSparseCSC;
 Resize(numRows, numCols, nz);
+SetNzCount(nz);
 
 cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
 CUDACALL(cudaMemcpy(RowLocation(), h_Row, RowSize(), kind));
@@ -792,6 +807,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 m_format = matrixFormatSparseCSC;
 Resize(m_numRows, m_numCols, labelSize);
+SetNzCount(labelSize);
 
 m_expandedSize = expandedSize;
 m_blockSize = blockSize;
@@ -1320,6 +1336,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 // now we know the number of Non-zeros in the result set, set the output size
 c.Resize(m, n, nnzC);
+c.m_nz = nnzC;
+
 CUDACALL(cudaMemcpy(c.SecondaryIndexLocation(),csrRowPtrC,c.SecondaryIndexSize(),cudaMemcpyDeviceToDevice));
 
 // if we allocated the buffer, free it here
@@ -1805,6 +1823,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 PrepareDevice();
 GPUSparseMatrix c(GetFormat(), GetComputeDeviceId());
 c.Resize(n, m, nnz, GetFormat());
+c.m_nz = nnz;
 
 cusparseHandle_t cusparseHandle = 0;
 CUSPARSECALL(cusparseCreate(&cusparseHandle));
@@ -2283,6 +2302,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 NOT_IMPLEMENTED;
 
 us.Resize(rownum, colnum, nz);
+us.SetNzCount(nz);
 
 if (nz > 0)
 {
@@ -77,7 +77,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 size_t MajorIndexSize() const { return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount(); } // actual number of major index bytes in use
 
 GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const { return MajorIndexLocation() + m_elemSizeAllocated; } //this is the compressed index, col/row in CSC/CSR format
-size_t SecondaryIndexCount() const
+size_t SecondaryIndexCount(const size_t numNZ) const
 {
 if (m_format&matrixFormatCompressed)
 {
@@ -86,12 +86,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 return cnt;
 }
 else
-return m_nz; // COO format
+return numNZ; // COO format
 }
 
+size_t SecondaryIndexCount() const
+{
+return SecondaryIndexCount(m_nz);
+}
+
 // get size for compressed index
 size_t SecondaryIndexSize() const { return (SecondaryIndexCount())*sizeof(GPUSPARSE_INDEX_TYPE); }
 
-size_t BufferSizeNeeded() const { return NzSize() + MajorIndexSize() + SecondaryIndexSize(); }
+size_t BufferSizeNeeded(const size_t numNZ) const
+{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
 
 size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
 ElemType* BufferPointer() const;
 
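BufferSizeNeeded(numNZ) above sizes the single allocation backing a compressed (CSC/CSR) matrix: numNZ element values, numNZ major indices, and the compressed (secondary) index, which for these formats is typically the column or row count plus one. A worked restatement of that formula, with idxSize standing in for sizeof(GPUSPARSE_INDEX_TYPE):

#include <cstddef>

// Sketch of the buffer-size formula above for the compressed formats.
// numCompressed = columns for CSC, rows for CSR.
size_t SparseBufferBytes(size_t numNZ, size_t numCompressed, size_t elemSize, size_t idxSize)
{
    size_t secondaryCount = numCompressed + 1;  // compressed index array
    return elemSize * numNZ                     // non-zero values
         + idxSize * (numNZ + secondaryCount);  // major + compressed indices
}
// e.g. a float CSC matrix with 4-byte indices, 1000 non-zeros, 200 columns:
// 4*1000 + 4*(1000 + 201) = 8804 bytes.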
@@ -107,8 +116,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 void SetValue(const GPUMatrix<ElemType>& denseMatrix);
 
 void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);
-void Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
-void Resize(const size_t numRows, const size_t numCols, const size_t numNZ);
+void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
+void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly = true);
 
 GPUSparseMatrix<ElemType> Transpose() const;
 void InplaceTranspose();
@@ -925,7 +925,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template<class ElemType>
 void Matrix<ElemType>::SetValue(const size_t rIdx, const size_t cIdx, ElemType val)
 {
-DISPATCH_MATRIX_ON_FLAG(this,
+DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
 this,
 (*m_CPUMatrix)(rIdx, cIdx) = val,
 NOT_IMPLEMENTED,
@@ -1150,26 +1150,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 
 template<class ElemType>
-void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly /*=true*/)
+void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve /*=0*/, bool growOnly /*=true*/)
 {
 DISPATCH_MATRIX_ON_FLAG(this,
 this,
 m_CPUMatrix->Resize(numRows,numCols,growOnly),
 m_GPUMatrix->Resize(numRows,numCols,growOnly),
-NOT_IMPLEMENTED,
-NOT_IMPLEMENTED
-);
-}
-
-template<class ElemType>
-void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t allocatedSize)
-{
-DISPATCH_MATRIX_ON_FLAG(this,
-this,
-NOT_IMPLEMENTED,
-NOT_IMPLEMENTED,
-m_CPUSparseMatrix->Resize(numRows,numCols, allocatedSize),
-m_GPUSparseMatrix->Resize(numRows,numCols, allocatedSize)
+m_CPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly),
+m_GPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly)
 );
 }
 
@@ -3069,11 +3057,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 delete m_CPUSparseMatrix;
 m_CPUSparseMatrix = NULL;
-SetDataLocation(GPU, DENSE);
+SetDataLocation(GPU, SPARSE);
 }
 else
 {
-SetDataLocation(BOTH, DENSE);
+SetDataLocation(BOTH, SPARSE);
 }
 }
 else //from GPU
@@ -112,8 +112,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 void RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN);
 
 void Reshape(const size_t numRows, const size_t numCols);
-void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); //by default we only reallocate if need to grow
-void Resize(const size_t numRows, const size_t numCols, const size_t allocatedSize); //for sparse matrix
+void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 0, bool growOnly = true); //by default we only reallocate if need to grow
 size_t GetAllocatedSize() const;
 void Reset(); //reset for sparse matrix
 
@@ -73,7 +73,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }
 
 template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly = true) {}//matrix format will affect the size to allocate
-template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ) {}
+template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const bool growOnly = true) {}
 
 template<class ElemType> GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::CopyToDenseMatrix() const
 {
File diff suppressed because it is too large. (Load Diff not shown.)

Binary data (binary file not shown):
papers/CNTK-TechReport/lyx/CNTKBook-Draft0.3-2014-11-24.pdf → papers/CNTK-TechReport/lyx/CNTKBook-Draft0.4-2015-01-04.pdf
Executable file → Normal file
|
@ -115,8 +115,8 @@ Jie Gao, Avner May, Baolin Peng, Andreas Stolcke, Malcolm Slaney
|
|||
\end_layout
|
||||
|
||||
\begin_layout Date
|
||||
MSR-TR-2014-112 (DRAFT v0.3: Nov.
|
||||
23, 2014)
|
||||
MSR-TR-2014-112 (DRAFT v0.4: Jan.
|
||||
4, 2015)
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
|
|
|
@@ -3265,7 +3265,7 @@ status open
 
 \begin_layout Plain Layout
 
-Delay(m, [delayTime=1, defaultPastValue=0.1])
+Delay(rows, [cols], m, [delayTime=1, defaultPastValue=0.1])
 \end_layout
 
 \end_inset
@@ -3273,6 +3273,18 @@ Delay(m, [delayTime=1, defaultPastValue=0.1])
 
 \end_layout
 
+\begin_layout Itemize
+rows - the number of rows in the delay node (and in the input matrix).
+ This parameter is needed because under some loopy conditions the dimensions
+ cannot be automatically inferred from the input matrix.
+\end_layout
+
+\begin_layout Itemize
+cols - the number of columns in the delay node (and in the input matrix).
+ This parameter is optional since it will be set based on the minibatch
+ size during training and testing.
+\end_layout
+
 \begin_layout Itemize
 m - input matrix to be delayed.
  Each column is a sample.
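The new rows and cols arguments documented above let the Delay node's dimensions be fixed up front when inference inside loops fails. Functionally, Delay shifts each sequence in time: output column t is input column t - delayTime, and positions before the start of the sequence read as defaultPastValue. A toy sketch of that semantics (columns modeled as vectors, not the CNTK node itself):

#include <vector>

// Sketch: per-sequence behavior of the Delay node described above.
// Each inner vector is one time step (one column); delayTime >= 1.
std::vector<std::vector<float>> Delay(const std::vector<std::vector<float>>& m,
                                      int delayTime = 1, float defaultPastValue = 0.1f)
{
    size_t rows = m.empty() ? 0 : m[0].size();
    std::vector<std::vector<float>> out(m.size());
    for (int t = 0; t < (int)m.size(); t++)
        out[t] = (t - delayTime >= 0)
            ? m[t - delayTime]                            // delayed input column
            : std::vector<float>(rows, defaultPastValue); // before sequence start
    return out;
}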
@@ -1638,8 +1638,21 @@ loadBestModel
 \end_layout
 
 \begin_layout Itemize
-learnRateAdjustInterval: determine the frequency of applying the learning
-rate adjustment check.
 
 \emph on
 learnRateAdjustInterval
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+learnRateAdjustInterval
+\end_layout
+
+\end_inset
+
+
+\emph default
+: determine the frequency of applying the learning rate adjustment check.
+ Default is 1 epoch.
+ If this value is set to a value larger than 1 the learning rate adjustment
+ will be based on the average criterion computed from the last learnRateAdjustIn
@@ -1776,9 +1789,113 @@ gradUpdateType
 : gradient update type.
 Valid values are None (default, no special treatment to the gradient),
 AdaGrad, and RmsProp.
+When gradUpdateType equals to RmsProp, you can control the behavior of
+the gradient update using following parameters:
 \end_layout
 
+\begin_deeper
+\begin_layout Itemize
+
+\emph on
+rms_wgt_inc
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+rms_wgt_inc
+\end_layout
+
+\end_inset
+
+: multiplicative increment of the learning rate scale.
+ Default is 1.2.
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+rms_wgt_dec
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+rms_wgt_dec
+\end_layout
+
+\end_inset
+
+: multiplicative decrement of the learning rate scale.
+ Default is 0.75.
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+rms_wgt_max
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+rms_wgt_max
+\end_layout
+
+\end_inset
+
+: maximum learning rate scale allowed.
+ A value closer to 1 makes the learning rate adjustment more stable but
+ slower.
+ Default is 10.
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+rms_wgt_min
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+rms_wgt_min
+\end_layout
+
+\end_inset
+
+: minimum learning rate scale allowed.
+ A value closer to 1 makes the learning rate adjustment more stable but
+ slower.
+ Default is 0.1.
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+rms_gamma
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+rms_gamma
+\end_layout
+
+\end_inset
+
+: smoothing factor used to estimate the moving average of the variance.
+ The smaller the value, the quicker it forgets the past information.
+ Default is 0.99.
+
+\end_layout
+
+\end_deeper
 \begin_layout Itemize
 
 \emph on
@@ -4366,6 +4483,60 @@ minibatchSize
 – the minibatch size to use when creating the label mapping file.
 \end_layout
 
+\begin_layout Section
+ConvertDBN Command
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+ConvertDBN Command
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This command is used to convert a model generated by Microsoft's dbn.exe
+ tool to a CNTK model.
+ This command is useful when you want to compare the performance of these
+ two tools (dbn.exe only supports simple fully connected deep neural networks),
+ port existing models trained with dbn.exe to CNTK, or if you want to use
+ the RBM pre-training which is available in dbn.exe but not in CNTK right
+ now.
+ The related parameters are
+\end_layout
+
+\begin_layout Itemize
+modelPath
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+modelPath
+\end_layout
+
+\end_inset
+
+– the full path of the generated CNTK model.
+
+\end_layout
+
+\begin_layout Itemize
+dbnModelPath
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+dbnModelPath
+\end_layout
+
+\end_inset
+
+– the full path of the model to be converted.
+\end_layout
+
 \begin_layout Section
 Additional Top-Level Configurations
 \end_layout
@@ -116,10 +116,10 @@ At the center of the CNTK is the ComputationNetwork class, which manages
 the life span of computation nodes comprising the network and all the functions
 operating at the network level such as forward computations and gradient
 calculations.
-To build a computational network you need to use one of the ComputationNetBuild
-er classes that implement the IComputationNetBuilder interface.
+To build a computational network you need to use one of the computational
+network builder classes that implement the IComputationNetBuilder interface.
 These classes include SimpleNetworkBuilder that supports building simple
-layer-by-layer fully connected networks,
+layer-by-layer fully connected networks and
 \begin_inset Index idx
 status open
 
@@ -149,8 +149,7 @@ LSTM
 
 \end_inset
 
-) neural networks.
-It also includes NDLNetworkBuilder that can build neural network, using
+) RNNs, as well as NDLNetworkBuilder that builds neural networks, using
 any computation node we have described in Section
 \begin_inset CommandInset ref
 LatexCommand ref
@@ -181,7 +180,7 @@ IDataReader
 \end_inset
 
 is an interface for loading data and its transcriptions.
-Different data file format requires different data readers.
+Different data file formats require different data readers.
 CNTK already implements the UCIFastReader and the BinaryReader that reads
 in UCI data in either text or binary format, the HTKMLFReader that reads
 in HTK/MLF speech data, the SequenceReader that is designed for language
@@ -159,7 +159,7 @@ key "Variable-Component-Deep-Neural-Network:2014"
 Conventionally, one needs to design the network, derive the derivatives
 needed to optimize the network, implement the algorithm, and then run the
 experiments.
-These steps are error pronoe and time consuming.
+These steps are error prone and time consuming.
 With CNTK, however in many cases, you only need to write a simple configuration
 file.
 The rest of this chapter describes the configuration file needed to implement
@@ -819,7 +819,7 @@ status open
 
 \begin_layout Plain Layout
 
-cn.exe config=Simple.config
+cn.exe configFile=Simple.config
 \end_layout
 
 \end_inset
@@ -838,7 +838,7 @@ status open
 
 \begin_layout Plain Layout
 
-cn config=Simple.config
+cn configFile=Simple.config
 \end_layout
 
 \end_inset