Prepend timestamp to log lines when tracing flag is turned on

Chris Basoglu 2016-03-25 17:31:02 -07:00
Parent 5a574305b0
Commit 04abf66d78
217 changed files with 16989 additions and 16919 deletions

Diff not shown because the file is too large.

Diff not shown because the file is too large.

View file

@ -70,7 +70,7 @@ void TestCn(const ConfigParameters& config);
void RedirectStdErr(wstring logpath)
{
fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
LOGPRINTF(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
if (dup2(fileno(*f), 2) == -1)
{
@ -165,7 +165,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (numCPUThreads > 0)
{
std::cerr << "Using " << numCPUThreads << " CPU threads." << endl;
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
}
bool progressTracing = config(L"progressTracing", false);
@ -187,14 +187,14 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (action[j] == "train" || action[j] == "trainRNN")
{
wstring modelPath = commandParams("modelPath");
std::wcerr << "CNTKModelPath: " << modelPath << endl;
LOGPRINTF(stderr, "CNTKModelPath: %ls\n", modelPath.c_str());
size_t maxEpochs = GetMaxEpochs(commandParams);
std::cerr << "CNTKCommandTrainInfo: " + command[i] << " : " << maxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: %s : %d\n", command[i].c_str(), (int) maxEpochs);
fullTotalMaxEpochs += maxEpochs;
}
}
}
std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : %d\n", (int) fullTotalMaxEpochs);
// set up progress tracing for compute cluster management
if (progressTracing && (!mpi || mpi->IsMainNode()))
@ -225,19 +225,20 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
// print a banner to visually separate each action in the log
const char* delim = "##############################################################################";
const char* prefix = "Action ";
fprintf(stderr, "\n%s\n", delim);
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "%s\n\n", delim);
fprintf(stderr, "\n");
LOGPRINTF(stderr, "%s\n", delim);
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "%s\n\n", delim);
if ((mpi == nullptr) || (commandstoRunOnAllRanks.find(thisAction) != commandstoRunOnAllRanks.end()) || mpi->IsMainNode())
{
if (thisAction == "train" || thisAction == "trainRNN")
{
std::cerr << "CNTKCommandTrainBegin: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
DoTrain<ConfigParameters, ElemType>(commandParams);
std::cerr << "CNTKCommandTrainEnd: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
fullEpochsOffset += GetMaxEpochs(commandParams);
}
else if (thisAction == "adapt")
@ -298,7 +299,8 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
}
}
fprintf(stderr, "\nAction \"%s\" complete.\n\n", thisAction.c_str());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());
NDLScript<ElemType> ndlScript;
ndlScript.ClearGlobal(); // clear global macros between commands
@ -321,51 +323,51 @@ std::string TimeDateStamp()
void PrintBuiltInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Build info: \n\n");
fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
fprintf(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
fprintf(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
fprintf(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _MATHLIB_
fprintf(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintUsageInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Usage: cntk configFile=yourConfigFile\n");
fprintf(stderr, "For detailed information please consult the CNTK book\n");
fprintf(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Usage: cntk configFile=yourConfigFile\n");
LOGPRINTF(stderr, "For detailed information please consult the CNTK book\n");
LOGPRINTF(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// ---------------------------------------------------------------------------
@ -412,9 +414,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
wstring startupMessage = msra::strfun::wstrprintf(L"running on %ls at %ls\n", msra::strfun::utf16(GetHostName()).c_str(), msra::strfun::utf16(TimeDateStamp()).c_str());
startupMessage += msra::strfun::wstrprintf(L"command line: %ls", exePath.c_str());
for (const auto& arg : args)
{
startupMessage += L" " + arg;
}
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
// parse command-line options
vector<wstring> sourceFiles;
@ -443,6 +447,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// compile the BrainScript
wstring bs = L"[\n";
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
for (const auto& sourceFile : sourceFiles)
@ -451,7 +456,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
for (const auto& over : overrides)
bs += L"with [ " + over + L" ]\n";
fprintf(stderr, "\n\nBrainScript -->\n\n%ls\n\n", bs.c_str());
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, "BrainScript -->\n\n%ls\n\n", bs.c_str());
let expr = BS::ParseConfigExpression(bs, move(includePaths)); // parse
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
@ -486,7 +492,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
logpath += msra::strfun::wstrprintf(L"rank%d", (int) mpi->CurrentNodeRank());
RedirectStdErr(logpath);
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
}
// echo config info to log
@ -497,10 +503,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
int numCPUThreads = config(L"numCPUThreads", 0);
numCPUThreads = CPUMatrix<float /*any will do*/>::SetNumThreads(numCPUThreads);
if (numCPUThreads > 0)
fprintf(stderr, "Using %d CPU threads.\n", numCPUThreads);
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
bool progressTracing = config(L"progressTracing", false);
size_t fullTotalMaxEpochs = 1; // BUGBUG: BS does not allow me to read out the max epochs parameters, as that would instantiate and thus execute the objects
// set up progress tracing for compute cluster management
if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
@ -532,7 +539,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
LOGPRINTF(stderr, "__COMPLETED__\n");
fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -585,36 +593,51 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
PrintBuiltInfo(); // this one goes to log file
std::string timestamp = TimeDateStamp();
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
ProgressTracing::SetTimestampingFlag();
}
// dump config info
fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
fprintf(stderr, "Command line: \n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
LOGPRINTF(stderr, "Command line: \n");
for (int i = 0; i < argc; i++)
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
{
// use 2 spaces for better visual separability
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]);
}
fprintf(stderr, "\n\n");
#if 1 //def _DEBUG
// This simply merges all the different config parameters specified (eg, via config files or via command line directly),
// and prints it.
fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", rawConfigString.c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line),
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
// All of these assignments will appear, even though only the last assignment matters.
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
// This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
// value it is set to will appear).
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
config.dumpWithResolvedVariables();
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
#endif
fprintf(stderr, "Commands:");
LOGPRINTF(stderr, "Commands:");
for (int i = 0; i < command.size(); i++)
{
fprintf(stderr, " %s", command[i].c_str());
}
fprintf(stderr, "\n");
// run commands
@ -623,7 +646,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
if (config.Exists("type"))
InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now.");
fprintf(stderr, "Precision = \"%s\"\n", type.c_str());
LOGPRINTF(stderr, "Precision = \"%s\"\n", type.c_str());
if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
@ -638,7 +662,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
LOGPRINTF(stderr, "__COMPLETED__\n");
fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -664,38 +689,45 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
if (argc <= 1)
{
fprintf(stderr, "No command-line argument given.\n");
LOGPRINTF(stderr, "No command-line argument given.\n");
PrintUsageInfo();
return EXIT_FAILURE;
}
// detect legacy CNTK configuration
bool isOldCNTKConfig = false;
for (int i = 0; i < argc && !isOldCNTKConfig; i++)
isOldCNTKConfig |= !_wcsnicmp(L"configFile=", argv[i], 11);
if (isOldCNTKConfig)
return wmainOldCNTKConfig(argc, argv);
// run from BrainScript
return wmainWithBS(argc, argv);
}
catch (const ScriptableObjects::ScriptingException& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
err.PrintError();
return EXIT_FAILURE;
}
catch (const IExceptionWithCallStackBase& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
return EXIT_FAILURE;
}
catch (const std::exception& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
return EXIT_FAILURE;
}
catch (...)
{
fprintf(stderr, "\nUnknown ERROR occurred\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Unknown ERROR occurred\n");
return EXIT_FAILURE;
}
}
@ -703,7 +735,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
#ifdef __WINDOWS__
void TerminateThis()
{
fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr);
LOGPRINTF(stderr, "terminate_this: aborting\n");
fflush(stderr);
exit(EXIT_FAILURE);
}
@ -714,7 +747,7 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
if (pExcPointers->ExceptionRecord->ExceptionCode == EXCEPTION_DLL_NOT_FOUND)
{
const auto & pDelayLoadInfo = *PDelayLoadInfo(pExcPointers->ExceptionRecord->ExceptionInformation[0]);
fprintf(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
LOGPRINTF(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
}
}
@ -736,7 +769,7 @@ int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 excepti
else if (code == EXCEPTION_INT_DIVIDE_BY_ZERO) msg = ": Integer division by zero";
else if (code == EXCEPTION_STACK_OVERFLOW) msg = ": Stack overflow";
else if (code == EXCEPTION_DLL_NOT_FOUND) msg = ": Module not found";
fprintf(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
LOGPRINTF(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
fflush(stderr);
exit(EXIT_FAILURE);
}

View file

@ -4,10 +4,32 @@
//
#pragma once
#include <chrono>
#include "TimerUtility.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// If the Tracing flag is set, print out a timestamp with no new line at the end
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
char mbstr[30]; \
if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
fprintf(stream, "%s: ", mbstr); \
} \
} while(0)
// Print out a log message. If the Tracing flag is set, prepend with a timestamp
#define LOGPRINTF(stream, ...) \
do \
{ \
PREPENDTS(stream); \
fprintf(stream, __VA_ARGS__); \
} while(0)
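
For context, here is a minimal, self-contained rendition of what the two macros above do (illustrative only; it substitutes a plain bool for ProgressTracing::GetTimestampingFlag(), which in CNTK is set via SetTimestampingFlag(), e.g. from the timestamping config key):

#include <chrono>
#include <cstdio>
#include <ctime>

static bool g_timestamping = true; // stand-in for ProgressTracing::GetTimestampingFlag()

// Same shape as PREPENDTS: print "MM/DD/YYYY HH:MM:SS: " with no trailing newline.
#define PREPENDTS_SKETCH(stream)                                                                      \
    do                                                                                                \
    {                                                                                                 \
        if (g_timestamping)                                                                           \
        {                                                                                             \
            std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
            char mbstr[30];                                                                           \
            if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt)))        \
                fprintf(stream, "%s: ", mbstr);                                                       \
        }                                                                                             \
    } while (0)

// Same shape as LOGPRINTF: optional timestamp prefix, then the formatted message.
#define LOGPRINTF_SKETCH(stream, ...) \
    do                                \
    {                                 \
        PREPENDTS_SKETCH(stream);     \
        fprintf(stream, __VA_ARGS__); \
    } while (0)

int main()
{
    LOGPRINTF_SKETCH(stderr, "Using %d CPU threads.\n", 4); // e.g. "03/25/2016 17:31:02: Using 4 CPU threads."
    g_timestamping = false;
    LOGPRINTF_SKETCH(stderr, "Using %d CPU threads.\n", 4); // identical to a plain fprintf when the flag is off
    return 0;
}
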
// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
@ -29,12 +51,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
bool m_enabled;
bool m_tracingFlag;
bool m_timestampFlag;
size_t m_totalNumberOfSteps; // total number of epochs in entire training run
size_t m_currentStepOffset; // current offset
Timer m_progressTracingTimer;
ProgressTracing()
: m_enabled(false), m_tracingFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
: m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
{
}
@ -50,12 +73,23 @@ public:
return GetStaticInstance().m_tracingFlag;
}
static bool GetTimestampingFlag()
{
return GetStaticInstance().m_timestampFlag;
}
static void SetTracingFlag()
{
auto& us = GetStaticInstance();
us.m_tracingFlag = true;
}
static void SetTimestampingFlag()
{
auto& us = GetStaticInstance();
us.m_timestampFlag = true;
}
// call TraceTotalNumberOfSteps() to set the total number of steps
// Calling this with totalNumberOfSteps>0 will enable progress tracing.
static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)

View file

@ -609,11 +609,6 @@ void renameOrDie(const std::string& from, const std::string& to)
// WORKAROUND: "rename" should do this but this is a workaround
// to the HDFS FUSE implementation's bug of failing to do so
// workaround for FUSE rename when running on Philly
if (ProgressTracing::GetTracingFlag())
{
fprintf(stderr, "rename %s to %s\n", from.c_str(), to.c_str());
}
unlinkOrDie(to);
if (rename(from.c_str(), to.c_str()) != 0)
{

View file

@ -94,7 +94,9 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
{
// assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
// BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
node->Input(i)->m_indexInLoop++;
}
}
}

View file

@ -114,9 +114,11 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
// instead of the node itself, include the sentinel SEQTraversalFlowControlNode in our list
m_nestedNodes.push_back(recInfo);
// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("PARTraversalFlowControlNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
// consume all nodes that are part of the same loop (they are all consecutive)
while (nodeIter != allNodes.end() && (*nodeIter)->IsPartOfLoop() && FindInRecurrentLoops(recurrentInfo, *nodeIter) == recInfo)
nodeIter++;
@ -303,8 +305,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto& iter : recurrentInfo)
{
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop need to be a method of SEQTraversalFlowControlNode?
return iter;
}
return nullptr; // not part of a recurrent loop
}
@ -357,8 +361,10 @@ void ComputationNetwork::PrintComputationTree(const ComputationNodeBasePtr& root
if (nodes.size() == 0)
fprintf(stderr, "\n(empty)\n");
else
{
for (const auto& node : nodes)
node->PrintSelf(printMatrices);
}
}
// -----------------------------------------------------------------------
@ -397,9 +403,11 @@ void ComputationNetwork::CompileNetwork()
// all steps below have to be repeated for all root nodes (=nodes without parents and PreComputeNodes)
DetermineSetOfAllRoots();
fprintf(stderr, "\n%d roots:\n", (int) m_allRoots.size());
fprintf(stderr, "\n%d roots:\n", (int)m_allRoots.size());
for (const auto& root : m_allRoots)
{
fprintf(stderr, "\t%ls = %ls\n", root->NodeName().c_str(), root->OperationName().c_str());
}
// Note: Steps below are loops over root nodes. We will gradually push those loops through to the functions,
// to reduce redundant operation on shared portions of the network.
@ -473,10 +481,13 @@ void ComputationNetwork::DetermineSetOfAllRoots()
set<ComputationNodeBasePtr> allKnownRoots;
for (const auto& node : FinalCriterionNodes())
allKnownRoots.insert(node);
for (const auto& node : EvaluationNodes())
allKnownRoots.insert(node);
for (const auto& node : OutputNodes())
allKnownRoots.insert(node);
for (const auto& iter : m_nameToNodeMap) // PreComputeNodes
{
auto node = iter.second;
@ -513,7 +524,9 @@ void ComputationNetwork::ValidateNetwork()
// set up MBLayout links of inputs (all others get propagated upwards through Validate())
// TODO: Once we support mismatching layouts, this will be more involved. For now, everything shares the one layout that the Network knows about.
for (auto node : InputNodes(nullptr))
{
node->LinkToMBLayout(m_pMBLayout);
}
// we call all nodes' Validate() in order to validate, that is, set up MBLayout and FunctionValues dimension
// A problem is that recurrent loops may require partial validation.
@ -542,6 +555,7 @@ void ComputationNetwork::ValidateNetwork()
}
fprintf(stderr, "\nValidating network, final pass.\n\n");
toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/);
if (toValidate != 0)
LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do.");
@ -571,7 +585,7 @@ void ComputationNetwork::ValidateNetwork()
}
if (!nonDefaultNodes.empty())
{
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int) nonDefaultNodes.size(), (int) nodes.size());
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int)nonDefaultNodes.size(), (int)nodes.size());
// for (auto node : nonDefaultNodes)
// fprintf(stderr, " %ls\n", node->NodeName().c_str());
// fprintf(stderr, "\n\n");
@ -631,6 +645,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
allChildrenVisited &= child->m_visited;
}
// if there is not at least one visited child
bool valid = false;
if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()
@ -652,8 +667,10 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
node->m_visited = true;
// print the new type
// sanity checks
if (isFinalValidationPass && !unchanged)
LogicError("ValidateSubNetwork: %ls %ls operation changed during final validation.", node->NodeName().c_str(), node->OperationName().c_str());
if (isFinalValidationPass && !allChildrenVisited)
LogicError("ValidateSubNetwork: %ls %ls operation in final validation although not all children were visited?", node->NodeName().c_str(), node->OperationName().c_str());
// if all children valid then
@ -830,7 +847,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
else
{
nodeIter->RequestMatricesBeforeForwardProp(m_matrixPool);
// we only release matrices for the children since the root node's informatioin will be used and should not be shared
// we only release matrices for the children since the root node's information will be used and should not be shared
// with others
ReleaseMatricesAfterEvalForChildren(nodeIter, parentCount);
}

View file

@ -44,22 +44,23 @@ void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createN
int startEpoch = DetermineStartEpoch(makeMode);
if (startEpoch == m_maxEpochs)
{
fprintf(stderr, "No further training is necessary.\n");
LOGPRINTF(stderr, "No further training is necessary.\n");
return;
}
wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
bool loadNetworkFromCheckpoint = startEpoch >= 0;
fprintf(stderr, "\n");
if (loadNetworkFromCheckpoint)
fprintf(stderr, "\nStarting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
else
fprintf(stderr, "\nCreating virgin network.\n");
LOGPRINTF(stderr, "Creating virgin network.\n");
// create or load from checkpoint
shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
// log the device we are computing on
fprintf(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
LOGPRINTF(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
if (net->GetDeviceId() < 0)
fprintf(stderr, " on CPU.\n");
else
@ -74,6 +75,7 @@ void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createN
// set tracing flags
for (const auto& traceNodeName : m_traceNodeNamesReal)
net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/false);
for (const auto& traceNodeName : m_traceNodeNamesCategory)
net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/true);
@ -93,7 +95,7 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
int startEpoch = DetermineStartEpoch(makeMode);
if (startEpoch == m_maxEpochs)
{
fprintf(stderr, "No further training is necessary.\n");
LOGPRINTF(stderr, "No further training is necessary.\n");
return;
}
@ -102,13 +104,13 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
if (startEpoch >= 0)
{
wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
fprintf(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
networkLoadedFromCheckpoint = true;
}
else
{
fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
LOGPRINTF(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
}
@ -118,14 +120,14 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
if (m_needAdaptRegularization)
{
fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
LOGPRINTF(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
refNet = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
}
ComputationNodeBasePtr refNode;
if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL)
{
fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
LOGPRINTF(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
if (refNodeName == L"")
InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL.");
refNode = refNet->GetNodeFromName(refNodeName);
@ -152,9 +154,12 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
auto& labelNodes = net->LabelNodes();
auto& criterionNodes = GetTrainCriterionNodes(net);
fprintf(stderr, "\nTraining criterion node(s):\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Training criterion node(s):\n");
for (const auto& node : criterionNodes)
fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
{
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
}
// determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
std::vector<ComputationNodeBasePtr> evaluationNodes;
@ -170,9 +175,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (!evaluationNodes.empty())
{
fprintf(stderr, "\nEvaluation criterion node(s):\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Evaluation criterion node(s):\n");
fprintf(stderr, "\n");
for (const auto& node : evaluationNodes)
fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
{
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
}
}
}
@ -389,8 +398,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (learnRatePerSample < m_minLearnRate)
{
fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
i + 1, learnRatePerSample, m_minLearnRate);
LOGPRINTF(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
i + 1, learnRatePerSample, m_minLearnRate);
if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
{
// In case of parallel training only the main node should we saving the model to prevent
@ -440,8 +449,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
: momentumPerSample >= 1.0 ? 0.0
: -1.0 / log(momentumPerSample);
fprintf(stderr, "\nStarting Epoch %d: learning rate per sample = %f effective momentum = %f momentum as time constant = %.1f samples\n",
i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Starting Epoch %d: learning rate per sample = %f effective momentum = %f momentum as time constant = %.1f samples\n",
i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
TrainOneEpoch(net,
refNet,
@ -473,9 +483,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
lrControlCriterion = epochCriterion;
}
fprintf(stderr,
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; TotalSamplesSeen = %d; ",
i + 1, (int)m_maxEpochs, epochCriterion, (int)totalSamplesSeen);
LOGPRINTF(stderr,
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; TotalSamplesSeen = %d; ",
i + 1, (int)m_maxEpochs, epochCriterion, (int)totalSamplesSeen);
m_lastFinishedEpochTrainLoss = epochCriterion;
if (epochEvalErrors.size() == 0) // no eval criterion, only train criterion itself
{
@ -501,13 +511,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
learnRatePerSample, epochTime);
// TODO: why these extra log messages here and not for 1 eval criterion?
fprintf(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
for (size_t j = 0; j < epochEvalErrors.size(); j++)
{
fprintf(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
}
}
@ -526,7 +536,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// BUGBUG: We should not use the training MB size. The training MB size is constrained by both convergence and memory. Eval is only constrained by memory.
vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
if (vScore.size() > 1)
{
fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
@ -575,7 +585,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (m_loadBestModel)
{
auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
LOGPRINTF(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
net->RereadPersistableParameters<ElemType>(bestModelPath);
LoadCheckPointInfo(i - m_learnRateAdjustInterval,
/*out*/ totalSamplesSeen,
@ -604,7 +614,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
net->Save(GetModelNameForEpoch(i, true));
fprintf(stderr, "Finished training and saved final model\n\n");
LOGPRINTF(stderr, "Finished training and saved final model\n\n");
break;
}
}
@ -612,7 +622,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (learnRateReduced)
{
learnRatePerSample *= m_learnRateDecreaseFactor;
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
LOGPRINTF(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
}
}
else
@ -623,13 +633,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
{
learnRatePerSample *= m_learnRateDecreaseFactor;
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
LOGPRINTF(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
}
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
prevCriterion != std::numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateIncreaseFactor;
fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
LOGPRINTF(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
}
}
}
@ -659,7 +669,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
{
SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
auto modelName = GetModelNameForEpoch(i);
fprintf(stderr, "SGD: Saving checkpoint model '%ls'\n", modelName.c_str());
LOGPRINTF(stderr, "SGD: Saving checkpoint model '%ls'\n", modelName.c_str());
net->Save(modelName);
if (!m_keepCheckPointFiles)
{
@ -684,8 +694,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (learnRatePerSample < 1e-12)
{
fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
learnRatePerSample);
LOGPRINTF(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
learnRatePerSample);
}
}
// --- END OF MAIN EPOCH LOOP
@ -812,6 +822,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded<ElemType>(trainSetDataReader, m_maxSamplesInRAM, m_numSubminiBatches, tunedMBSize);
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
@ -824,26 +835,30 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// TODO: move the two-forward-pass support out of the reader, make a first-class citizen.
AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
fprintf(stderr, "\nStarting minibatch loop");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Starting minibatch loop");
if (useGradientAggregation)
{
fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
if (m_bufferedAsyncGradientAggregation)
{
fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
}
}
if (useDistributedMBReading)
{
fprintf(stderr, ", distributed reading is ENABLED");
}
if (numSubminibatchesNeeded > 1)
{
if (m_maxSamplesInRAM < SIZE_MAX)
fprintf(stderr, ", with maximum %d samples in RAM", (int) m_maxSamplesInRAM);
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
else
fprintf(stderr, ", with %d subminibatch", (int) numSubminibatchesNeeded);
fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded);
}
fprintf(stderr, ".\n");
@ -1103,7 +1118,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// progress tracing for regular log
string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d, %2." + std::to_string(mbProgNumPrecision) + "f%%]: SamplesSeen = %d; TrainLossPerSample = " +
GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
SGDTrace(stderr, formatString.c_str(),
SGDTrace(stderr, true, formatString.c_str(),
prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
numMBsRun, mbProg * 100, numSamplesLastMBs, trainLossPerSample);
}
@ -1113,7 +1128,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d]: SamplesSeen = %d; TrainLossPerSample = " +
GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
SGDTrace(stderr, formatString.c_str(),
SGDTrace(stderr, true, formatString.c_str(),
prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
numMBsRun, numSamplesLastMBs, trainLossPerSample);
m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
@ -1124,11 +1139,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
SGDTrace(stderr, formatString.c_str(), i, evalError);
SGDTrace(stderr, false, formatString.c_str(), i, evalError);
}
string formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 4, totalTimeInMBs) + "s; SamplesPerSecond = %.1f\n";
SGDTrace(stderr, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
SGDTrace(stderr, false, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
// progress tracing for compute cluster management
if (wasProgressPrinted)
@ -1287,13 +1302,16 @@ bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
if (nodes.size() == 0)
{
fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step.\n");
LOGPRINTF(stderr, "No PreCompute nodes found, skipping PreCompute step.\n");
return false;
}
fprintf(stderr, "\nPrecomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Precomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
for (const auto & node : nodes)
fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
{
LOGPRINTF(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
}
// compute
ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::preComputing);
@ -1328,9 +1346,12 @@ bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
// finalize
for (auto & node : nodes)
{
dynamic_pointer_cast<IPreComputeNode>(node)->MarkComputed(true /*done accumulating*/);
}
fprintf(stderr, "\nPrecomputing --> Completed.\n\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Precomputing --> Completed.\n\n");
return true;
}
@ -1490,8 +1511,8 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
bestLearnRatePerSample = (leftCriterion < rightCriterion) ? leftLearnRatePerSample : rightLearnRatePerSample;
}
fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
epochNumber + 1, bestLearnRatePerSample, baseCriterion);
LOGPRINTF(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
epochNumber + 1, bestLearnRatePerSample, baseCriterion);
return bestLearnRatePerSample;
}
@ -1542,8 +1563,8 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
{
// newly started training: any previous MB size stored in the model is to be ignored
fprintf(stderr, "before epoch .2, previous minibatchSize %zd is "
"considered invalid -> resetting\n",
LOGPRINTF(stderr, "before epoch .2, previous minibatchSize %zd is "
"considered invalid -> resetting\n",
m_prevChosenMinibatchSize);
m_prevChosenMinibatchSize = 0;
}
@ -1553,9 +1574,9 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
(epochNumber + 1) > m_minibatchSizeTuningFrequency &&
(epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
{
fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
"in epoch %d skipped, keeping minibatchSize of %zd\n",
epochNumber + 1, m_prevChosenMinibatchSize);
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
"in epoch %d skipped, keeping minibatchSize of %zd\n",
epochNumber + 1, m_prevChosenMinibatchSize);
chosenMinibatchSize = m_prevChosenMinibatchSize;
}
else
@ -1565,9 +1586,9 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
// if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2
// is higher than initialMinibatchSize (the minibatch size we start with for this epoch),
// then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize.
fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
"largest of previous minibatchSize = (%d / 2) or %d\n",
(int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
"largest of previous minibatchSize = (%d / 2) or %d\n",
(int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
}
@ -1578,8 +1599,8 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
{
assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
"previous minibatchSize %zd*2\n",
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
"previous minibatchSize %zd*2\n",
m_prevChosenMinibatchSize);
maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
}
@ -1647,8 +1668,9 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
// round mbsize to something meaningful
trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
fprintf(stderr, "\n");
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
size_t totalSamplesSeen;
std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
@ -1675,7 +1697,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
lastTriedTrialEpochCriterion = baseCriterion;
isFirstIteration = false;
fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
}
else if (!std::isnan(epochCriterion) &&
(epochCriterion > (baseCriterion * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))))
@ -1692,15 +1714,15 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
lastTriedTrialEpochCriterion = epochCriterion;
if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
{
fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... "
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
epochCriterion, baseCriterion);
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Keep searching... "
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
epochCriterion, baseCriterion);
}
}
}
fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
(int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
(int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
return lastTriedTrialMinibatchSize;
}
@ -1732,18 +1754,18 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
/*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
prefixMsg);
fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
LOGPRINTF(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
if (epochEvalErrors.size() == 1)
fprintf(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
LOGPRINTF(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
else
{
fprintf(stderr, "EvalErrPerSample ");
LOGPRINTF(stderr, "EvalErrPerSample ");
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
LOGPRINTF(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
}
fprintf(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
LOGPRINTF(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
}
int baseModelEpoch = epochNumber - 1;
@ -1813,13 +1835,18 @@ static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double
}
template <class ElemType>
int SGD<ElemType>::SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...)
int SGD<ElemType>::SGDTrace(FILE* __restrict __stream, bool isPrependTimestamp, const char* __restrict __format, ...)
{
int result = 0;
if (m_traceLevel > 0)
{
va_list args;
va_start(args, __format);
if (isPrependTimestamp)
{
PREPENDTS(__stream);
}
result = vfprintf(__stream, __format, args);
va_end(args);
}
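
A rough, self-contained sketch of the new SGDTrace contract (the standalone function and names below are illustrative, not part of the commit): the extra bool controls whether the timestamp prefix is emitted, so callers that stitch one progress line together from several fragments — as the minibatch logging in TrainOneEpoch above does — pass true only for the first fragment and false for the continuations:

#include <chrono>
#include <cstdarg>
#include <cstdio>
#include <ctime>

// Simplified stand-in for SGD<ElemType>::SGDTrace(stream, isPrependTimestamp, format, ...).
static int TraceSketch(FILE* stream, bool isPrependTimestamp, const char* format, ...)
{
    if (isPrependTimestamp) // same role as PREPENDTS in ProgressTracing.h
    {
        std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        char mbstr[30];
        if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt)))
            fprintf(stream, "%s: ", mbstr);
    }
    va_list args;
    va_start(args, format);
    int result = vfprintf(stream, format, args);
    va_end(args);
    return result;
}

int main()
{
    // One assembled line, timestamped exactly once at the start:
    TraceSketch(stderr, true,  "Epoch[ 1 of 4]-Minibatch[   1-  10]: SamplesSeen = %d; ", 250);
    TraceSketch(stderr, false, "EvalErr[%lu]PerSample = %.8f; ", (unsigned long) 0, 0.064);
    TraceSketch(stderr, false, "TotalTime = %.4fs\n", 2.6066);
    return 0;
}
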
@ -1886,10 +1913,10 @@ template <class ElemType>
// we use simple linear (instead of log linear) scaling here
const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
#if DUMPOUTPUT
fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
learnRatePerSample, momentum, actualMBSize);
fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
learnRatePerSample, momentum, actualMBSize);
LOGPRINTF(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
gradientValues.Print("Gradient Input");
smoothedGradient.Print("Smoothed Gradient Input");
#endif
@ -1976,7 +2003,7 @@ void SGD<ElemType>::UpdateWeights(const ComputationNodeBasePtr& node,
const bool useNesterovMomentum) const
{
#if DUMPOUTPUT
fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
LOGPRINTF(stderr, "Update_%ls\n", node->NodeName().c_str());
#endif
if (!node->IsParameterUpdateRequired())
LogicError("UpdateWeights() called for a learnable ComputationNode which has m_learningRateMultiplier == 0!");
@ -2072,7 +2099,7 @@ bool SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
if (!fexists(checkPointFileName.c_str()))
{
fprintf(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
LOGPRINTF(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
return false;
}
@ -2167,7 +2194,7 @@ int SGD<ElemType>::DetermineStartEpoch(const bool makeMode)
}
}
if (firstEpoch == m_maxEpochs)
fprintf(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
LOGPRINTF(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
return firstEpoch;
}
@ -2201,7 +2228,8 @@ bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
irow = max(0, irow);
icol = max(0, icol);
fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "###### d%ls######\n", node->NodeName().c_str());
double eOrg = node->Value()(irow, icol);
node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
@ -2259,8 +2287,9 @@ bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
bool wrong = (std::isnan(diff) || diff > threshold);
if (wrong)
{
fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
node->NodeName().c_str(), eGradNum, eGradErr);
fprintf(stderr, "\n");
LOGPRINTF(stderr, "d%ls Numeric gradient = %e, Error BP gradient = %e\n",
node->NodeName().c_str(), eGradNum, eGradErr);
sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
node->NodeName().c_str(), eGradNum, eGradErr);
errMsgs.push_back(wstrtmp);

Просмотреть файл

@ -537,7 +537,7 @@ protected:
shared_ptr<IMASGD<ElemType>> m_pMASGDHelper;
private:
int SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...);
int SGDTrace(FILE* __restrict __stream, bool isPrependTimestamp, const char* __restrict __format, ...);
};
}}}

View file

@ -512,7 +512,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.60663s; TotalTimePerSample = 10.42652ms; SamplesPerSecond = 95
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.16469
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: running on localhost at 2015/10/06 12:01:30
MPI Rank 1: command line options:
@ -987,7 +987,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.63654s; TotalTimePerSample = 10.54614ms; SamplesPerSecond = 94
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.15309
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: running on localhost at 2015/10/06 12:01:30
MPI Rank 2: command line options:
@ -1462,7 +1462,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.61322s; TotalTimePerSample = 10.45290ms; SamplesPerSecond = 95
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.16806
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: running on localhost at 2015/10/06 12:01:31
MPI Rank 3: command line options:
@ -1937,5 +1937,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.58782s; TotalTimePerSample = 10.35128ms; SamplesPerSecond = 96
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.17161
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -512,7 +512,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38285ms; SamplesPerSecond = 2612
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.84576
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: running on localhost at 2015/10/06 11:58:26
MPI Rank 1: command line options:
@ -987,7 +987,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38285ms; SamplesPerSecond = 2612
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845791
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: running on localhost at 2015/10/06 11:58:27
MPI Rank 2: command line options:
@ -1462,7 +1462,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38284ms; SamplesPerSecond = 2612
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845644
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: running on localhost at 2015/10/06 11:58:27
MPI Rank 3: command line options:
@ -1937,5 +1937,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38284ms; SamplesPerSecond = 2612
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845718
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -518,7 +518,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09126s; TotalTimePerSample = 0.36503ms; SamplesPerSecond = 2739
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.243167
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank1
MPI Rank 1: -------------------------------------------------------------------
@ -1003,7 +1003,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09356s; TotalTimePerSample = 0.37426ms; SamplesPerSecond = 2671
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.24663
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank2
MPI Rank 2: -------------------------------------------------------------------
@ -1488,7 +1488,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09356s; TotalTimePerSample = 0.37426ms; SamplesPerSecond = 2671
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.246647
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank3
MPI Rank 3: -------------------------------------------------------------------
@ -1973,5 +1973,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09126s; TotalTimePerSample = 0.36503ms; SamplesPerSecond = 2739
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.243121
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -518,7 +518,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13598s; TotalTimePerSample = 0.54394ms; SamplesPerSecond = 1838
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509512
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
MPI Rank 1: -------------------------------------------------------------------
@ -1003,7 +1003,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13600s; TotalTimePerSample = 0.54401ms; SamplesPerSecond = 1838
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509397
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
MPI Rank 2: -------------------------------------------------------------------
@ -1488,7 +1488,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13601s; TotalTimePerSample = 0.54403ms; SamplesPerSecond = 1838
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509323
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
MPI Rank 3: -------------------------------------------------------------------
@ -1973,5 +1973,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13601s; TotalTimePerSample = 0.54403ms; SamplesPerSecond = 1838
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509212
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -496,7 +496,7 @@ MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56086s; TotalTimePerSample = 22.24344ms; SamplesPerSecond = 44
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57143s; TotalTimePerSample = 22.28572ms; SamplesPerSecond = 44
MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.08354
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: running on localhost at 2015/08/25 20:50:23
MPI Rank 1: command line options:
@ -955,7 +955,7 @@ MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56099s; TotalTimePerSample = 22.24397ms; SamplesPerSecond = 44
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57339s; TotalTimePerSample = 22.29357ms; SamplesPerSecond = 44
MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.07455
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: running on localhost at 2015/08/25 20:50:24
MPI Rank 2: command line options:
@ -1414,7 +1414,7 @@ MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56393s; TotalTimePerSample = 22.25572ms; SamplesPerSecond = 44
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57187s; TotalTimePerSample = 22.28747ms; SamplesPerSecond = 44
MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.08799
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: running on localhost at 2015/08/25 20:50:24
MPI Rank 3: command line options:
@ -1873,5 +1873,5 @@ MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.54955s; TotalTimePerSample = 22.19822ms; SamplesPerSecond = 45
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.58100s; TotalTimePerSample = 22.32401ms; SamplesPerSecond = 44
MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.07455
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -631,7 +631,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50281ms; SamplesPerSecond = 1988
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018144
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: running on localhost at 2015/10/24 12:44:54
MPI Rank 1: command line:
@ -1225,7 +1225,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.01855
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: running on localhost at 2015/10/24 12:44:54
MPI Rank 2: command line:
@ -1819,7 +1819,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018583
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: running on localhost at 2015/10/24 12:44:55
MPI Rank 3: command line:
@ -2413,5 +2413,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50282ms; SamplesPerSecond = 1988
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018182
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -502,7 +502,7 @@ MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12883s; TotalTimePerSample = 0.51532ms; SamplesPerSecond = 1940
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035
MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887408
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank1
MPI Rank 1: -------------------------------------------------------------------
@ -971,7 +971,7 @@ MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12881s; TotalTimePerSample = 0.51526ms; SamplesPerSecond = 1940
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12287s; TotalTimePerSample = 0.49148ms; SamplesPerSecond = 2034
MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887381
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank2
MPI Rank 2: -------------------------------------------------------------------
@ -1440,7 +1440,7 @@ MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12898s; TotalTimePerSample = 0.51592ms; SamplesPerSecond = 1938
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12295s; TotalTimePerSample = 0.49182ms; SamplesPerSecond = 2033
MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887366
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank3
MPI Rank 3: -------------------------------------------------------------------
@ -1909,5 +1909,5 @@ MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12882s; TotalTimePerSample = 0.51529ms; SamplesPerSecond = 1940
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12279s; TotalTimePerSample = 0.49116ms; SamplesPerSecond = 2036
MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887371
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -636,7 +636,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931563
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 0: COMPLETED
MPI Rank 0: __COMPLETED__
MPI Rank 0: ~MPIWrapper
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
MPI Rank 1: -------------------------------------------------------------------
@ -1239,7 +1239,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931591
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 1: COMPLETED
MPI Rank 1: __COMPLETED__
MPI Rank 1: ~MPIWrapper
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
MPI Rank 2: -------------------------------------------------------------------
@ -1842,7 +1842,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12606s; TotalTimePerSample = 0.50426ms; SamplesPerSecond = 1983
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931381
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 2: COMPLETED
MPI Rank 2: __COMPLETED__
MPI Rank 2: ~MPIWrapper
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
MPI Rank 3: -------------------------------------------------------------------
@ -2445,5 +2445,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931393
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
MPI Rank 3: COMPLETED
MPI Rank 3: __COMPLETED__
MPI Rank 3: ~MPIWrapper

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -11,5 +11,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -1 +1 @@
COMPLETED
__COMPLETED__

View file

@ -9,5 +9,5 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__

View file

@ -2533,4 +2533,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9591762
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9591762 Perplexity = 1052.766
COMPLETED
__COMPLETED__

View file

@ -1075,9 +1075,11 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-32]: SamplesSeen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9611959
Final Results: Minibatch[1-32]: SamplesSeen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9611959 Perplexity = 1054.8943
Action "test" complete.
COMPLETED
__COMPLETED__

View file

@ -2981,4 +2981,4 @@ CUDA error 11 [c:\tools\cub-1.4.1\cub\device\dispatch/dispatch_radix_sort.cuh, 7
CUDA error 11 [c:\tools\cub-1.4.1\cub\device\dispatch/dispatch_radix_sort.cuh, 796]: invalid argument
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9566009
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9566009 Perplexity = 1050.0582
COMPLETED
__COMPLETED__

View file

@ -2533,4 +2533,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9640379
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9640379 Perplexity = 1057.8967
COMPLETED
__COMPLETED__

View file

@ -8,24 +8,24 @@ tags:
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
- __COMPLETED__
Must train epochs in exactly same order and parameters:
patterns:
- ^Starting Epoch {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results:
patterns:
- ^Finished Epoch[{{integer}} of {{integer}}]
- Finished Epoch[{{integer}} of {{integer}}]
- TrainLossPerSample = {{float,tolerance=.2%}}
- EvalErrPerSample = {{float,tolerance=.2%}}
- AvgLearningRatePerSample = {{float,tolerance=0.001%}}
Per-minibatch training results must match:
patterns:
- ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- SamplesSeen = {{integer}}
- TrainLossPerSample = {{float,tolerance=.2%}}
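Note on the testcases.yml hunk above: the baseline patterns drop their '^' start-of-line anchors and the completion marker switches from COMPLETED to the more distinctive __COMPLETED__, which fits the commit's purpose of prepending a timestamp to log lines when tracing is enabled: once a prefix may appear, a pattern can no longer assume that the matched text starts the line. As a rough, hypothetical sketch only (this is not CNTK's actual LOGPRINTF definition, flag name, or timestamp format), a prefixing macro could look like this:

#include <cstdio>
#include <ctime>

// Hypothetical tracing flag; in practice this would come from configuration.
static bool g_timestampLogLines = true;

// Sketch of a LOGPRINTF-style macro that prepends a timestamp when tracing is on.
#define LOGPRINTF(stream, ...)                                                    \
    do                                                                            \
    {                                                                             \
        if (g_timestampLogLines)                                                  \
        {                                                                         \
            time_t now = time(nullptr);                                           \
            char stamp[32];                                                       \
            strftime(stamp, sizeof(stamp), "%m/%d/%Y %H:%M:%S", localtime(&now)); \
            fprintf(stream, "%s: ", stamp);                                       \
        }                                                                         \
        fprintf(stream, __VA_ARGS__);                                             \
    } while (0)

int main()
{
    // With the prefix enabled, "__COMPLETED__" no longer starts the line,
    // so a baseline pattern anchored with '^' would stop matching.
    LOGPRINTF(stderr, "__COMPLETED__\n");
    return 0;
}

Matching the bare __COMPLETED__ token without an anchor keeps the baseline check working whether or not the prefix is present.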

View file

@ -913,7 +913,7 @@ already there from last epoch
RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 38, 46, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30271576 Perplexity = 1.3535297
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
@ -1734,4 +1734,4 @@ already there from last epoch
RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 38, 46, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31798401 Perplexity = 1.3743543
COMPLETED
__COMPLETED__

View file

@ -915,7 +915,7 @@ RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 38, 46, ...
MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30270519 Perplexity = 1.3535154
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
@ -1737,4 +1737,4 @@ RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 38, 46, ...
MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31781933 Perplexity = 1.374128
COMPLETED
__COMPLETED__

View file

@ -1468,7 +1468,7 @@ already there from last epoch
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
randomordering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.31800278 Perplexity = 1.3743801
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
@ -2844,4 +2844,4 @@ already there from last epoch
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
randomordering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.33039909 Perplexity = 1.3915234
COMPLETED
__COMPLETED__

View file

@ -910,7 +910,7 @@ already there from last epoch
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111851 Perplexity = 1.3379231
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
@ -1727,4 +1727,4 @@ already there from last epoch
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440025 Perplexity = 1.3558116
COMPLETED
__COMPLETED__

View file

@ -1410,7 +1410,7 @@ already there from last epoch
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
randomordering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.31800278 Perplexity = 1.3743801
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
@ -2728,4 +2728,4 @@ already there from last epoch
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
randomordering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.33039909 Perplexity = 1.3915234
COMPLETED
__COMPLETED__

View file

@ -910,7 +910,7 @@ already there from last epoch
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111847 Perplexity = 1.3379231
COMPLETED
__COMPLETED__
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
@ -1727,4 +1727,4 @@ already there from last epoch
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
RandomOrdering: recached sequence for seed 0: 15, 33, ...
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440022 Perplexity = 1.3558116
COMPLETED
__COMPLETED__

Some files were not shown because too many files have changed in this diff.