Prepend timestamp to log lines when tracing flag is turned on
This commit is contained in:
Parent
5a574305b0
Commit
04abf66d78
@@ -6411,4 +6411,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698 Perplexity = 1.4210798
COMPLETED
__COMPLETED__
@@ -9899,4 +9899,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767 Perplexity = 1.35456
COMPLETED
__COMPLETED__

@@ -70,7 +70,7 @@ void TestCn(const ConfigParameters& config);

void RedirectStdErr(wstring logpath)
{
fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
LOGPRINTF(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
if (dup2(fileno(*f), 2) == -1)
{

@ -165,7 +165,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
|
||||
if (numCPUThreads > 0)
|
||||
{
|
||||
std::cerr << "Using " << numCPUThreads << " CPU threads." << endl;
|
||||
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
|
||||
}
|
||||
|
||||
bool progressTracing = config(L"progressTracing", false);
|
||||
|
@ -187,14 +187,14 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
if (action[j] == "train" || action[j] == "trainRNN")
|
||||
{
|
||||
wstring modelPath = commandParams("modelPath");
|
||||
std::wcerr << "CNTKModelPath: " << modelPath << endl;
|
||||
LOGPRINTF(stderr, "CNTKModelPath: %ls\n", modelPath.c_str());
|
||||
size_t maxEpochs = GetMaxEpochs(commandParams);
|
||||
std::cerr << "CNTKCommandTrainInfo: " + command[i] << " : " << maxEpochs << endl;
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainInfo: %s : %d\n", command[i].c_str(), (int) maxEpochs);
|
||||
fullTotalMaxEpochs += maxEpochs;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : %d\n", (int) fullTotalMaxEpochs);
|
||||
|
||||
// set up progress tracing for compute cluster management
|
||||
if (progressTracing && (!mpi || mpi->IsMainNode()))
|
||||
|
@ -225,19 +225,20 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
// print a banner to visually separate each action in the log
|
||||
const char* delim = "##############################################################################";
|
||||
const char* prefix = "Action ";
|
||||
fprintf(stderr, "\n%s\n", delim);
|
||||
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
|
||||
fprintf(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
|
||||
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
|
||||
fprintf(stderr, "%s\n\n", delim);
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "%s\n", delim);
|
||||
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
|
||||
LOGPRINTF(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
|
||||
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
|
||||
LOGPRINTF(stderr, "%s\n\n", delim);
|
||||
|
||||
if ((mpi == nullptr) || (commandstoRunOnAllRanks.find(thisAction) != commandstoRunOnAllRanks.end()) || mpi->IsMainNode())
|
||||
{
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
std::cerr << "CNTKCommandTrainBegin: " + command[i] << endl;
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
|
||||
DoTrain<ConfigParameters, ElemType>(commandParams);
|
||||
std::cerr << "CNTKCommandTrainEnd: " + command[i] << endl;
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
|
||||
fullEpochsOffset += GetMaxEpochs(commandParams);
|
||||
}
|
||||
else if (thisAction == "adapt")
|
||||
|
@ -298,7 +299,8 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\nAction \"%s\" complete.\n\n", thisAction.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());
|
||||
|
||||
NDLScript<ElemType> ndlScript;
|
||||
ndlScript.ClearGlobal(); // clear global macros between commands
|
||||
|
@ -321,51 +323,51 @@ std::string TimeDateStamp()
|
|||
|
||||
void PrintBuiltInfo()
|
||||
{
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
fprintf(stderr, "Build info: \n\n");
|
||||
fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
|
||||
fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "Build info: \n\n");
|
||||
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
|
||||
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
|
||||
#ifdef _BUILDTYPE_
|
||||
fprintf(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
|
||||
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
|
||||
#endif
|
||||
#ifdef _BUILDTARGET_
|
||||
fprintf(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
|
||||
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
|
||||
#endif
|
||||
#ifdef _WITH_1BITSGD_
|
||||
fprintf(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
|
||||
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
|
||||
#endif
|
||||
#ifdef _MATHLIB_
|
||||
fprintf(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
|
||||
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
|
||||
#endif
|
||||
#ifdef _CUDA_PATH_
|
||||
fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
|
||||
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
|
||||
#endif
|
||||
#ifdef _CUB_PATH_
|
||||
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
|
||||
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
|
||||
#endif
|
||||
#ifdef _CUDNN_PATH_
|
||||
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
|
||||
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
|
||||
#endif
|
||||
#ifdef _GIT_EXIST
|
||||
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
|
||||
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
|
||||
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
|
||||
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
|
||||
#endif
|
||||
#ifdef _BUILDER_
|
||||
fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
|
||||
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
|
||||
#endif
|
||||
#ifdef _BUILDPATH_
|
||||
fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
|
||||
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
|
||||
#endif
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
void PrintUsageInfo()
|
||||
{
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
fprintf(stderr, "Usage: cntk configFile=yourConfigFile\n");
|
||||
fprintf(stderr, "For detailed information please consult the CNTK book\n");
|
||||
fprintf(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "Usage: cntk configFile=yourConfigFile\n");
|
||||
LOGPRINTF(stderr, "For detailed information please consult the CNTK book\n");
|
||||
LOGPRINTF(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -412,9 +414,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
wstring startupMessage = msra::strfun::wstrprintf(L"running on %ls at %ls\n", msra::strfun::utf16(GetHostName()).c_str(), msra::strfun::utf16(TimeDateStamp()).c_str());
|
||||
startupMessage += msra::strfun::wstrprintf(L"command line: %ls", exePath.c_str());
|
||||
for (const auto& arg : args)
|
||||
{
|
||||
startupMessage += L" " + arg;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%ls\n", startupMessage.c_str());
|
||||
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
|
||||
|
||||
// parse command-line options
|
||||
vector<wstring> sourceFiles;
|
||||
|
@ -443,6 +447,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
// compile the BrainScript
|
||||
wstring bs = L"[\n";
|
||||
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
|
||||
|
||||
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
|
||||
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
|
||||
for (const auto& sourceFile : sourceFiles)
|
||||
|
@ -451,7 +456,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
for (const auto& over : overrides)
|
||||
bs += L"with [ " + over + L" ]\n";
|
||||
|
||||
fprintf(stderr, "\n\nBrainScript -->\n\n%ls\n\n", bs.c_str());
|
||||
fprintf(stderr, "\n\n");
|
||||
LOGPRINTF(stderr, "BrainScript -->\n\n%ls\n\n", bs.c_str());
|
||||
|
||||
let expr = BS::ParseConfigExpression(bs, move(includePaths)); // parse
|
||||
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
|
||||
|
@ -486,7 +492,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
logpath += msra::strfun::wstrprintf(L"rank%d", (int) mpi->CurrentNodeRank());
|
||||
|
||||
RedirectStdErr(logpath);
|
||||
fprintf(stderr, "%ls\n", startupMessage.c_str());
|
||||
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
|
||||
}
|
||||
|
||||
// echo config info to log
|
||||
|
@ -497,10 +503,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
int numCPUThreads = config(L"numCPUThreads", 0);
|
||||
numCPUThreads = CPUMatrix<float /*any will do*/>::SetNumThreads(numCPUThreads);
|
||||
if (numCPUThreads > 0)
|
||||
fprintf(stderr, "Using %d CPU threads.\n", numCPUThreads);
|
||||
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
|
||||
|
||||
bool progressTracing = config(L"progressTracing", false);
|
||||
size_t fullTotalMaxEpochs = 1; // BUGBUG: BS does not allow me to read out the max epochs parameters, as that would instantiate and thus execute the objects
|
||||
|
||||
// set up progress tracing for compute cluster management
|
||||
if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
|
||||
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
|
||||
|
@ -532,7 +539,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
|
||||
fcloseOrDie(fp);
|
||||
}
|
||||
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
|
||||
LOGPRINTF(stderr, "__COMPLETED__\n");
|
||||
fflush(stderr);
|
||||
|
||||
MPIWrapper::DeleteInstance();
|
||||
return EXIT_SUCCESS;
|
||||
|
@ -585,36 +593,51 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
|
|||
PrintBuiltInfo(); // this one goes to log file
|
||||
std::string timestamp = TimeDateStamp();
|
||||
|
||||
bool timestamping = config(L"timestamping", false);
|
||||
if (timestamping)
|
||||
{
|
||||
ProgressTracing::SetTimestampingFlag();
|
||||
}
|
||||
|
||||
// dump config info
|
||||
fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
|
||||
fprintf(stderr, "Command line: \n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
|
||||
LOGPRINTF(stderr, "Command line: \n");
|
||||
for (int i = 0; i < argc; i++)
|
||||
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
|
||||
{
|
||||
// use 2 spaces for better visual separability
|
||||
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]);
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
#if 1 //def _DEBUG
|
||||
// This simply merges all the different config parameters specified (eg, via config files or via command line directly),
|
||||
// and prints it.
|
||||
fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "%s\n", rawConfigString.c_str());
|
||||
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
|
||||
fprintf(stderr, "\n\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
|
||||
LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
|
||||
|
||||
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line),
|
||||
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
|
||||
// All of these assignments will appear, even though only the last assignment matters.
|
||||
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
|
||||
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
|
||||
// This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
|
||||
// value it is set to will appear).
|
||||
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
config.dumpWithResolvedVariables();
|
||||
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "Commands:");
|
||||
LOGPRINTF(stderr, "Commands:");
|
||||
for (int i = 0; i < command.size(); i++)
|
||||
{
|
||||
fprintf(stderr, " %s", command[i].c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
// run commands
|
||||
|
@ -623,7 +646,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
|
|||
if (config.Exists("type"))
|
||||
InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now.");
|
||||
|
||||
fprintf(stderr, "Precision = \"%s\"\n", type.c_str());
|
||||
LOGPRINTF(stderr, "Precision = \"%s\"\n", type.c_str());
|
||||
|
||||
if (type == "float")
|
||||
DoCommands<float>(config, mpi);
|
||||
else if (type == "double")
|
||||
|
@ -638,7 +662,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
|
|||
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
|
||||
fcloseOrDie(fp);
|
||||
}
|
||||
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
|
||||
LOGPRINTF(stderr, "__COMPLETED__\n");
|
||||
fflush(stderr);
|
||||
|
||||
MPIWrapper::DeleteInstance();
|
||||
return EXIT_SUCCESS;
|
||||
|
@ -664,38 +689,45 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
|
|||
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
|
||||
if (argc <= 1)
|
||||
{
|
||||
fprintf(stderr, "No command-line argument given.\n");
|
||||
LOGPRINTF(stderr, "No command-line argument given.\n");
|
||||
PrintUsageInfo();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
// detect legacy CNTK configuration
|
||||
bool isOldCNTKConfig = false;
|
||||
for (int i = 0; i < argc && !isOldCNTKConfig; i++)
|
||||
isOldCNTKConfig |= !_wcsnicmp(L"configFile=", argv[i], 11);
|
||||
|
||||
if (isOldCNTKConfig)
|
||||
return wmainOldCNTKConfig(argc, argv);
|
||||
|
||||
// run from BrainScript
|
||||
return wmainWithBS(argc, argv);
|
||||
}
|
||||
catch (const ScriptableObjects::ScriptingException& err)
|
||||
{
|
||||
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
|
||||
err.PrintError();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
catch (const IExceptionWithCallStackBase& err)
|
||||
{
|
||||
fprintf(stderr, "\nEXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
catch (const std::exception& err)
|
||||
{
|
||||
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
fprintf(stderr, "\nUnknown ERROR occurred\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Unknown ERROR occurred\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
@ -703,7 +735,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
|
|||
#ifdef __WINDOWS__
|
||||
void TerminateThis()
|
||||
{
|
||||
fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr);
|
||||
LOGPRINTF(stderr, "terminate_this: aborting\n");
|
||||
fflush(stderr);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
@ -714,7 +747,7 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
|
|||
if (pExcPointers->ExceptionRecord->ExceptionCode == EXCEPTION_DLL_NOT_FOUND)
|
||||
{
|
||||
const auto & pDelayLoadInfo = *PDelayLoadInfo(pExcPointers->ExceptionRecord->ExceptionInformation[0]);
|
||||
fprintf(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
|
||||
LOGPRINTF(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -736,7 +769,7 @@ int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 excepti
|
|||
else if (code == EXCEPTION_INT_DIVIDE_BY_ZERO) msg = ": Integer division by zero";
|
||||
else if (code == EXCEPTION_STACK_OVERFLOW) msg = ": Stack overflow";
|
||||
else if (code == EXCEPTION_DLL_NOT_FOUND) msg = ": Module not found";
|
||||
fprintf(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
|
||||
LOGPRINTF(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
|
||||
fflush(stderr);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
|
|
@@ -4,10 +4,32 @@
//
#pragma once

#include <chrono>
#include "TimerUtility.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// If the Tracing flag is set, print out a timestamp with no new line at the end
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
char mbstr[30]; \
if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
fprintf(stream, "%s: ", mbstr); \
} \
} while(0)

// Print out a log message. If the Tracing flag is set, prepend with a timestamp
#define LOGPRINTF(stream, ...) \
do \
{ \
PREPENDTS(stream); \
fprintf(stream, __VA_ARGS__); \
} while(0)

// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
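
For reference, a minimal standalone sketch of the behaviour these macros produce. It is not part of the commit: the file, the function name LogPrintfSketch and the variable g_timestampingFlag are hypothetical stand-ins for the ProgressTracing flag, while the strftime format and prefix layout are taken from the PREPENDTS definition above.

// sketch.cpp -- illustrates the "MM/DD/YYYY HH:MM:SS: " prefix emitted when timestamping is on
#include <chrono>
#include <cstdio>
#include <ctime>

static bool g_timestampingFlag = true; // stand-in for ProgressTracing::GetTimestampingFlag()

static void LogPrintfSketch(FILE* stream, const char* msg)
{
    // Same formatting as PREPENDTS: timestamp plus ": ", no newline, then the message.
    if (g_timestampingFlag)
    {
        std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        char mbstr[30];
        if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt)))
            fprintf(stream, "%s: ", mbstr);
    }
    fprintf(stream, "%s", msg);
}

int main()
{
    // Prints e.g. "08/15/2016 10:31:02: Starting minibatch loop"
    LogPrintfSketch(stderr, "Starting minibatch loop\n");
    return 0;
}
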
@@ -29,12 +51,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
bool m_enabled;
bool m_tracingFlag;
bool m_timestampFlag;
size_t m_totalNumberOfSteps; // total number of epochs in entire training run
size_t m_currentStepOffset; // current offset
Timer m_progressTracingTimer;

ProgressTracing()
: m_enabled(false), m_tracingFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
: m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
{
}

@@ -50,12 +73,23 @@ public:
return GetStaticInstance().m_tracingFlag;
}

static bool GetTimestampingFlag()
{
return GetStaticInstance().m_timestampFlag;
}

static void SetTracingFlag()
{
auto& us = GetStaticInstance();
us.m_tracingFlag = true;
}

static void SetTimestampingFlag()
{
auto& us = GetStaticInstance();
us.m_timestampFlag = true;
}

// call TraceTotalNumberOfSteps() to set the total number of steps
// Calling this with totalNumberOfSteps>0 will enable progress tracing.
static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)

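A condensed view of how this flag is wired up, paraphrased from the CNTK.cpp hunk earlier in this diff (not a verbatim excerpt); the config parameter name "timestamping" and its false default come from that hunk:

// In wmainOldCNTKConfig (CNTK.cpp): read the new "timestamping" parameter, default false
bool timestamping = config(L"timestamping", false);
if (timestamping)
    ProgressTracing::SetTimestampingFlag(); // from here on, LOGPRINTF/PREPENDTS prepend the timestamp prefix
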
@@ -609,11 +609,6 @@ void renameOrDie(const std::string& from, const std::string& to)
// WORKAROUND: "rename" should do this but this is a workaround
// to the HDFS FUSE implementation's bug of failing to do so
// workaround for FUSE rename when running on Philly
if (ProgressTracing::GetTracingFlag())
{
fprintf(stderr, "rename %s to %s\n", from.c_str(), to.c_str());
}

unlinkOrDie(to);
if (rename(from.c_str(), to.c_str()) != 0)
{

@ -94,7 +94,9 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
|
|||
if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
|
||||
{
|
||||
// assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
|
||||
node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
|
||||
|
||||
// BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
|
||||
node->Input(i)->m_indexInLoop++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -114,9 +114,11 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
|
|||
{
|
||||
// instead of the node itself, include the sentinel SEQTraversalFlowControlNode in our list
|
||||
m_nestedNodes.push_back(recInfo);
|
||||
|
||||
// and verify that we only encountered the loop once (all nodes should have been consecutive)
|
||||
if (!loopsSeen.insert(recInfo).second)
|
||||
LogicError("PARTraversalFlowControlNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
|
||||
|
||||
// consume all nodes that are part of the same loop (they are all consecutive)
|
||||
while (nodeIter != allNodes.end() && (*nodeIter)->IsPartOfLoop() && FindInRecurrentLoops(recurrentInfo, *nodeIter) == recInfo)
|
||||
nodeIter++;
|
||||
|
@ -303,8 +305,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
|
|||
// look in all recurrent loops of the network
|
||||
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
|
||||
for (auto& iter : recurrentInfo)
|
||||
{
|
||||
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop need to be a method of SEQTraversalFlowControlNode?
|
||||
return iter;
|
||||
}
|
||||
return nullptr; // not part of a recurrent loop
|
||||
}
|
||||
|
||||
|
@ -357,8 +361,10 @@ void ComputationNetwork::PrintComputationTree(const ComputationNodeBasePtr& root
|
|||
if (nodes.size() == 0)
|
||||
fprintf(stderr, "\n(empty)\n");
|
||||
else
|
||||
{
|
||||
for (const auto& node : nodes)
|
||||
node->PrintSelf(printMatrices);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -397,9 +403,11 @@ void ComputationNetwork::CompileNetwork()
|
|||
// all steps below have to be repeated for all root nodes (=nodes without parents and PreComputeNodes)
|
||||
DetermineSetOfAllRoots();
|
||||
|
||||
fprintf(stderr, "\n%d roots:\n", (int) m_allRoots.size());
|
||||
fprintf(stderr, "\n%d roots:\n", (int)m_allRoots.size());
|
||||
for (const auto& root : m_allRoots)
|
||||
{
|
||||
fprintf(stderr, "\t%ls = %ls\n", root->NodeName().c_str(), root->OperationName().c_str());
|
||||
}
|
||||
|
||||
// Note: Steps below are loops over root nodes. We will gradually push those loops through to the functions,
|
||||
// to reduce redundant operation on shared portions of the network.
|
||||
|
@ -473,10 +481,13 @@ void ComputationNetwork::DetermineSetOfAllRoots()
|
|||
set<ComputationNodeBasePtr> allKnownRoots;
|
||||
for (const auto& node : FinalCriterionNodes())
|
||||
allKnownRoots.insert(node);
|
||||
|
||||
for (const auto& node : EvaluationNodes())
|
||||
allKnownRoots.insert(node);
|
||||
|
||||
for (const auto& node : OutputNodes())
|
||||
allKnownRoots.insert(node);
|
||||
|
||||
for (const auto& iter : m_nameToNodeMap) // PreComputeNodes
|
||||
{
|
||||
auto node = iter.second;
|
||||
|
@ -513,7 +524,9 @@ void ComputationNetwork::ValidateNetwork()
|
|||
// set up MBLayout links of inputs (all others get propagated upwards through Validate())
|
||||
// TODO: Once we support mismatching layouts, this will be more involved. For now, everything shares the one layout that the Network knows about.
|
||||
for (auto node : InputNodes(nullptr))
|
||||
{
|
||||
node->LinkToMBLayout(m_pMBLayout);
|
||||
}
|
||||
|
||||
// we call all nodes' Validate() in order to validate, that is, set up MBLayout and FunctionValues dimension
|
||||
// A problem is that recurrent loops may require partial validation.
|
||||
|
@ -542,6 +555,7 @@ void ComputationNetwork::ValidateNetwork()
|
|||
}
|
||||
fprintf(stderr, "\nValidating network, final pass.\n\n");
|
||||
toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/);
|
||||
|
||||
if (toValidate != 0)
|
||||
LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do.");
|
||||
|
||||
|
@ -571,7 +585,7 @@ void ComputationNetwork::ValidateNetwork()
|
|||
}
|
||||
if (!nonDefaultNodes.empty())
|
||||
{
|
||||
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int) nonDefaultNodes.size(), (int) nodes.size());
|
||||
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int)nonDefaultNodes.size(), (int)nodes.size());
|
||||
// for (auto node : nonDefaultNodes)
|
||||
// fprintf(stderr, " %ls\n", node->NodeName().c_str());
|
||||
// fprintf(stderr, "\n\n");
|
||||
|
@ -631,6 +645,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
|
|||
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
|
||||
allChildrenVisited &= child->m_visited;
|
||||
}
|
||||
|
||||
// if there is not at least one visited child
|
||||
bool valid = false;
|
||||
if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()
|
||||
|
@ -652,8 +667,10 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
|
|||
node->m_visited = true;
|
||||
// print the new type
|
||||
// sanity checks
|
||||
|
||||
if (isFinalValidationPass && !unchanged)
|
||||
LogicError("ValidateSubNetwork: %ls %ls operation changed during final validation.", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
|
||||
if (isFinalValidationPass && !allChildrenVisited)
|
||||
LogicError("ValidateSubNetwork: %ls %ls operation in final validation although not all children were visited?", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
// if all children valid then
|
||||
|
@ -830,7 +847,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
|
|||
else
|
||||
{
|
||||
nodeIter->RequestMatricesBeforeForwardProp(m_matrixPool);
|
||||
// we only release matrices for the children since the root node's informatioin will be used and should not be shared
|
||||
// we only release matrices for the children since the root node's information will be used and should not be shared
|
||||
// with others
|
||||
ReleaseMatricesAfterEvalForChildren(nodeIter, parentCount);
|
||||
}
|
||||
|
|
|
@ -44,22 +44,23 @@ void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createN
|
|||
int startEpoch = DetermineStartEpoch(makeMode);
|
||||
if (startEpoch == m_maxEpochs)
|
||||
{
|
||||
fprintf(stderr, "No further training is necessary.\n");
|
||||
LOGPRINTF(stderr, "No further training is necessary.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
|
||||
bool loadNetworkFromCheckpoint = startEpoch >= 0;
|
||||
fprintf(stderr, "\n");
|
||||
if (loadNetworkFromCheckpoint)
|
||||
fprintf(stderr, "\nStarting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
|
||||
LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
|
||||
else
|
||||
fprintf(stderr, "\nCreating virgin network.\n");
|
||||
LOGPRINTF(stderr, "Creating virgin network.\n");
|
||||
|
||||
// create or load from checkpoint
|
||||
shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
|
||||
|
||||
// log the device we are computing on
|
||||
fprintf(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
|
||||
LOGPRINTF(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
|
||||
if (net->GetDeviceId() < 0)
|
||||
fprintf(stderr, " on CPU.\n");
|
||||
else
|
||||
|
@ -74,6 +75,7 @@ void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createN
|
|||
// set tracing flags
|
||||
for (const auto& traceNodeName : m_traceNodeNamesReal)
|
||||
net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/false);
|
||||
|
||||
for (const auto& traceNodeName : m_traceNodeNamesCategory)
|
||||
net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/true);
|
||||
|
||||
|
@ -93,7 +95,7 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
|
|||
int startEpoch = DetermineStartEpoch(makeMode);
|
||||
if (startEpoch == m_maxEpochs)
|
||||
{
|
||||
fprintf(stderr, "No further training is necessary.\n");
|
||||
LOGPRINTF(stderr, "No further training is necessary.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -102,13 +104,13 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
|
|||
if (startEpoch >= 0)
|
||||
{
|
||||
wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
|
||||
fprintf(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
|
||||
LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
|
||||
net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
|
||||
networkLoadedFromCheckpoint = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
|
||||
LOGPRINTF(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
|
||||
net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
|
||||
}
|
||||
|
||||
|
@ -118,14 +120,14 @@ void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
|
|||
m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
|
||||
if (m_needAdaptRegularization)
|
||||
{
|
||||
fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
|
||||
LOGPRINTF(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
|
||||
refNet = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
|
||||
}
|
||||
|
||||
ComputationNodeBasePtr refNode;
|
||||
if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL)
|
||||
{
|
||||
fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
|
||||
LOGPRINTF(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
|
||||
if (refNodeName == L"")
|
||||
InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL.");
|
||||
refNode = refNet->GetNodeFromName(refNodeName);
|
||||
|
@ -152,9 +154,12 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
auto& labelNodes = net->LabelNodes();
|
||||
auto& criterionNodes = GetTrainCriterionNodes(net);
|
||||
|
||||
fprintf(stderr, "\nTraining criterion node(s):\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Training criterion node(s):\n");
|
||||
for (const auto& node : criterionNodes)
|
||||
fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
}
|
||||
|
||||
// determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
|
||||
std::vector<ComputationNodeBasePtr> evaluationNodes;
|
||||
|
@ -170,9 +175,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
|
||||
if (!evaluationNodes.empty())
|
||||
{
|
||||
fprintf(stderr, "\nEvaluation criterion node(s):\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Evaluation criterion node(s):\n");
|
||||
fprintf(stderr, "\n");
|
||||
for (const auto& node : evaluationNodes)
|
||||
fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -389,8 +398,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
|
||||
if (learnRatePerSample < m_minLearnRate)
|
||||
{
|
||||
fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
|
||||
i + 1, learnRatePerSample, m_minLearnRate);
|
||||
LOGPRINTF(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
|
||||
i + 1, learnRatePerSample, m_minLearnRate);
|
||||
if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
|
||||
{
|
||||
// In case of parallel training only the main node should we saving the model to prevent
|
||||
|
@ -440,8 +449,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
|
||||
: momentumPerSample >= 1.0 ? 0.0
|
||||
: -1.0 / log(momentumPerSample);
|
||||
fprintf(stderr, "\nStarting Epoch %d: learning rate per sample = %f effective momentum = %f momentum as time constant = %.1f samples\n",
|
||||
i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Starting Epoch %d: learning rate per sample = %f effective momentum = %f momentum as time constant = %.1f samples\n",
|
||||
i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
|
||||
|
||||
TrainOneEpoch(net,
|
||||
refNet,
|
||||
|
@ -473,9 +483,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
lrControlCriterion = epochCriterion;
|
||||
}
|
||||
|
||||
fprintf(stderr,
|
||||
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; TotalSamplesSeen = %d; ",
|
||||
i + 1, (int)m_maxEpochs, epochCriterion, (int)totalSamplesSeen);
|
||||
LOGPRINTF(stderr,
|
||||
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; TotalSamplesSeen = %d; ",
|
||||
i + 1, (int)m_maxEpochs, epochCriterion, (int)totalSamplesSeen);
|
||||
m_lastFinishedEpochTrainLoss = epochCriterion;
|
||||
if (epochEvalErrors.size() == 0) // no eval criterion, only train criterion itself
|
||||
{
|
||||
|
@ -501,13 +511,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
learnRatePerSample, epochTime);
|
||||
|
||||
// TODO: why these extra log messages here and not for 1 eval criterion?
|
||||
fprintf(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
|
||||
i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
|
||||
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
|
||||
i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
|
||||
|
||||
for (size_t j = 0; j < epochEvalErrors.size(); j++)
|
||||
{
|
||||
fprintf(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
|
||||
i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
|
||||
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
|
||||
i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -526,7 +536,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
|
||||
// BUGBUG: We should not use the training MB size. The training MB size is constrained by both convergence and memory. Eval is only constrained by memory.
|
||||
vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
|
||||
fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
|
||||
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
|
||||
if (vScore.size() > 1)
|
||||
{
|
||||
fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
|
||||
|
@ -575,7 +585,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
if (m_loadBestModel)
|
||||
{
|
||||
auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
|
||||
fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
|
||||
LOGPRINTF(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
|
||||
net->RereadPersistableParameters<ElemType>(bestModelPath);
|
||||
LoadCheckPointInfo(i - m_learnRateAdjustInterval,
|
||||
/*out*/ totalSamplesSeen,
|
||||
|
@ -604,7 +614,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
|
||||
net->Save(GetModelNameForEpoch(i, true));
|
||||
|
||||
fprintf(stderr, "Finished training and saved final model\n\n");
|
||||
LOGPRINTF(stderr, "Finished training and saved final model\n\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -612,7 +622,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
if (learnRateReduced)
|
||||
{
|
||||
learnRatePerSample *= m_learnRateDecreaseFactor;
|
||||
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
|
||||
LOGPRINTF(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -623,13 +633,13 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
{
|
||||
|
||||
learnRatePerSample *= m_learnRateDecreaseFactor;
|
||||
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
|
||||
LOGPRINTF(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
|
||||
}
|
||||
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
|
||||
prevCriterion != std::numeric_limits<double>::infinity())
|
||||
{
|
||||
learnRatePerSample *= m_learnRateIncreaseFactor;
|
||||
fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
|
||||
LOGPRINTF(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -659,7 +669,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
{
|
||||
SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
|
||||
auto modelName = GetModelNameForEpoch(i);
|
||||
fprintf(stderr, "SGD: Saving checkpoint model '%ls'\n", modelName.c_str());
|
||||
LOGPRINTF(stderr, "SGD: Saving checkpoint model '%ls'\n", modelName.c_str());
|
||||
net->Save(modelName);
|
||||
if (!m_keepCheckPointFiles)
|
||||
{
|
||||
|
@ -684,8 +694,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
|
||||
if (learnRatePerSample < 1e-12)
|
||||
{
|
||||
fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
|
||||
learnRatePerSample);
|
||||
LOGPRINTF(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
|
||||
learnRatePerSample);
|
||||
}
|
||||
}
|
||||
// --- END OF MAIN EPOCH LOOP
|
||||
|
@ -812,6 +822,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
// Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
|
||||
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
|
||||
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded<ElemType>(trainSetDataReader, m_maxSamplesInRAM, m_numSubminiBatches, tunedMBSize);
|
||||
|
||||
// this is non-trivial, we need a manager object to handle this
|
||||
if (numSubminibatchesNeeded > 1)
|
||||
smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
|
||||
|
@ -824,26 +835,30 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
// TODO: move the two-forward-pass support out of the reader, make a first-class citizen.
|
||||
AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
|
||||
|
||||
fprintf(stderr, "\nStarting minibatch loop");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Starting minibatch loop");
|
||||
if (useGradientAggregation)
|
||||
{
|
||||
fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
|
||||
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
|
||||
|
||||
if (m_bufferedAsyncGradientAggregation)
|
||||
{
|
||||
fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
|
||||
}
|
||||
}
|
||||
|
||||
if (useDistributedMBReading)
|
||||
{
|
||||
fprintf(stderr, ", distributed reading is ENABLED");
|
||||
}
|
||||
|
||||
if (numSubminibatchesNeeded > 1)
|
||||
{
|
||||
if (m_maxSamplesInRAM < SIZE_MAX)
|
||||
fprintf(stderr, ", with maximum %d samples in RAM", (int) m_maxSamplesInRAM);
|
||||
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
|
||||
else
|
||||
fprintf(stderr, ", with %d subminibatch", (int) numSubminibatchesNeeded);
|
||||
fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded);
|
||||
}
|
||||
fprintf(stderr, ".\n");
|
||||
|
||||
|
@ -1103,7 +1118,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
// progress tracing for regular log
|
||||
string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d, %2." + std::to_string(mbProgNumPrecision) + "f%%]: SamplesSeen = %d; TrainLossPerSample = " +
|
||||
GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
|
||||
SGDTrace(stderr, formatString.c_str(),
|
||||
SGDTrace(stderr, true, formatString.c_str(),
|
||||
prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
|
||||
numMBsRun, mbProg * 100, numSamplesLastMBs, trainLossPerSample);
|
||||
}
|
||||
|
@ -1113,7 +1128,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
|
||||
string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d]: SamplesSeen = %d; TrainLossPerSample = " +
|
||||
GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
|
||||
SGDTrace(stderr, formatString.c_str(),
|
||||
SGDTrace(stderr, true, formatString.c_str(),
|
||||
prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
|
||||
numMBsRun, numSamplesLastMBs, trainLossPerSample);
|
||||
m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
|
||||
|
@ -1124,11 +1139,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
{
|
||||
evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
|
||||
string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
|
||||
SGDTrace(stderr, formatString.c_str(), i, evalError);
|
||||
SGDTrace(stderr, false, formatString.c_str(), i, evalError);
|
||||
}
|
||||
|
||||
string formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 4, totalTimeInMBs) + "s; SamplesPerSecond = %.1f\n";
|
||||
SGDTrace(stderr, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
|
||||
SGDTrace(stderr, false, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
|
||||
|
||||
// progress tracing for compute cluster management
|
||||
if (wasProgressPrinted)
|
||||
|
@ -1287,13 +1302,16 @@ bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
|
|||
|
||||
if (nodes.size() == 0)
|
||||
{
|
||||
fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step.\n");
|
||||
LOGPRINTF(stderr, "No PreCompute nodes found, skipping PreCompute step.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\nPrecomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Precomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
|
||||
for (const auto & node : nodes)
|
||||
fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
|
||||
{
|
||||
LOGPRINTF(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
|
||||
}
|
||||
|
||||
// compute
|
||||
ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::preComputing);
|
||||
|
@ -1328,9 +1346,12 @@ bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
|
|||
|
||||
// finalize
|
||||
for (auto & node : nodes)
|
||||
{
|
||||
dynamic_pointer_cast<IPreComputeNode>(node)->MarkComputed(true /*done accumulating*/);
|
||||
}
|
||||
|
||||
fprintf(stderr, "\nPrecomputing --> Completed.\n\n");
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "Precomputing --> Completed.\n\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -1490,8 +1511,8 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
|
|||
bestLearnRatePerSample = (leftCriterion < rightCriterion) ? leftLearnRatePerSample : rightLearnRatePerSample;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
|
||||
epochNumber + 1, bestLearnRatePerSample, baseCriterion);
|
||||
LOGPRINTF(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
|
||||
epochNumber + 1, bestLearnRatePerSample, baseCriterion);
|
||||
|
||||
return bestLearnRatePerSample;
|
||||
}
|
||||
|
@ -1542,8 +1563,8 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
|
|||
if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
|
||||
{
|
||||
// newly started training: any previous MB size stored in the model is to be ignored
|
||||
fprintf(stderr, "before epoch .2, previous minibatchSize %zd is "
|
||||
"considered invalid -> resetting\n",
|
||||
LOGPRINTF(stderr, "before epoch .2, previous minibatchSize %zd is "
|
||||
"considered invalid -> resetting\n",
|
||||
m_prevChosenMinibatchSize);
|
||||
m_prevChosenMinibatchSize = 0;
|
||||
}
|
||||
|
@ -1553,9 +1574,9 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
|
|||
(epochNumber + 1) > m_minibatchSizeTuningFrequency &&
|
||||
(epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
|
||||
{
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
|
||||
"in epoch %d skipped, keeping minibatchSize of %zd\n",
|
||||
epochNumber + 1, m_prevChosenMinibatchSize);
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
|
||||
"in epoch %d skipped, keeping minibatchSize of %zd\n",
|
||||
epochNumber + 1, m_prevChosenMinibatchSize);
|
||||
chosenMinibatchSize = m_prevChosenMinibatchSize;
|
||||
}
|
||||
else
|
||||
|
@ -1565,9 +1586,9 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
|
|||
// if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2
|
||||
// is higher than initialMinibatchSize (the minibatch size we start with for this epoch),
|
||||
// then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize.
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
|
||||
"largest of previous minibatchSize = (%d / 2) or %d\n",
|
||||
(int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
|
||||
"largest of previous minibatchSize = (%d / 2) or %d\n",
|
||||
(int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
|
||||
minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
|
||||
}
|
||||
|
||||
|
@ -1578,8 +1599,8 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
|
|||
{
|
||||
assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
|
||||
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
|
||||
"previous minibatchSize %zd*2\n",
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
|
||||
"previous minibatchSize %zd*2\n",
|
||||
m_prevChosenMinibatchSize);
|
||||
maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
|
||||
}
|
||||
|
@ -1647,8 +1668,9 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
// round mbsize to something meaningful
|
||||
trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
|
||||
|
||||
fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
|
||||
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
|
||||
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
|
||||
|
||||
size_t totalSamplesSeen;
|
||||
std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
|
||||
|
@ -1675,7 +1697,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
lastTriedTrialEpochCriterion = baseCriterion;
|
||||
isFirstIteration = false;
|
||||
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
|
||||
}
|
||||
else if (!std::isnan(epochCriterion) &&
|
||||
(epochCriterion > (baseCriterion * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))))
|
||||
|
@ -1692,15 +1714,15 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
lastTriedTrialEpochCriterion = epochCriterion;
|
||||
if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
|
||||
{
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... "
|
||||
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
|
||||
epochCriterion, baseCriterion);
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Keep searching... "
|
||||
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
|
||||
epochCriterion, baseCriterion);
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
|
||||
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
|
||||
(int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
|
||||
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
|
||||
"EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
|
||||
(int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
|
||||
|
||||
return lastTriedTrialMinibatchSize;
|
||||
}
|
||||
|
@ -1732,18 +1754,18 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
|
|||
/*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
|
||||
prefixMsg);
|
||||
|
||||
fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
|
||||
LOGPRINTF(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
|
||||
|
||||
if (epochEvalErrors.size() == 1)
|
||||
fprintf(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
|
||||
LOGPRINTF(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "EvalErrPerSample ");
|
||||
LOGPRINTF(stderr, "EvalErrPerSample ");
|
||||
for (size_t i = 0; i < epochEvalErrors.size(); i++)
|
||||
{
|
||||
fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
|
||||
LOGPRINTF(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
|
||||
}
|
||||
fprintf(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
|
||||
LOGPRINTF(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
|
||||
}
|
||||
|
||||
int baseModelEpoch = epochNumber - 1;
|
||||
|
@@ -1813,13 +1835,18 @@ static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double
}

template <class ElemType>
int SGD<ElemType>::SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...)
int SGD<ElemType>::SGDTrace(FILE* __restrict __stream, bool isPrependTimestamp, const char* __restrict __format, ...)
{
int result = 0;
if (m_traceLevel > 0)
{
va_list args;
va_start(args, __format);
if (isPrependTimestamp)
{
PREPENDTS(__stream);
}

result = vfprintf(__stream, __format, args);
va_end(args);
}

@ -1886,10 +1913,10 @@ template <class ElemType>
|
|||
// we use simple linear (instead of log linear) scaling here
|
||||
const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
|
||||
#if DUMPOUTPUT
|
||||
fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
|
||||
learnRatePerSample, momentum, actualMBSize);
|
||||
fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
|
||||
sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
|
||||
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
|
||||
learnRatePerSample, momentum, actualMBSize);
|
||||
LOGPRINTF(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
|
||||
sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
|
||||
gradientValues.Print("Gradient Input");
|
||||
smoothedGradient.Print("Smoothed Gradient Input");
|
||||
#endif
|
||||
|
@ -1976,7 +2003,7 @@ void SGD<ElemType>::UpdateWeights(const ComputationNodeBasePtr& node,
|
|||
const bool useNesterovMomentum) const
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
|
||||
LOGPRINTF(stderr, "Update_%ls\n", node->NodeName().c_str());
|
||||
#endif
|
||||
if (!node->IsParameterUpdateRequired())
|
||||
LogicError("UpdateWeights() called for a learnable ComputationNode which has m_learningRateMultiplier == 0!");
|
||||
|
@ -2072,7 +2099,7 @@ bool SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
|
|||
wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
|
||||
if (!fexists(checkPointFileName.c_str()))
|
||||
{
|
||||
fprintf(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
|
||||
LOGPRINTF(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -2167,7 +2194,7 @@ int SGD<ElemType>::DetermineStartEpoch(const bool makeMode)
|
|||
}
|
||||
}
|
||||
if (firstEpoch == m_maxEpochs)
|
||||
fprintf(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
|
||||
LOGPRINTF(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
|
||||
|
||||
return firstEpoch;
|
||||
}
|
||||
|
@ -2201,7 +2228,8 @@ bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
|
|||
irow = max(0, irow);
|
||||
icol = max(0, icol);
|
||||
|
||||
fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "###### d%ls######\n", node->NodeName().c_str());
|
||||
|
||||
double eOrg = node->Value()(irow, icol);
|
||||
node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
|
||||
|
@ -2259,8 +2287,9 @@ bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
|
|||
bool wrong = (std::isnan(diff) || diff > threshold);
|
||||
if (wrong)
|
||||
{
|
||||
fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
|
||||
node->NodeName().c_str(), eGradNum, eGradErr);
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, "d%ls Numeric gradient = %e, Error BP gradient = %e\n",
|
||||
node->NodeName().c_str(), eGradNum, eGradErr);
|
||||
sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
|
||||
node->NodeName().c_str(), eGradNum, eGradErr);
|
||||
errMsgs.push_back(wstrtmp);
|
||||
|
|
|
@ -537,7 +537,7 @@ protected:
|
|||
shared_ptr<IMASGD<ElemType>> m_pMASGDHelper;
|
||||
|
||||
private:
|
||||
int SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...);
|
||||
int SGDTrace(FILE* __restrict __stream, bool isPrependTimestamp, const char* __restrict __format, ...);
|
||||
};
|
||||
|
||||
}}}
|
||||
|
|
|
@ -512,7 +512,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.60663s; TotalTimePerSample = 10.42652ms; SamplesPerSecond = 95
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.16469
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: running on localhost at 2015/10/06 12:01:30
|
||||
MPI Rank 1: command line options:
|
||||
|
@ -987,7 +987,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.63654s; TotalTimePerSample = 10.54614ms; SamplesPerSecond = 94
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.15309
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: running on localhost at 2015/10/06 12:01:30
|
||||
MPI Rank 2: command line options:
|
||||
|
@ -1462,7 +1462,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.61322s; TotalTimePerSample = 10.45290ms; SamplesPerSecond = 95
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.16806
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: running on localhost at 2015/10/06 12:01:31
|
||||
MPI Rank 3: command line options:
|
||||
|
@ -1937,5 +1937,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14587123; EvalErr[0]PerSample = 0.06400000; TotalTime = 2.58782s; TotalTimePerSample = 10.35128ms; SamplesPerSecond = 96
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15919599; EvalErrPerSample = 0.0765; AvgLearningRatePerSample = 0.00800000038; EpochTime=104.17161
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -512,7 +512,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38285ms; SamplesPerSecond = 2612
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.84576
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: running on localhost at 2015/10/06 11:58:26
|
||||
MPI Rank 1: command line options:
|
||||
|
@ -987,7 +987,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38285ms; SamplesPerSecond = 2612
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845791
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: running on localhost at 2015/10/06 11:58:27
|
||||
MPI Rank 2: command line options:
|
||||
|
@ -1462,7 +1462,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38284ms; SamplesPerSecond = 2612
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845644
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: running on localhost at 2015/10/06 11:58:27
|
||||
MPI Rank 3: command line options:
|
||||
|
@ -1937,5 +1937,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.09571s; TotalTimePerSample = 0.38284ms; SamplesPerSecond = 2612
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=3.845718
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -518,7 +518,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09126s; TotalTimePerSample = 0.36503ms; SamplesPerSecond = 2739
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.243167
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank1
|
||||
MPI Rank 1: -------------------------------------------------------------------
|
||||
|
@ -1003,7 +1003,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09356s; TotalTimePerSample = 0.37426ms; SamplesPerSecond = 2671
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.24663
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank2
|
||||
MPI Rank 2: -------------------------------------------------------------------
|
||||
|
@ -1488,7 +1488,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09356s; TotalTimePerSample = 0.37426ms; SamplesPerSecond = 2671
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.246647
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank3
|
||||
MPI Rank 3: -------------------------------------------------------------------
|
||||
|
@ -1973,5 +1973,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624377; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.09126s; TotalTimePerSample = 0.36503ms; SamplesPerSecond = 2739
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.159072; EvalErrPerSample = 0.0774; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.243121
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -518,7 +518,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13598s; TotalTimePerSample = 0.54394ms; SamplesPerSecond = 1838
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509512
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
|
||||
MPI Rank 1: -------------------------------------------------------------------
|
||||
|
@ -1003,7 +1003,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13600s; TotalTimePerSample = 0.54401ms; SamplesPerSecond = 1838
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509397
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
|
||||
MPI Rank 2: -------------------------------------------------------------------
|
||||
|
@ -1488,7 +1488,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13601s; TotalTimePerSample = 0.54403ms; SamplesPerSecond = 1838
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509323
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151006100025.151784\ParallelTraining\NoQuantization_DoublePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
|
||||
MPI Rank 3: -------------------------------------------------------------------
|
||||
|
@ -1973,5 +1973,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14604076; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13601s; TotalTimePerSample = 0.54403ms; SamplesPerSecond = 1838
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15920517; EvalErrPerSample = 0.0766; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.509212
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -496,7 +496,7 @@ MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56086s; TotalTimePerSample = 22.24344ms; SamplesPerSecond = 44
|
||||
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57143s; TotalTimePerSample = 22.28572ms; SamplesPerSecond = 44
|
||||
MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.08354
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: running on localhost at 2015/08/25 20:50:23
|
||||
MPI Rank 1: command line options:
|
||||
|
@ -955,7 +955,7 @@ MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56099s; TotalTimePerSample = 22.24397ms; SamplesPerSecond = 44
|
||||
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57339s; TotalTimePerSample = 22.29357ms; SamplesPerSecond = 44
|
||||
MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.07455
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: running on localhost at 2015/08/25 20:50:24
|
||||
MPI Rank 2: command line options:
|
||||
|
@ -1414,7 +1414,7 @@ MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.56393s; TotalTimePerSample = 22.25572ms; SamplesPerSecond = 44
|
||||
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.57187s; TotalTimePerSample = 22.28747ms; SamplesPerSecond = 44
|
||||
MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.08799
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: running on localhost at 2015/08/25 20:50:24
|
||||
MPI Rank 3: command line options:
|
||||
|
@ -1873,5 +1873,5 @@ MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20657080; EvalErr[0]PerSample = 0.11600000; TotalTime = 5.54955s; TotalTimePerSample = 22.19822ms; SamplesPerSecond = 45
|
||||
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566553; EvalErr[0]PerSample = 0.06400000; TotalTime = 5.58100s; TotalTimePerSample = 22.32401ms; SamplesPerSecond = 44
|
||||
MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15888368; EvalErrPerSample = 0.076499999; AvgLearningRatePerSample = 0.00800000038; EpochTime=222.07455
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -631,7 +631,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50281ms; SamplesPerSecond = 1988
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018144
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: running on localhost at 2015/10/24 12:44:54
|
||||
MPI Rank 1: command line:
|
||||
|
@ -1225,7 +1225,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.01855
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: running on localhost at 2015/10/24 12:44:54
|
||||
MPI Rank 2: command line:
|
||||
|
@ -1819,7 +1819,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018583
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: running on localhost at 2015/10/24 12:44:55
|
||||
MPI Rank 3: command line:
|
||||
|
@ -2413,5 +2413,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50282ms; SamplesPerSecond = 1988
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=5.018182
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -502,7 +502,7 @@ MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12883s; TotalTimePerSample = 0.51532ms; SamplesPerSecond = 1940
|
||||
MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035
|
||||
MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887408
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank1
|
||||
MPI Rank 1: -------------------------------------------------------------------
|
||||
|
@ -971,7 +971,7 @@ MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12881s; TotalTimePerSample = 0.51526ms; SamplesPerSecond = 1940
|
||||
MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12287s; TotalTimePerSample = 0.49148ms; SamplesPerSecond = 2034
|
||||
MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887381
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank2
|
||||
MPI Rank 2: -------------------------------------------------------------------
|
||||
|
@ -1440,7 +1440,7 @@ MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12898s; TotalTimePerSample = 0.51592ms; SamplesPerSecond = 1938
|
||||
MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12295s; TotalTimePerSample = 0.49182ms; SamplesPerSecond = 2033
|
||||
MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887366
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825180808.217636\ParallelTraining\NoQuantization_SinglePrecision@debug_cpu/stderr_SimpleMultiGPU.logrank3
|
||||
MPI Rank 3: -------------------------------------------------------------------
|
||||
|
@ -1909,5 +1909,5 @@ MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 371- 380]: SamplesSeen = 250; TrainLossP
|
|||
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 381- 390]: SamplesSeen = 250; TrainLossPerSample = 0.20619434; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12882s; TotalTimePerSample = 0.51529ms; SamplesPerSecond = 1940
|
||||
MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 391- 400]: SamplesSeen = 250; TrainLossPerSample = 0.14624365; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12279s; TotalTimePerSample = 0.49116ms; SamplesPerSecond = 2036
|
||||
MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15907188; EvalErrPerSample = 0.077399999; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.887371
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -636,7 +636,7 @@ MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996
|
||||
MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931563
|
||||
MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 0: COMPLETED
|
||||
MPI Rank 0: __COMPLETED__
|
||||
MPI Rank 0: ~MPIWrapper
|
||||
MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
|
||||
MPI Rank 1: -------------------------------------------------------------------
|
||||
|
@ -1239,7 +1239,7 @@ MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980
|
||||
MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931591
|
||||
MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 1: COMPLETED
|
||||
MPI Rank 1: __COMPLETED__
|
||||
MPI Rank 1: ~MPIWrapper
|
||||
MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
|
||||
MPI Rank 2: -------------------------------------------------------------------
|
||||
|
@ -1842,7 +1842,7 @@ MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12606s; TotalTimePerSample = 0.50426ms; SamplesPerSecond = 1983
|
||||
MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931381
|
||||
MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 2: COMPLETED
|
||||
MPI Rank 2: __COMPLETED__
|
||||
MPI Rank 2: ~MPIWrapper
|
||||
MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
|
||||
MPI Rank 3: -------------------------------------------------------------------
|
||||
|
@ -2445,5 +2445,5 @@ MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; Trai
|
|||
MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985
|
||||
MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; AvgLearningRatePerSample = 0.00800000038; EpochTime=4.931393
|
||||
MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
|
||||
MPI Rank 3: COMPLETED
|
||||
MPI Rank 3: __COMPLETED__
|
||||
MPI Rank 3: ~MPIWrapper
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -11,5 +11,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1 +1 @@
|
|||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -9,5 +9,5 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
|
|
|
@ -2533,4 +2533,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
|
|||
Allocating matrices for forward and/or backward propagation.
|
||||
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9591762
|
||||
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9591762 Perplexity = 1052.766
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1075,9 +1075,11 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
|
|||
|
||||
|
||||
Allocating matrices for forward and/or backward propagation.
|
||||
|
||||
Minibatch[1-32]: SamplesSeen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9611959
|
||||
Final Results: Minibatch[1-32]: SamplesSeen = 500 Err: ErrorPrediction/Sample = 0.998 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9611959 Perplexity = 1054.8943
|
||||
|
||||
Action "test" complete.
|
||||
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
||||
|
|
|
@ -2981,4 +2981,4 @@ CUDA error 11 [c:\tools\cub-1.4.1\cub\device\dispatch/dispatch_radix_sort.cuh, 7
|
|||
CUDA error 11 [c:\tools\cub-1.4.1\cub\device\dispatch/dispatch_radix_sort.cuh, 796]: invalid argument
|
||||
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9566009
|
||||
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.992 CE: CrossEntropyWithSoftmax/Sample = 6.9566009 Perplexity = 1050.0582
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -2533,4 +2533,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
|
|||
Allocating matrices for forward and/or backward propagation.
|
||||
Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9640379
|
||||
Final Results: Minibatch[1-32]: Samples Seen = 500 Err: ErrorPrediction/Sample = 1 errTop5: ErrorPrediction/Sample = 0.996 CE: CrossEntropyWithSoftmax/Sample = 6.9640379 Perplexity = 1057.8967
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -8,24 +8,24 @@ tags:
|
|||
testCases:
|
||||
CNTK Run must be completed:
|
||||
patterns:
|
||||
- ^COMPLETED
|
||||
- __COMPLETED__
|
||||
|
||||
Must train epochs in exactly same order and parameters:
|
||||
patterns:
|
||||
- ^Starting Epoch {{integer}}
|
||||
- Starting Epoch {{integer}}
|
||||
- learning rate per sample = {{float}}
|
||||
- momentum = {{float}}
|
||||
|
||||
Epochs must be finished with expected results:
|
||||
patterns:
|
||||
- ^Finished Epoch[{{integer}} of {{integer}}]
|
||||
- Finished Epoch[{{integer}} of {{integer}}]
|
||||
- TrainLossPerSample = {{float,tolerance=.2%}}
|
||||
- EvalErrPerSample = {{float,tolerance=.2%}}
|
||||
- AvgLearningRatePerSample = {{float,tolerance=0.001%}}
|
||||
|
||||
Per-minibatch training results must match:
|
||||
patterns:
|
||||
- ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
|
||||
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
|
||||
- SamplesSeen = {{integer}}
|
||||
- TrainLossPerSample = {{float,tolerance=.2%}}
|
||||
|
||||
|
|
|
@ -913,7 +913,7 @@ already there from last epoch
|
|||
RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 38, 46, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30271576 Perplexity = 1.3535297
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
|
||||
|
@ -1734,4 +1734,4 @@ already there from last epoch
|
|||
RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 38, 46, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31798401 Perplexity = 1.3743543
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -915,7 +915,7 @@ RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
|
|||
RandomOrdering: recached sequence for seed 0: 38, 46, ...
|
||||
MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30270519 Perplexity = 1.3535154
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
-------------------------------------------------------------------
|
||||
|
@ -1737,4 +1737,4 @@ RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
|
|||
RandomOrdering: recached sequence for seed 0: 38, 46, ...
|
||||
MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31781933 Perplexity = 1.374128
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1468,7 +1468,7 @@ already there from last epoch
|
|||
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
randomordering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.31800278 Perplexity = 1.3743801
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
-------------------------------------------------------------------
|
||||
|
@ -2844,4 +2844,4 @@ already there from last epoch
|
|||
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
randomordering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.33039909 Perplexity = 1.3915234
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -910,7 +910,7 @@ already there from last epoch
|
|||
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111851 Perplexity = 1.3379231
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
-------------------------------------------------------------------
|
||||
|
@ -1727,4 +1727,4 @@ already there from last epoch
|
|||
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440025 Perplexity = 1.3558116
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -1410,7 +1410,7 @@ already there from last epoch
|
|||
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
randomordering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.31800278 Perplexity = 1.3743801
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
-------------------------------------------------------------------
|
||||
|
@ -2728,4 +2728,4 @@ already there from last epoch
|
|||
randomordering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
randomordering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.33039909 Perplexity = 1.3915234
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
|
@ -910,7 +910,7 @@ already there from last epoch
|
|||
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111847 Perplexity = 1.3379231
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
=== Deleting last epoch data
|
||||
==== Re-running from checkpoint
|
||||
-------------------------------------------------------------------
|
||||
|
@ -1727,4 +1727,4 @@ already there from last epoch
|
|||
RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition
|
||||
RandomOrdering: recached sequence for seed 0: 15, 33, ...
|
||||
Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440022 Perplexity = 1.3558116
|
||||
COMPLETED
|
||||
__COMPLETED__
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче