BMUF metrics loss avg correction

This commit is contained in:
Mudit Jain 2017-11-22 14:43:35 -08:00
Родитель 6d539bbbc4
Коммит f31a944eb8
7 изменённых файлов: 246 добавлений и 11 удалений

@ -1 +1 @@
Subproject commit 5f613de6d19146e667dc200335b3a357119edf17
Subproject commit 70d0c096a11e3b370d1b529f098f9711a7966090

Просмотреть файл

@ -5048,6 +5048,15 @@ namespace CNTK
return 1;
}
//
// Aggregates the local training loss and eval metric values across workers
// before a training summary is written. The default implementation is a
// no-op; distributed learners that do not aggregate after every minibatch
// (e.g. BlockMomentum/BMUF) override this so that all workers report the
// same summarized metrics.
// Arguments are the local training loss value and the local eval criterion
// value, passed by reference — presumably so overrides can replace them
// with the aggregated result (confirm against overriding learners).
//
virtual void DoAggregateMetricsIfNeeded(NDArrayViewPtr&, NDArrayViewPtr&)
{
}
protected:
DistributedLearner(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples)
: Learner(learner? learner->Parameters() : std::vector<Parameter>(),

Просмотреть файл

@ -290,6 +290,24 @@ namespace CNTK
void Trainer::SummarizeTrainingProgress()
{
// Aggregate training loss and eval criteria across workers. Needed for BMUF-like learners which don't aggregate after every minibatch.
if (m_parameterLearners->DoAggregateMetricsIfNeededLambda)
{
NDArrayViewPtr localLossValue = nullptr;
if (m_aggregatedTrainingLossValue && m_aggregatedTrainingLossValue->IsInitialized())
{
localLossValue = m_aggregatedTrainingLossValue->Data();
}
NDArrayViewPtr localEvalCriterion = nullptr;
if (m_aggregatedTrainingEvalCriterionValue && m_aggregatedTrainingEvalCriterionValue->IsInitialized())
{
localEvalCriterion = m_aggregatedTrainingEvalCriterionValue->Data();
}
m_parameterLearners->DoAggregateMetricsIfNeededLambda(localLossValue, localEvalCriterion);
}
for (auto& progressWriter : m_progressWriters)
{
progressWriter->WriteTrainingSummary(m_aggregatedTrainingLossValue, m_aggregatedTrainingEvalCriterionValue);

Просмотреть файл

@ -991,7 +991,8 @@ namespace CNTK
Learners::Learners(const std::vector<LearnerPtr>& learners) :
m_learners(learners),
m_isDistributed(false)
m_isDistributed(false),
DoAggregateMetricsIfNeededLambda(nullptr)
{
if (learners.empty())
InvalidArgument("These must be at least one learner.");
@ -999,9 +1000,19 @@ namespace CNTK
std::unordered_set<Parameter> learnerParameters;
for (const auto& learner : m_learners)
{
if (dynamic_pointer_cast<DistributedLearner>(learner) != nullptr)
DistributedLearnerPtr distLearner = dynamic_pointer_cast<DistributedLearner>(learner);
if (distLearner)
{
m_isDistributed = true;
// When exactly one distributed learner is present, enable the lambda. This is used to correctly report loss and eval metrics in the BMUF learner case.
// TODO: Reconsider this design of working with only one distributed learner.
if (m_learners.size() == 1)
{
DoAggregateMetricsIfNeededLambda = std::bind(&DistributedLearner::DoAggregateMetricsIfNeeded, distLearner, std::placeholders::_1, std::placeholders::_2);
}
}
const auto& currentLearnerParameters = learner->Parameters();
for (const auto& parameter : currentLearnerParameters)
{
@ -1117,7 +1128,7 @@ namespace CNTK
InvalidArgument("Attempting to accumulate a null Value.");
bool copied = false;
if (m_isUninitialized ||
if (!m_isInitialized ||
GetDataType() != delta->GetDataType() ||
Shape() != delta->Shape() ||
Device() != device ||
@ -1127,7 +1138,7 @@ namespace CNTK
m_data = MakeSharedObject<NDArrayView>(delta->GetDataType(), delta->Shape(), device);
m_mask = delta->Mask();
ResetToZero();
m_isUninitialized = false;
m_isInitialized = true;
}
if (delta->GetDataType() == DataType::Float)
@ -1149,7 +1160,7 @@ namespace CNTK
void Accumulator::ResetToZero()
{
if (m_isUninitialized)
if (!m_isInitialized)
return;
if (GetDataType() == DataType::Float)

Просмотреть файл

@ -646,6 +646,8 @@ namespace CNTK
return m_isDistributed;
}
std::function<void(NDArrayViewPtr&, NDArrayViewPtr&)> DoAggregateMetricsIfNeededLambda;
private:
void GetLearnerGradients(LearnerPtr learner, const std::unordered_map<Parameter, NDArrayViewPtr>& allGradients, std::unordered_map<Parameter, NDArrayViewPtr>& learnerGradients);
void CheckDistributedLearners();
@ -713,15 +715,15 @@ namespace CNTK
class Accumulator : public Value
{
public:
Accumulator() : Value(nullptr), m_numUpdates(0), m_isUninitialized(true) {}
Accumulator() : Value(nullptr), m_numUpdates(0), m_isInitialized(false) {}
void Update(const ValuePtr& delta, const DeviceDescriptor& device);
void Reset();
bool IsInitialized() { return m_isInitialized; }
private:
void ResetToZero();
bool m_isUninitialized;
bool m_isInitialized;
size_t m_numUpdates;
};

Просмотреть файл

@ -87,7 +87,8 @@
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@ -130,4 +131,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -0,0 +1,194 @@
"""
This tests the metrics averaging functionality in BMUF. All workers should be reporting the same loss and eval metrics.
"""
import pytest
import cntk
import numpy as np
import sys, os
sys.path.append(os.path.dirname(__file__))
from distributed_learner_test import mpiexec_execute
import argparse
import re
import platform
# Fix the random seed so all workers build identical initial parameters.
cntk.cntk_py.set_fixed_random_seed(1)
#cntk.logging.set_trace_level(cntk.logging.TraceLevel.Info)
feat_dim = 5       # input feature dimension
label_dim = 3      # output label dimension
cell_dim = 5       # hidden / LSTM cell dimension
seq_len = 20       # nominal sequence length (varied by +/-5 in numpy mode)
num_batches = 101
batch_size = 10
progress_freq =10  # progress-printer summary frequency (in minibatches)
NUM_WORKERS = 4    # number of MPI ranks launched by the test
class SimpleBMUFTrainer():
    """Small model plus a Trainer wired to a block-momentum (BMUF)
    distributed learner.

    frame_mode=True builds a frame-level dense network; otherwise an LSTM
    over variable-length sequences.
    """

    def __init__(self, frame_mode=False):
        self.create_model(frame_mode)
        self.create_trainer()

    def create_model(self, frame_mode=False):
        """Create input variables, the network, and loss/error criteria."""
        if frame_mode:
            self.feat = cntk.input_variable(shape=(feat_dim,))
            self.label = cntk.input_variable((label_dim,))
            layers = [cntk.layers.Dense(cell_dim), cntk.layers.Dense(label_dim)]
        else:
            # sequence mode
            self.feat = cntk.sequence.input_variable(shape=(feat_dim,))
            self.label = cntk.sequence.input_variable((label_dim,))
            layers = [cntk.layers.Recurrence(cntk.layers.LSTM(shape=label_dim, cell_shape=(cell_dim,)))]
        self.output = cntk.layers.Sequential(layers)(self.feat)

        self.ce = cntk.cross_entropy_with_softmax(self.output, self.label)
        self.err = cntk.classification_error(self.output, self.label)

    def create_trainer(self):
        """Wrap momentum SGD in a block-momentum distributed learner and
        build the Trainer with a per-rank progress printer."""
        local_learner = cntk.momentum_sgd(self.output.parameters,
                                          cntk.learning_parameter_schedule(0.0001),
                                          cntk.momentum_as_time_constant_schedule(1000))
        learner = cntk.block_momentum_distributed_learner(
            local_learner, block_size=1000, block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)
        rank = cntk.distributed.Communicator.rank()
        printer = cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=rank)
        self.trainer = cntk.Trainer(self.output, (self.ce, self.err), [learner], [printer])
def get_minibatch(bmuf, working_dir, mb_source):
    """Yield up to num_batches minibatches bound to bmuf's input variables.

    mb_source selects the data pipeline:
      - "numpy": random in-memory sequences with cycling lengths.
      - "ctf_utterance": whole utterances from a small CTF file.
      - "ctf_frame": the same data in frame mode (no sequence ids).
      - "ctf_bptt": truncated-BPTT reading (truncation_length=2).

    bmuf: a SimpleBMUFTrainer providing .feat/.label input variables.
    working_dir: directory where the temporary CTF file is written.
    """
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    if mb_source == "numpy":
        for i in range(num_batches):
            features = []
            labels = []
            for j in range(batch_size):
                # cycle sequence lengths so minibatches are ragged
                seq_len_j = [seq_len, seq_len + 5, seq_len - 5][j % 3]
                x = np.random.rand(seq_len_j, feat_dim).astype(np.float32)
                y = np.random.rand(seq_len_j, label_dim).astype(np.float32)
                features.append(x)
                labels.append(y)
            yield {bmuf.feat: features, bmuf.label: labels}
    elif mb_source in ("ctf_utterance", "ctf_frame", "ctf_bptt"):
        if mb_source == "ctf_frame":
            # frame mode data without sequence ids.
            # (was: "ctf_data = ctf_data = ..." — redundant duplicated target)
            ctf_data = '''\
|S0 0.49 0.18 0.84 0.7 0.59 |S1 0.12 0.24 0.14
|S0 0.69 0.63 0.47 0.93 0.69 |S1 0.34 0.85 0.17
|S0 0.04 0.5 0.39 0.86 0.28 |S1 0.62 0.36 0.53
|S0 0.71 0.9 0.15 0.83 0.18 |S1 0.2 0.74 0.04
|S0 0.38 0.67 0.46 0.53 0.75 |S1 0.6 0.14 0.35
|S0 0.94 0.54 0.09 0.55 0.08 |S1 0.07 0.53 0.47
|S0 0.11 0.24 0.17 0.72 0.72 |S1 0.9 0.98 0.18
|S0 0.3 1. 0.34 0.06 0.78 |S1 0.15 0.69 0.63
|S0 0.69 0.86 0.59 0.49 0.99 |S1 0.13 0.6 0.21
'''
        else:
            # sequence mode data with sequence id
            ctf_data = '''\
0 |S0 0.49 0.18 0.84 0.7 0.59 |S1 0.12 0.24 0.14
0 |S0 0.69 0.63 0.47 0.93 0.69 |S1 0.34 0.85 0.17
0 |S0 0.04 0.5 0.39 0.86 0.28 |S1 0.62 0.36 0.53
0 |S0 0.71 0.9 0.15 0.83 0.18 |S1 0.2 0.74 0.04
0 |S0 0.38 0.67 0.46 0.53 0.75 |S1 0.6 0.14 0.35
0 |S0 0.94 0.54 0.09 0.55 0.08 |S1 0.07 0.53 0.47
0 |S0 0.11 0.24 0.17 0.72 0.72 |S1 0.9 0.98 0.18
2 |S0 0.3 1. 0.34 0.06 0.78 |S1 0.15 0.69 0.63
2 |S0 0.69 0.86 0.59 0.49 0.99 |S1 0.13 0.6 0.21
'''
        ctf_file = os.path.join(working_dir, '2seqtest.txt')
        with open(ctf_file, 'w') as f:
            f.write(ctf_data)

        # defaults correspond to the "ctf_utterance" mode
        frame_mode = False
        truncation_length = 0
        if mb_source == "ctf_frame":
            frame_mode = True
        elif mb_source == "ctf_bptt":
            truncation_length = 2

        mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
            features = StreamDef(field='S0', shape=feat_dim, is_sparse=False),
            labels = StreamDef(field='S1', shape=label_dim, is_sparse=False)
        )), randomize=False, max_samples = batch_size*num_batches,
        frame_mode=frame_mode, truncation_length=truncation_length)

        for i in range(num_batches):
            minibatch = mbs.next_minibatch(batch_size, {bmuf.feat: mbs.streams.features, bmuf.label: mbs.streams.labels})
            if not minibatch:
                break
            yield minibatch
def mpi_worker(working_dir, mb_source, gpu):
    """Per-rank training loop: seed numpy by rank, train on the selected
    minibatch source, and summarize progress every 50 minibatches."""
    rank = cntk.distributed.Communicator.rank()
    np.random.seed(rank)

    if gpu:
        # test with only one GPU
        cntk.try_set_default_device(cntk.gpu(0))

    bmuf = SimpleBMUFTrainer(mb_source == "ctf_frame")
    for count, data in enumerate(get_minibatch(bmuf, working_dir, mb_source)):
        bmuf.trainer.train_minibatch(data)
        if count % 50 == 0:
            bmuf.trainer.summarize_training_progress()
MB_SOURCES = ["numpy", "ctf_utterance", "ctf_frame", "ctf_bptt"]
#MB_SOURCES = ["numpy"]

@pytest.mark.parametrize("mb_source", MB_SOURCES)
def test_bmuf_correct_metrics_averaging(tmpdir, device_id, mb_source):
    """Launch NUM_WORKERS MPI ranks and verify every rank reports identical
    per-epoch loss/metric summaries, i.e. BMUF aggregates metrics before
    summarization."""
    if platform.system() == 'Linux':
        pytest.skip('test only runs on Windows due to mpiexec -l option')

    launch_args = []
    if device_id >= 0:
        launch_args += ['--gpu']
    launch_args += ["--outputdir", str(tmpdir)]
    launch_args += ["--mb_source", mb_source]

    ret_str = mpiexec_execute(__file__, ['-n', str(NUM_WORKERS), '-l'], launch_args)
    #print(ret_str)

    # [0]Finished Epoch[1]: [Training] loss = 1.663636 * 10, metric = 52.40% * 10 0.890s ( 11.2 samples/s);
    regex_pattern = r"\[(?P<worker_rank>\d)\].*? Epoch\[(?P<epoch>\d+)\].*? loss = (?P<loss>\d+\.\d+) .*? metric = (?P<metric>\d+\.\d+)"
    loss_perepoch_perworker = {i: {} for i in range(NUM_WORKERS)}
    for match in re.finditer(regex_pattern, ret_str):
        groups = match.groupdict()
        rank = int(groups["worker_rank"])
        epoch = int(groups["epoch"])
        loss_perepoch_perworker[rank][epoch] = (groups["loss"], groups["metric"])

    num_epochs_per_worker = [len(epochs) for epochs in loss_perepoch_perworker.values()]

    # assert that data exists
    assert len(num_epochs_per_worker) != 0
    # assert that number of epochs isn't zero for 1st worker.
    assert num_epochs_per_worker[0] != 0
    # assert all workers have same number of epochs
    assert min(num_epochs_per_worker) == max(num_epochs_per_worker)

    # assert all workers have same loss and metric values, epoch by epoch
    per_worker_epochsort = []
    for epoch_losses in loss_perepoch_perworker.values():
        per_worker_epochsort.append([epoch_losses[e] for e in sorted(epoch_losses)])
    assert all(series == per_worker_epochsort[0] for series in per_worker_epochsort)
if __name__ == "__main__":
    # Entry point executed by each MPI rank spawned via mpiexec_execute.
    parser = argparse.ArgumentParser()
    parser.add_argument('-outputdir', '--outputdir')
    parser.add_argument('-mb_source', '--mb_source')
    parser.add_argument('-gpu', '--gpu', action='store_true')
    opts = vars(parser.parse_args())
    mpi_worker(opts["outputdir"], opts["mb_source"], opts["gpu"])
    cntk.distributed.Communicator.finalize()