BMUF metrics loss avg correction

2017-11-22 14:43:35 -08:00 · 2017-11-22 14:43:35 -08:00 · f31a944eb8
--- a/Source/1BitSGD
+++ b/Source/1BitSGD
@ -1 +1 @@
-Subproject commit 5f613de6d19146e667dc200335b3a357119edf17
+Subproject commit 70d0c096a11e3b370d1b529f098f9711a7966090
--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@ -5048,6 +5048,15 @@ namespace CNTK
            return 1;
        }

+        //
+        // Hook to aggregate the loss and eval metrics across workers before summarization; this default does nothing.
+        // E.g. the Block Momentum Update and Filtering (BMUF) learner needs an aggregation of metrics.
+        // Arguments are the local training loss value and the local eval criterion value; either may be null.
+        //
+        virtual void DoAggregateMetricsIfNeeded(NDArrayViewPtr&, NDArrayViewPtr&)
+        {
+        }
+
    protected:
        DistributedLearner(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples)
            : Learner(learner? learner->Parameters() : std::vector<Parameter>(),
--- a/Source/CNTKv2LibraryDll/Trainer.cpp
+++ b/Source/CNTKv2LibraryDll/Trainer.cpp
@ -290,6 +290,24 @@ namespace CNTK

    void Trainer::SummarizeTrainingProgress()
    {
+        // Aggregate the training loss and eval criterion across workers. Needed for BMUF-like learners, which don't aggregate after every minibatch.
+        if (m_parameterLearners->DoAggregateMetricsIfNeededLambda)
+        {
+            NDArrayViewPtr localLossValue = nullptr;
+            if (m_aggregatedTrainingLossValue && m_aggregatedTrainingLossValue->IsInitialized())
+            {
+                localLossValue = m_aggregatedTrainingLossValue->Data();
+            }
+
+            NDArrayViewPtr localEvalCriterion = nullptr;
+            if (m_aggregatedTrainingEvalCriterionValue && m_aggregatedTrainingEvalCriterionValue->IsInitialized())
+            {
+                localEvalCriterion = m_aggregatedTrainingEvalCriterionValue->Data();
+            }
+
+            m_parameterLearners->DoAggregateMetricsIfNeededLambda(localLossValue, localEvalCriterion);
+        }
+
        for (auto& progressWriter : m_progressWriters)
        {
            progressWriter->WriteTrainingSummary(m_aggregatedTrainingLossValue, m_aggregatedTrainingEvalCriterionValue);
--- a/Source/CNTKv2LibraryDll/Utils.cpp
+++ b/Source/CNTKv2LibraryDll/Utils.cpp
@ -991,7 +991,8 @@ namespace CNTK

    Learners::Learners(const std::vector<LearnerPtr>& learners) :
        m_learners(learners),
-        m_isDistributed(false)
+        m_isDistributed(false),
+        DoAggregateMetricsIfNeededLambda(nullptr)
    {
        if (learners.empty())
            InvalidArgument("These must be at least one learner.");
@ -999,9 +1000,19 @@ namespace CNTK
        std::unordered_set<Parameter> learnerParameters;
        for (const auto& learner : m_learners)
        {
-            if (dynamic_pointer_cast<DistributedLearner>(learner) != nullptr)
+            DistributedLearnerPtr distLearner = dynamic_pointer_cast<DistributedLearner>(learner);
+            if (distLearner)
+            {
                m_isDistributed = true;

+                // When the only learner is a distributed learner, enable the lambda. This is used to correctly report loss and eval in the BMUF learner case.
+                // TODO: Reconsider this design of working with only 1 distributed learner.
+                if (m_learners.size() == 1)
+                {
+                    DoAggregateMetricsIfNeededLambda = std::bind(&DistributedLearner::DoAggregateMetricsIfNeeded, distLearner, std::placeholders::_1, std::placeholders::_2);
+                }
+            }
+
            const auto& currentLearnerParameters = learner->Parameters();
            for (const auto& parameter : currentLearnerParameters)
            {
@ -1117,7 +1128,7 @@ namespace CNTK
            InvalidArgument("Attempting to accumulate a null Value.");

        bool copied = false;
-        if (m_isUninitialized ||
+        if (!m_isInitialized ||
            GetDataType() != delta->GetDataType() ||
            Shape() != delta->Shape() ||
            Device() != device ||
@ -1127,7 +1138,7 @@ namespace CNTK
            m_data = MakeSharedObject<NDArrayView>(delta->GetDataType(), delta->Shape(), device);
            m_mask = delta->Mask();
            ResetToZero();
-            m_isUninitialized = false;
+            m_isInitialized = true;
        }

        if (delta->GetDataType() == DataType::Float)
@ -1149,7 +1160,7 @@ namespace CNTK

    void Accumulator::ResetToZero()
    {
-        if (m_isUninitialized)
+        if (!m_isInitialized)
            return;

        if (GetDataType() == DataType::Float)
--- a/Source/CNTKv2LibraryDll/Utils.h
+++ b/Source/CNTKv2LibraryDll/Utils.h
@ -646,6 +646,8 @@ namespace CNTK
            return m_isDistributed;
        }

+        // Callback that aggregates the training loss and eval criterion across workers.
+        // Null unless there is exactly one learner and it is a DistributedLearner, so check it before calling.
+        std::function<void(NDArrayViewPtr&, NDArrayViewPtr&)> DoAggregateMetricsIfNeededLambda;
+        
    private:
        void GetLearnerGradients(LearnerPtr learner, const std::unordered_map<Parameter, NDArrayViewPtr>& allGradients, std::unordered_map<Parameter, NDArrayViewPtr>& learnerGradients);
        void CheckDistributedLearners();
@ -713,15 +715,15 @@ namespace CNTK
    class Accumulator : public Value
    {
    public:
-        Accumulator() : Value(nullptr), m_numUpdates(0), m_isUninitialized(true) {}
+        Accumulator() : Value(nullptr), m_numUpdates(0), m_isInitialized(false) {}

        void Update(const ValuePtr& delta, const DeviceDescriptor& device);
        void Reset();
-
+        bool IsInitialized() const { return m_isInitialized; } // Reports m_isInitialized, which Update() sets once it has allocated the accumulator storage.
    private:
        void ResetToZero();

-        bool m_isUninitialized;
+        bool m_isInitialized;
        size_t   m_numUpdates;
    };

--- a/Tests/EndToEndTests/CNTKv2Library/EndToEndTests/V2LibraryEndToEndTests.vcxproj
+++ b/Tests/EndToEndTests/CNTKv2Library/EndToEndTests/V2LibraryEndToEndTests.vcxproj
@ -87,7 +87,8 @@
      <WarningLevel>Level4</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">Disabled</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@ -130,4 +131,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/bindings/python/cntk/learners/tests/bmuf_metrics_aggregation_test.py
+++ b/bindings/python/cntk/learners/tests/bmuf_metrics_aggregation_test.py
@ -0,0 +1,194 @@
+"""
+This tests the metrics averaging functionality in BMUF. All workers should be reporting the same loss and eval metrics.
+"""
+import pytest
+import cntk
+import numpy as np
+import sys, os
+sys.path.append(os.path.dirname(__file__))
+from distributed_learner_test import mpiexec_execute
+import argparse
+import re
+import platform
+
+cntk.cntk_py.set_fixed_random_seed(1)
+#cntk.logging.set_trace_level(cntk.logging.TraceLevel.Info)
+
+# Dimensions of the feature and label vectors fed to the model.
+feat_dim = 5
+label_dim = 3
+# Size of the first Dense layer (frame mode) and of the LSTM cell (sequence mode).
+cell_dim = 5
+# Base sequence length of the randomly generated ("numpy") data.
+seq_len = 20
+# Upper bound on the minibatches each worker trains on, and the size requested per minibatch.
+num_batches = 101
+batch_size = 10
+# Frequency passed to the ProgressPrinter.
+progress_freq =10
+# Number of MPI processes launched by the test.
+NUM_WORKERS = 4
+
+# Bundles a small model, its loss/error criteria and a cntk.Trainer that uses a
+# block-momentum distributed learner.
+class SimpleBMUFTrainer():
+    # frame_mode selects the feed-forward (True) or the recurrent (False) model.
+    def __init__(self, frame_mode=False):
+        self.create_model(frame_mode)
+        self.create_trainer()
+    
+    # Creates self.feat, self.label, self.output and the criteria self.ce / self.err.
+    def create_model(self, frame_mode=False):
+        if frame_mode:
+            # frame mode: plain (non-sequence) inputs through two Dense layers
+            self.feat = cntk.input_variable(shape=(feat_dim,))
+            self.label = cntk.input_variable((label_dim,))
+            
+            net = cntk.layers.Sequential([cntk.layers.Dense(cell_dim), cntk.layers.Dense(label_dim)])
+            self.output = net(self.feat)
+        else:    
+            # sequence mode: sequence inputs through a single recurrent LSTM layer
+            self.feat = cntk.sequence.input_variable(shape=(feat_dim,))
+            self.label = cntk.sequence.input_variable((label_dim,))
+            
+            net = cntk.layers.Sequential([cntk.layers.Recurrence(cntk.layers.LSTM(shape=label_dim, cell_shape=(cell_dim,)))])
+            self.output = net(self.feat)
+        
+        # Training loss and evaluation metric, both computed against self.label.
+        self.ce = cntk.cross_entropy_with_softmax(self.output, self.label)
+        self.err = cntk.classification_error(self.output, self.label)
+    
+    # Creates self.trainer; requires create_model() to have been called first (uses self.output, self.ce, self.err).
+    def create_trainer(self):
+        # Momentum SGD wrapped in the block-momentum distributed learner.
+        learner = cntk.block_momentum_distributed_learner(cntk.momentum_sgd(self.output.parameters, cntk.learning_parameter_schedule(0.0001), cntk.momentum_as_time_constant_schedule(1000)), 
+                                                          block_size=1000, block_learning_rate=0.01, block_momentum_as_time_constant=1000)
+        
+        # The communicator rank is handed to the ProgressPrinter as its rank argument.
+        comm_rank = cntk.distributed.Communicator.rank()
+        self.trainer = cntk.Trainer(self.output, (self.ce, self.err), [learner], [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
+
+def get_minibatch(bmuf, working_dir, mb_source):
+    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
+    
+    if mb_source == "numpy":
+        for i in range(num_batches):
+            features = []
+            labels = []
+            for j in range(batch_size):
+                seq_len_j = [seq_len, seq_len + 5, seq_len - 5][j % 3]
+                x = np.random.rand( seq_len_j, feat_dim).astype(np.float32)
+                y = np.random.rand( seq_len_j, label_dim).astype(np.float32)
+                features.append(x)    
+                labels.append(y)
+            yield {bmuf.feat: features, bmuf.label: labels}
+    
+    if mb_source in ("ctf_utterance", "ctf_frame", "ctf_bptt"):
+        if mb_source == "ctf_frame":
+            #frame mode data without sequence ids.
+            ctf_data = ctf_data = '''\
+|S0 0.49  0.18  0.84  0.7   0.59 |S1 0.12  0.24  0.14
+|S0 0.69  0.63  0.47  0.93  0.69 |S1 0.34  0.85  0.17
+|S0 0.04  0.5   0.39  0.86  0.28 |S1 0.62  0.36  0.53
+|S0 0.71  0.9   0.15  0.83  0.18 |S1 0.2   0.74  0.04
+|S0 0.38  0.67  0.46  0.53  0.75 |S1 0.6   0.14  0.35
+|S0 0.94  0.54  0.09  0.55  0.08 |S1 0.07  0.53  0.47
+|S0 0.11  0.24  0.17  0.72  0.72 |S1 0.9   0.98  0.18
+|S0 0.3   1.    0.34  0.06  0.78 |S1 0.15  0.69  0.63
+|S0 0.69  0.86  0.59  0.49  0.99 |S1 0.13  0.6   0.21
+'''
+        #sequence mode data with sequence id
+        else:
+            ctf_data = ctf_data = '''\
+0	|S0 0.49  0.18  0.84  0.7   0.59 |S1 0.12  0.24  0.14
+0	|S0 0.69  0.63  0.47  0.93  0.69 |S1 0.34  0.85  0.17
+0	|S0 0.04  0.5   0.39  0.86  0.28 |S1 0.62  0.36  0.53
+0	|S0 0.71  0.9   0.15  0.83  0.18 |S1 0.2   0.74  0.04
+0	|S0 0.38  0.67  0.46  0.53  0.75 |S1 0.6   0.14  0.35
+0	|S0 0.94  0.54  0.09  0.55  0.08 |S1 0.07  0.53  0.47
+0	|S0 0.11  0.24  0.17  0.72  0.72 |S1 0.9   0.98  0.18
+2	|S0 0.3   1.    0.34  0.06  0.78 |S1 0.15  0.69  0.63
+2	|S0 0.69  0.86  0.59  0.49  0.99 |S1 0.13  0.6   0.21
+'''
+
+        ctf_file = os.path.join(working_dir, '2seqtest.txt')
+        with open(ctf_file, 'w') as f:
+            f.write(ctf_data)
+    
+        # ctf_utterance model
+        frame_mode = False
+        truncation_length = 0
+        
+        if mb_source == "ctf_frame":
+            frame_mode = True
+        elif mb_source == "ctf_bptt":
+            truncation_length = 2
+            
+        mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
+            features  = StreamDef(field='S0', shape=feat_dim,  is_sparse=False),
+            labels    = StreamDef(field='S1', shape=label_dim,  is_sparse=False)
+        )), randomize=False, max_samples = batch_size*num_batches, 
+            frame_mode=frame_mode, truncation_length=truncation_length)
+        
+        for i in range(num_batches):
+            minibatch = mbs.next_minibatch(batch_size, {bmuf.feat: mbs.streams.features, bmuf.label: mbs.streams.labels})
+            if not minibatch:
+                break
+            yield minibatch
+    
+def mpi_worker(working_dir, mb_source, gpu):
+    comm_rank = cntk.distributed.Communicator.rank()
+    np.random.seed(comm_rank)
+    
+    if gpu:
+        # test with only one GPU
+        cntk.try_set_default_device(cntk.gpu(0))
+        
+    frame_mode = (mb_source == "ctf_frame")
+    bmuf = SimpleBMUFTrainer(frame_mode)
+    for i, data in enumerate(get_minibatch(bmuf, working_dir, mb_source)):        
+        bmuf.trainer.train_minibatch(data)        
+        if i % 50 == 0:
+            bmuf.trainer.summarize_training_progress()       
+
+MB_SOURCES = ["numpy", "ctf_utterance", "ctf_frame", "ctf_bptt"]
+#MB_SOURCES = ["numpy"]    
+@pytest.mark.parametrize("mb_source", MB_SOURCES)
+def test_bmuf_correct_metrics_averaging(tmpdir, device_id, mb_source):
+    if platform.system() == 'Linux':
+        pytest.skip('test only runs on Windows due to mpiexec -l option')
+        
+    launch_args = []
+    if device_id >= 0:
+        launch_args += ['--gpu']
+        
+    launch_args += ["--outputdir", str(tmpdir)]
+    launch_args += ["--mb_source", mb_source]
+    
+    ret_str = mpiexec_execute(__file__, ['-n', str(NUM_WORKERS), '-l'], launch_args)
+    #print(ret_str)
+    
+    # [0]Finished Epoch[1]: [Training] loss = 1.663636 * 10, metric = 52.40% * 10 0.890s ( 11.2 samples/s);
+    regex_pattern = r"\[(?P<worker_rank>\d)\].*? Epoch\[(?P<epoch>\d+)\].*? loss = (?P<loss>\d+\.\d+) .*? metric = (?P<metric>\d+\.\d+)"
+    loss_perepoch_perworker = {i:{} for i in range(NUM_WORKERS)}
+    for match in re.finditer(regex_pattern, ret_str):
+        rank = int(match.groupdict()["worker_rank"])
+        epoch = int(match.groupdict()["epoch"])
+        loss = match.groupdict()["loss"]
+        metric = match.groupdict()["metric"]
+        loss_perepoch_perworker[rank].update({epoch:(loss, metric)})
+       
+    num_epochs_per_worker = list(map(len,loss_perepoch_perworker.values()))
+    
+    #assert that data exists
+    assert len(num_epochs_per_worker) != 0
+    
+    #assert that number of epochs isn't zero for 1st worker.
+    assert num_epochs_per_worker[0] != 0
+    
+    # assert all workers have same number of epochs
+    assert min(num_epochs_per_worker) == max(num_epochs_per_worker)
+    
+    # assert all workers have same loss and metric values
+    loss_per_worker = loss_perepoch_perworker.values()
+    loss_per_worker_epochsort = []
+    for epoch_losses in loss_per_worker:
+        loss_per_worker_epochsort.append([epoch_losses[i] for i in sorted(epoch_losses)])
+        
+    assert all([loss_per_worker_epochsort[0] == i for i in loss_per_worker_epochsort])
+       
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-outputdir', '--outputdir')
+    parser.add_argument('-mb_source', '--mb_source')
+    parser.add_argument('-gpu', '--gpu', action='store_true')
+    args = vars(parser.parse_args())
+    
+    mpi_worker(args["outputdir"], args["mb_source"], args["gpu"])    
+    cntk.distributed.Communicator.finalize()