From 3437d12134893dd7b45737e422e105e511341297 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 23 Feb 2021 17:42:25 -0800
Subject: [PATCH] [Trainer/Deepspeed] handle get_last_lr() before first step()
 (#10362)

* handle get_last_lr() before first step()
* abstract away the lr getting logic
* cleanup
* add test
* move to utils
---
 examples/tests/deepspeed/test_deepspeed.py | 25 ++++++++++++++++++++++
 src/transformers/trainer.py                |  9 +++------
 src/transformers/trainer_pt_utils.py       | 24 +++++++++++++++++++++
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/examples/tests/deepspeed/test_deepspeed.py b/examples/tests/deepspeed/test_deepspeed.py
index cb8192b75..3e9f387e6 100644
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -78,6 +78,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus):
                 trainer.train()
         assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
 
+    def test_early_get_last_lr(self):
+        # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
+        # not run for the first few dozen steps while the loss scale is too large, and thus
+        # `get_last_lr` will fail if called during that warm-up stage.
+        #
+        # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
+        # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
+        with mockenv_context(**self.dist_env_1_gpu):
+            a = b = 0.0
+            trainer = get_regression_trainer(
+                a=a,
+                b=b,
+                local_rank=0,
+                train_len=8,
+                deepspeed=self.ds_config_file,
+                per_device_train_batch_size=8,
+                logging_steps=1,
+            )
+            trainer.train()
+            no_grad_accum_a = trainer.model.a.item()
+
+            # it's enough that train didn't fail for this test, but we must check that the
+            # optimizer/scheduler didn't run (since if it did, this test isn't testing the right thing)
+            self.assertEqual(no_grad_accum_a, a)
+
     def test_gradient_accumulation(self):
         # this test measures that we get identical weights and similar loss with:
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 962a6fb1b..5e805b62d 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -82,6 +82,7 @@ from .trainer_pt_utils import (
     SequentialDistributedSampler,
     distributed_broadcast_scalars,
     distributed_concat,
+    get_learning_rate,
     nested_concat,
     nested_detach,
     nested_numpify,
@@ -1129,12 +1130,8 @@ class Trainer:
             tr_loss -= tr_loss
 
             logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
-            # backward compatibility for pytorch schedulers
-            logs["learning_rate"] = (
-                self.lr_scheduler.get_last_lr()[0]
-                if version.parse(torch.__version__) >= version.parse("1.4")
-                else self.lr_scheduler.get_lr()[0]
-            )
+            logs["learning_rate"] = get_learning_rate(self)
+
             self._total_loss_scalar += tr_loss_scalar
             self._globalstep_last_logged = self.state.global_step
 
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 1a406eb00..ce4d400cc 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -24,6 +24,7 @@ from typing import Iterator, List, Optional, Union
 
 import numpy as np
 import torch
+from packaging import version
 from torch.utils.data.dataset import Dataset
 from torch.utils.data.distributed import DistributedSampler
 from torch.utils.data.sampler import RandomSampler, Sampler
@@ -262,6 +263,29 @@ def _get_first_shape(arrays):
     return arrays.shape
 
 
+def get_learning_rate(trainer):
+    if trainer.deepspeed:
+        # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
+        # not run for the first few dozen steps while the loss scale is too large, and thus
+        # `get_last_lr` will fail if called during that warm-up stage, so work around it:
+        try:
+            last_lr = trainer.lr_scheduler.get_last_lr()[0]
+        except AssertionError as e:
+            if "need to call step" in str(e):
+                logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
+                last_lr = 0
+            else:
+                raise
+    else:
+        last_lr = (
+            # backward compatibility for pytorch schedulers
+            trainer.lr_scheduler.get_last_lr()[0]
+            if version.parse(torch.__version__) >= version.parse("1.4")
+            else trainer.lr_scheduler.get_lr()[0]
+        )
+    return last_lr
+
+
 class DistributedTensorGatherer:
     """
     A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.
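
Reviewer note: the guarded scheduler access introduced above can be exercised in isolation. What follows is a minimal, self-contained sketch of the pattern, not library code; `FakeScheduler` and the `SimpleNamespace`-based trainer object are illustrative stand-ins that merely simulate the pre-step AssertionError deepspeed's setup can raise.

from types import SimpleNamespace


class FakeScheduler:
    """Illustrative stand-in that fails like a not-yet-stepped scheduler."""

    def __init__(self):
        self.stepped = False

    def get_last_lr(self):
        # before the first optimizer step there is no "last" lr to report
        assert self.stepped, "need to call step() before calling get_last_lr()"
        return [0.001]


trainer = SimpleNamespace(deepspeed=True, lr_scheduler=FakeScheduler())

# mirrors the patched logic: swallow only the "not stepped yet" assertion
try:
    last_lr = trainer.lr_scheduler.get_last_lr()[0]
except AssertionError as e:
    if "need to call step" in str(e):
        last_lr = 0
    else:
        raise

print(last_lr)  # -> 0 until the scheduler has actually stepped

Once the real scheduler has taken its first step, the try branch succeeds and the actual last learning rate is reported instead of the 0 placeholder.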