From 8672bcda1f9120f456c69584ee32bbede1d87b02 Mon Sep 17 00:00:00 2001
From: CeShine Lee
Date: Mon, 1 Feb 2021 21:07:33 +0800
Subject: [PATCH] Adafactor: avoid updating group["lr"] attributes (#9751)

This affects Adafactor with relative_step=False and scale_parameter=True.
Updating group["lr"] makes the result of ._get_lr() depend on the previous
call, i.e., on the scale of other parameters. This isn't supposed to happen.
---
 src/transformers/optimization.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 9e65710d2..e9fee7fda 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -546,7 +546,7 @@ class Adafactor(Optimizer):
 
                 state["step"] += 1
                 state["RMS"] = self._rms(p_data_fp32)
-                group["lr"] = self._get_lr(group, state)
+                lr = self._get_lr(group, state)
 
                 beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
                 update = (grad ** 2) + group["eps"][0]
@@ -567,7 +567,7 @@ class Adafactor(Optimizer):
                     update = exp_avg_sq.rsqrt().mul_(grad)
 
                 update.div_((self._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
-                update.mul_(group["lr"])
+                update.mul_(lr)
 
                 if use_first_moment:
                     exp_avg = state["exp_avg"]
@@ -575,7 +575,7 @@ class Adafactor(Optimizer):
                     update = exp_avg
 
                 if group["weight_decay"] != 0:
-                    p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
+                    p_data_fp32.add_(-group["weight_decay"] * lr, p_data_fp32)
 
                 p_data_fp32.add_(-update)
 
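
Note (not part of the patch): a minimal sketch of why writing the computed
learning rate back into group["lr"] couples parameters. The helper below only
paraphrases Adafactor._get_lr for the relative_step=False,
scale_parameter=True case; the names and values are illustrative assumptions,
not the library code.

    # Simplified stand-in for Adafactor._get_lr (assumption: relative_step=False,
    # scale_parameter=True), where the step size is scaled by the parameter's RMS.
    def get_lr(group, rms):
        param_scale = max(group["eps"][1], rms)
        return param_scale * group["lr"]

    group = {"lr": 1e-3, "eps": (1e-30, 1e-3)}

    # Old behaviour: the result is written back into group["lr"], so the second
    # parameter's step size also absorbs the first parameter's RMS.
    for rms in (10.0, 2.0):
        group["lr"] = get_lr(group, rms)
        print(group["lr"])   # 0.01, then 0.02 (compounds across parameters)

    # Patched behaviour: keep the per-parameter value in a local variable.
    group["lr"] = 1e-3
    for rms in (10.0, 2.0):
        lr = get_lr(group, rms)
        print(lr)            # 0.01, then 0.002 (independent per parameter)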