Mirror of https://github.com/microsoft/archai.git
Fix gradual warmup; enable dataset-specific toy mode batch sizes; ImageNet toy mode working; disable decay_bn for now
Parent: 74e34ff8a9
Commit: 38f921cded
@@ -62,7 +62,7 @@
             "args": ["--algos", "darts", "--datasets", "food101"]
         },
         {
-            "name": "Darts-ImageNet-Toy",
+            "name": "Darts-ImageNet-Eval-Toy",
             "type": "python",
             "request": "launch",
             "program": "${cwd}/scripts/main.py",
@@ -5,7 +5,6 @@ import math
 import torch
 from torch import nn
 from torch.optim import lr_scheduler, SGD, Adam
-from warmup_scheduler import GradualWarmupScheduler
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.optim.optimizer import Optimizer
 from torch.nn.modules.loss import _WeightedLoss, _Loss
@@ -16,6 +15,7 @@ import statopt
 from .config import Config
 from .cocob import CocobBackprop
 from .ml_losses import SmoothCrossEntropyLoss
+from .warmup_scheduler import GradualWarmupScheduler


 def create_optimizer(conf_opt:Config, params)->Optimizer:
@@ -0,0 +1,67 @@
+# Credits: https://github.com/ildoonet/pytorch-gradual-warmup-lr
+
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+
+class GradualWarmupScheduler(_LRScheduler):
+    """ Gradually warm-up(increasing) learning rate in optimizer.
+    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. if multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
+        total_epoch: target learning rate is reached at total_epoch, gradually
+        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
+    """
+
+    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
+        self.multiplier = multiplier
+        if self.multiplier < 1.:
+            raise ValueError(f'multiplier should be >= 1 but was {self.multiplier}')
+        self.total_epoch = total_epoch
+        self.after_scheduler = after_scheduler
+        self.finished = False
+        super(GradualWarmupScheduler, self).__init__(optimizer)
+
+    def get_lr(self):
+        if self.last_epoch > self.total_epoch:
+            if self.after_scheduler:
+                if not self.finished:
+                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
+                    self.finished = True
+                return self.after_scheduler.get_lr()
+            return [base_lr * self.multiplier for base_lr in self.base_lrs]
+
+        if self.multiplier == 1.0:
+            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
+        else:
+            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
+
+    def step_ReduceLROnPlateau(self, metrics, epoch=None):
+        if epoch is None:
+            epoch = self.last_epoch + 1
+        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
+        if self.last_epoch <= self.total_epoch:
+            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
+            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
+                param_group['lr'] = lr
+        else:
+            if epoch is None:
+                self.after_scheduler.step(metrics, None)
+            else:
+                self.after_scheduler.step(metrics, epoch - self.total_epoch)
+
+    def step(self, epoch=None, metrics=None):
+        if type(self.after_scheduler) != ReduceLROnPlateau:
+            if self.finished and self.after_scheduler:
+                if epoch is None:
+                    self.after_scheduler.step(None)
+                else:
+                    self.after_scheduler.step(epoch - self.total_epoch)
+                self._last_lr = self.after_scheduler.get_last_lr()
+            else:
+                return super(GradualWarmupScheduler, self).step(epoch)
+        else:
+            self.step_ReduceLROnPlateau(metrics, epoch)
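Note: a minimal sketch of how this warmup wrapper is typically driven. The lr/momentum/decay, the 8 warmup epochs, multiplier 1.0, and the cosine hand-off mirror the config changes further down; the stand-in model, the total epoch count, and the top-level import path are illustrative assumptions (inside the repo it is imported relatively, as shown in the hunk above).

from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from warmup_scheduler import GradualWarmupScheduler  # assumed: the new module above is on the import path

model = nn.Linear(10, 2)  # stand-in model, not from the repo
optimizer = SGD(model.parameters(), lr=2.048, momentum=0.875, weight_decay=3.05e-5)

# warm up for 8 epochs: with multiplier=1.0 the LR ramps from 0 up to the base LR (2.048),
# then GradualWarmupScheduler hands over to the cosine schedule (min_lr=0.0)
cosine = CosineAnnealingLR(optimizer, T_max=242, eta_min=0.0)  # T_max is an illustrative number
scheduler = GradualWarmupScheduler(optimizer, multiplier=1.0, total_epoch=8,
                                   after_scheduler=cosine)

for epoch in range(250):  # total epoch count is illustrative
    # ... train for one epoch, calling optimizer.step() per batch ...
    scheduler.step()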
@@ -10,7 +10,10 @@ common:
   checkpoint:
     filename: '$expdir/checkpoint.pth'
     freq: 10
   toy_mode: # this section will be used by toy.yaml to setup the toy mode
     max_batches: 4
+    train_batch: 32
+    test_batch: 64
+    # TODO: workers setting

   # redis address of Ray cluster. Use None for single node run
@@ -3,6 +3,10 @@ common:
   seed: 0.0
   apex:
     loss_scale: "128" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
+  toy_mode: # this section will be used by toy.yaml to setup the toy mode
+    max_batches: 25
+    train_batch: 64
+    test_batch: 64

 dataset_eval:
   name: 'imagenet'
@@ -63,11 +67,11 @@ nas:
   optimizer:
     lr: 2.048 # init learning rate
     decay: 3.05e-5
-    decay_bn: 0.0 # if NaN then same as decay otherwise apply different decay to BN layers
+    decay_bn: .NaN # if .NaN then same as decay otherwise apply different decay to BN layers
     momentum: 0.875 # pytorch default is 0.0
   lr_schedule:
     type: 'cosine'
     min_lr: 0.0 # min learning rate to be set in eta_min param of scheduler
     warmup: # increases LR from 0 to current in specified epochs and then hands over to main scheduler
-      multiplier: 1
+      multiplier: 1.0
       epochs: 8
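Note: decay_bn refers to the common pattern of giving BatchNorm parameters a separate weight decay; setting it to .NaN means BN parameters keep the same decay as everything else, which is why the commit message says decay_bn is disabled for now. A rough sketch of that pattern, not the repo's actual create_optimizer, with illustrative names and the config's values as defaults:

import math
from torch import nn
from torch.optim import SGD

def make_sgd(model: nn.Module, lr=2.048, momentum=0.875,
             decay=3.05e-5, decay_bn=float('nan')):
    """Illustrative helper: `decay` for regular weights, `decay_bn` for
    BatchNorm parameters; NaN means 'same as decay', matching the comment above."""
    if math.isnan(decay_bn):
        # single param group, uniform weight decay
        return SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=decay)
    bn_params, other_params = [], []
    for m in model.modules():
        bucket = bn_params if isinstance(m, nn.modules.batchnorm._BatchNorm) else other_params
        bucket.extend(p for p in m.parameters(recurse=False) if p.requires_grad)
    # two param groups with different weight decay for BN vs everything else
    return SGD([{'params': other_params, 'weight_decay': decay},
                {'params': bn_params, 'weight_decay': decay_bn}],
               lr=lr, momentum=momentum)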
@@ -17,29 +17,32 @@ nas:
   trainer:
     epochs: 0 # number of epochs model will be trained before search
   loader:
-    train_batch: 32
-    test_batch: 64
+    train_batch: '_copy: common/toy_mode/train_batch'
+    test_batch: '_copy: common/toy_mode/test_batch'
     dataset:
-      max_batches: 4
+      max_batches: '_copy: common/toy_mode/max_batches'
   post_train:
     trainer:
       epochs: 1
     loader:
-      train_batch: 32
-      test_batch: 64
+      train_batch: '_copy: common/toy_mode/train_batch'
+      test_batch: '_copy: common/toy_mode/test_batch'
       dataset:
-        max_batches: 4
+        max_batches: '_copy: common/toy_mode/max_batches'
   model_desc:
     n_reductions: 1 # number of reductions to be applied
     n_cells: 3 # number of cells
     n_nodes: 2 # number of nodes in a cell
   loader:
-    train_batch: 32
-    test_batch: 64
+    train_batch: '_copy: common/toy_mode/train_batch'
+    test_batch: '_copy: common/toy_mode/test_batch'
     dataset:
-      max_batches: 4
+      max_batches: '_copy: common/toy_mode/max_batches'
   trainer:
     epochs: 1
     logger_freq: 1
     validation:
       logger_freq: 1
 eval:
   data_parallel: False
   checkpoint: null
@@ -48,9 +51,13 @@ nas:
     n_nodes: 4 # number of nodes in a cell
     n_reductions: 2 # number of reductions to be applied
   loader:
-    train_batch: 32
-    test_batch: 64
+    train_batch: '_copy: common/toy_mode/train_batch'
+    test_batch: '_copy: common/toy_mode/test_batch'
     dataset:
-      max_batches: 2
+      max_batches: '_copy: common/toy_mode/max_batches'
   trainer:
     epochs: 1
     logger_freq: 1
     validation:
       logger_freq: 1
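Note: the '_copy: <path>' strings are config indirections; the loader substitutes the value found at the referenced path, so the toy-mode batch sizes and max_batches are set once per dataset in common/toy_mode and reused everywhere above. A simplified sketch of that substitution over a plain nested dict, not archai's actual Config implementation:

def resolve_copies(node, root):
    """Recursively replace '_copy: some/path' strings with the value found
    at that path in the root config (illustrative, simplified)."""
    if isinstance(node, dict):
        return {k: resolve_copies(v, root) for k, v in node.items()}
    if isinstance(node, str) and node.startswith('_copy:'):
        value = root
        for key in node.split(':', 1)[1].strip().split('/'):
            value = value[key]
        return value
    return node

conf = {
    'common': {'toy_mode': {'max_batches': 4, 'train_batch': 32, 'test_batch': 64}},
    'nas': {'eval': {'loader': {'train_batch': '_copy: common/toy_mode/train_batch'}}},
}
conf = resolve_copies(conf, conf)
assert conf['nas']['eval']['loader']['train_batch'] == 32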