Fix gradual warmup, enable dataset-specific toy mode batch sizes, get ImageNet toy mode working, disable decay_bn for now

Shital Shah 2020-04-25 07:23:33 -07:00
Parent 74e34ff8a9
Commit 38f921cded
6 changed files with 98 additions and 17 deletions

.vscode/launch.json (vendored)
View File

@@ -62,7 +62,7 @@
"args": ["--algos", "darts", "--datasets", "food101"]
},
{
"name": "Darts-ImageNet-Toy",
"name": "Darts-ImageNet-Eval-Toy",
"type": "python",
"request": "launch",
"program": "${cwd}/scripts/main.py",

View File

@@ -5,7 +5,6 @@ import math
import torch
from torch import nn
from torch.optim import lr_scheduler, SGD, Adam
from warmup_scheduler import GradualWarmupScheduler
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer
from torch.nn.modules.loss import _WeightedLoss, _Loss
@@ -16,6 +15,7 @@ import statopt
from .config import Config
from .cocob import CocobBackprop
from .ml_losses import SmoothCrossEntropyLoss
from .warmup_scheduler import GradualWarmupScheduler
def create_optimizer(conf_opt:Config, params)->Optimizer:

View File

@@ -0,0 +1,67 @@
# Credits: https://github.com/ildoonet/pytorch-gradual-warmup-lr
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

class GradualWarmupScheduler(_LRScheduler):
    """ Gradually warms up (increases) the learning rate in the optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. If multiplier = 1.0, lr starts from 0 and ends up at base_lr.
        total_epoch: target learning rate is reached at total_epoch, gradually
        after_scheduler: after total_epoch, use this scheduler (e.g. ReduceLROnPlateau)
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier < 1.:
            raise ValueError(f'multiplier should be >= 1 but was {self.multiplier}')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super(GradualWarmupScheduler, self).__init__(optimizer)

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    # hand over to the wrapped scheduler with warmed-up base LRs
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)
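
The wrapper above ramps the learning rate from 0 (when multiplier is 1.0) up to the base LR over total_epoch epochs, then delegates to after_scheduler. A minimal usage sketch follows, wiring it around a cosine schedule with the values from the ImageNet eval config in this commit (lr 2.048, momentum 0.875, decay 3.05e-5, 8 warmup epochs); the placeholder model, the total epoch count, and the import path are illustrative assumptions, not code from this repo:

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from warmup_scheduler import GradualWarmupScheduler   # adjust to the package this module lives in

model = torch.nn.Linear(10, 10)        # placeholder model, for illustration only
optimizer = SGD(model.parameters(), lr=2.048, momentum=0.875, weight_decay=3.05e-5)

total_epochs = 250                     # hypothetical epoch budget
cosine = CosineAnnealingLR(optimizer, T_max=total_epochs - 8, eta_min=0.0)
scheduler = GradualWarmupScheduler(optimizer, multiplier=1.0, total_epoch=8,
                                   after_scheduler=cosine)

for epoch in range(total_epochs):
    # ... run one training epoch, calling optimizer.step() per batch ...
    scheduler.step()                   # LR goes 0 -> 2.048 over 8 epochs, then cosine-decays to 0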

View File

@@ -10,7 +10,10 @@ common:
checkpoint:
filename: '$expdir/checkpoint.pth'
freq: 10
toy_mode: # this section will be used by toy.yaml to set up toy mode
max_batches: 4
train_batch: 32
test_batch: 64
# TODO: workers setting
# redis address of Ray cluster. Use None for single node run

View File

@@ -3,6 +3,10 @@ common:
seed: 0.0
apex:
loss_scale: "128" # loss scaling mode for mixed prec, must be string reprenting floar ot "dynamic"
toy_mode: # this section will be used by toy.yaml to set up toy mode
max_batches: 25
train_batch: 64
test_batch: 64
dataset_eval:
name: 'imagenet'
@@ -63,11 +67,11 @@ nas:
optimizer:
lr: 2.048 # init learning rate
decay: 3.05e-5
decay_bn: 0.0 # if NaN then same as decay otherwise apply different decay to BN layers
decay_bn: .NaN # if .NaN then same as decay otherwise apply different decay to BN layers
momentum: 0.875 # pytorch default is 0.0
lr_schedule:
type: 'cosine'
min_lr: 0.0 # min learning rate to be set in eta_min param of scheduler
warmup: # increases LR from 0 to current in specified epochs and then hands over to main scheduler
multiplier: 1
multiplier: 1.0
epochs: 8
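
Setting decay_bn to .NaN (YAML parses .NaN as a float NaN) signals "no separate decay for BatchNorm". The commit does not show how create_optimizer consumes this value, so the following is only a hedged sketch of one way a NaN sentinel could select between a single parameter group and split BN/non-BN groups; make_param_groups and the toy model are hypothetical, not code from this repo:

import math
import torch
from torch import nn
from torch.optim import SGD

def make_param_groups(model: nn.Module, decay: float, decay_bn: float):
    # NaN sentinel: BatchNorm weights share the regular weight decay
    if math.isnan(decay_bn):
        return [{'params': model.parameters(), 'weight_decay': decay}]
    bn_params, other_params = [], []
    for m in model.modules():
        bucket = bn_params if isinstance(m, nn.modules.batchnorm._BatchNorm) else other_params
        bucket.extend(m.parameters(recurse=False))
    return [{'params': other_params, 'weight_decay': decay},
            {'params': bn_params, 'weight_decay': decay_bn}]

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
optimizer = SGD(make_param_groups(model, decay=3.05e-5, decay_bn=float('nan')),
                lr=2.048, momentum=0.875)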

View File

@@ -17,29 +17,32 @@ nas:
trainer:
epochs: 0 # number of epochs model will be trained before search
loader:
train_batch: 32
test_batch: 64
train_batch: '_copy: common/toy_mode/train_batch'
test_batch: '_copy: common/toy_mode/test_batch'
dataset:
max_batches: 4
max_batches: '_copy: common/toy_mode/max_batches'
post_train:
trainer:
epochs: 1
loader:
train_batch: 32
test_batch: 64
train_batch: '_copy: common/toy_mode/train_batch'
test_batch: '_copy: common/toy_mode/test_batch'
dataset:
max_batches: 4
max_batches: '_copy: common/toy_mode/max_batches'
model_desc:
n_reductions: 1 # number of reductions to be applied
n_cells: 3 # number of cells
n_nodes: 2 # number of nodes in a cell
loader:
train_batch: 32
test_batch: 64
train_batch: '_copy: common/toy_mode/train_batch'
test_batch: '_copy: common/toy_mode/test_batch'
dataset:
max_batches: 4
max_batches: '_copy: common/toy_mode/max_batches'
trainer:
epochs: 1
logger_freq: 1
validation:
logger_freq: 1
eval:
data_parallel: False
checkpoint: null
@@ -48,9 +51,13 @@ nas:
n_nodes: 4 # number of nodes in a cell
n_reductions: 2 # number of reductions to be applied
loader:
train_batch: 32
test_batch: 64
train_batch: '_copy: common/toy_mode/train_batch'
test_batch: '_copy: common/toy_mode/test_batch'
dataset:
max_batches: 2
max_batches: '_copy: common/toy_mode/max_batches'
trainer:
epochs: 1
logger_freq: 1
validation:
logger_freq: 1
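
The '_copy: <path>' strings above point at keys under common/toy_mode, so every toy-mode loader now shares one set of batch sizes and one max_batches value. The resolution itself is handled by the repo's Config class, which is not shown in this diff; below is only a rough illustration of the idea, with resolve_copies being a hypothetical helper rather than the actual implementation:

import yaml

def resolve_copies(node, root):
    # walk the tree and replace '_copy: a/b/c' strings with the value found at that path
    if isinstance(node, dict):
        return {k: resolve_copies(v, root) for k, v in node.items()}
    if isinstance(node, str) and node.startswith('_copy:'):
        value = root
        for key in node[len('_copy:'):].strip().split('/'):
            value = value[key]
        return value
    return node

conf = yaml.safe_load('''
common:
  toy_mode:
    max_batches: 25
    train_batch: 64
    test_batch: 64
nas:
  eval:
    loader:
      train_batch: '_copy: common/toy_mode/train_batch'
''')
conf = resolve_copies(conf, conf)
assert conf['nas']['eval']['loader']['train_batch'] == 64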