Added dedicated experiment runners for local search.

Debadeepta Dey 2021-10-28 21:19:07 -07:00 committed by Gustavo Rosa
Parent 1505de5b14
Commit 12c1902351
8 changed files with 542 additions and 4 deletions

View file

@@ -39,8 +39,7 @@ class LocalNatsbenchTssFarExpRunner(ExperimentRunner):
     @overrides
     def searcher(self)->Searcher:
-        return LocalSearchNatsbenchTSSReg()
-        #return LocalNatsbenchTssFarSearcher()
+        return LocalNatsbenchTssFarSearcher()

     @overrides
     def evaluater(self)->Evaluater:
View file

@@ -0,0 +1,168 @@
import os
from overrides.overrides import overrides
from typing import List, Optional, Tuple
import math as ma
from copy import deepcopy
from archai.nas.discrete_search_space import DiscreteSearchSpace
from archai.nas.searcher import Searcher, SearchResult
from archai.common.common import logger
from archai.common.config import Config
from archai.common.trainer import Trainer
from archai.algos.local_search.local_search import LocalSearch
from archai.nas.arch_meta import ArchWithMetaData
from archai.common import utils
from archai.algos.proxynas.conditional_trainer import ConditionalTrainer
from archai.algos.proxynas.freeze_trainer import FreezeTrainer
from archai.search_spaces.discrete_search_spaces.natsbench_tss_search_spaces.discrete_search_space_natsbench_tss import DiscreteSearchSpaceNatsbenchTSS

class LocalSearchNatsbenchTSSFear(LocalSearch):

    @overrides
    def search(self, conf_search:Config)->SearchResult:

        # region config vars
        self.max_num_models = conf_search['max_num_models']
        self.ratio_fastest_duration = conf_search['ratio_fastest_duration']
        self.dataroot = utils.full_path(conf_search['loader']['dataset']['dataroot'])
        self.dataset_name = conf_search['loader']['dataset']['name']
        self.natsbench_location = os.path.join(self.dataroot, 'natsbench', conf_search['natsbench']['natsbench_tss_fast'])
        self.conf_train = conf_search['trainer']
        self.conf_loader = conf_search['loader']
        self.conf_train_freeze = conf_search['freeze_trainer']
        # endregion

        # eval cache so that if local search visits
        # a network already evaluated then we don't
        # evaluate it again.
        self.eval_cache = {}

        # cache of fear early rejects
        self.fear_early_rejects = {}

        # keep track of the fastest to train to
        # threshold train/val accuracy
        self.fastest_cond_train = ma.inf

        super().search(conf_search)

    @overrides
    def get_search_space(self)->DiscreteSearchSpaceNatsbenchTSS:
        return DiscreteSearchSpaceNatsbenchTSS(self.dataset_name,
                                               self.natsbench_location)

    @overrides
    def get_max_num_models(self)->int:
        return self.max_num_models

    @overrides
    def _check_membership(self,
                          archs_touched: List[ArchWithMetaData],
                          arch: ArchWithMetaData) -> bool:
        for archmeta in archs_touched:
            if archmeta.metadata['archid'] == arch.metadata['archid']:
                return True
        return False

    @overrides
    def _log_local_minima(self, curr_arch: ArchWithMetaData,
                          curr_acc: float,
                          num_evaluated: int) -> None:
        logger.pushd(f'local_minima_{num_evaluated}')
        curr_archid = curr_arch.metadata['archid']
        info = self.search_space.api.get_more_info(curr_archid, self.dataset_name, hp=200, is_random=False)
        curr_test_acc = info['test-accuracy']
        local_minimum = (curr_archid, curr_acc, curr_test_acc)
        logger.info({'output': local_minimum})
        self.local_minima.append(local_minimum)
        logger.popd()

    @overrides
    def _find_best_minimum(self)->Tuple[int, float, float]:
        best_minimum = max(self.local_minima, key=lambda x:x[1])
        return best_minimum
    @overrides
    def _evaluate(self, arch:ArchWithMetaData)->Optional[float]:

        # see if we have visited this arch before
        if arch.metadata['archid'] in self.eval_cache:
            logger.info(f"{arch.metadata['archid']} is in cache! Returning from cache.")
            return self.eval_cache[arch.metadata['archid']].metadata['train_top1']

        if arch.metadata['archid'] in self.fear_early_rejects:
            logger.info(f"{arch.metadata['archid']} has already been early rejected!")
            return

        # if not in cache actually evaluate it
        # -------------------------------------

        # NOTE: we don't pass checkpoint to the trainers
        # as it creates complications and we don't need it
        # as these trainers are quite fast
        checkpoint = None

        # if during conditional training it
        # starts exceeding fastest time to
        # reach threshold by a ratio then early
        # terminate it
        logger.pushd(f"conditional_training_{arch.metadata['archid']}")
        data_loaders = self.get_data(self.conf_loader)
        time_allowed = self.ratio_fastest_duration * self.fastest_cond_train
        cond_trainer = ConditionalTrainer(self.conf_train, arch.arch, checkpoint, time_allowed)
        cond_trainer_metrics = cond_trainer.fit(data_loaders)
        cond_train_time = cond_trainer_metrics.total_training_time()

        if cond_train_time >= time_allowed:
            # this arch exceeded time to reach threshold
            # cut losses and move to next one
            logger.info(f"{arch.metadata['archid']} exceeded time allowed. Terminating and ignoring.")
            self.fear_early_rejects[arch.metadata['archid']] = arch
            logger.popd()
            return

        if cond_train_time < self.fastest_cond_train:
            self.fastest_cond_train = cond_train_time
            logger.info(f'fastest conditional train so far: {self.fastest_cond_train} seconds!')
        logger.popd()

        # if we did not early terminate in conditional
        # training then freeze train

        # get data with new batch size for freeze training
        conf_loader_freeze = deepcopy(self.conf_loader)
        conf_loader_freeze['train_batch'] = self.conf_loader['freeze_loader']['train_batch']

        logger.pushd(f"freeze_training_{arch.metadata['archid']}")
        data_loaders = self.get_data(conf_loader_freeze, to_cache=False)
        # now just finetune the last few layers
        checkpoint = None
        trainer = FreezeTrainer(self.conf_train_freeze, arch.arch, checkpoint)
        freeze_train_metrics = trainer.fit(data_loaders)
        logger.popd()
        train_top1 = freeze_train_metrics.best_train_top1()
        arch.metadata['train_top1'] = train_top1

        # cache it
        self.eval_cache[arch.metadata['archid']] = arch
        return train_top1
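A note on the mechanic in _evaluate() above: a candidate is abandoned as soon as its conditional-training time reaches ratio_fastest_duration times the fastest time-to-threshold seen so far, and every accepted candidate can tighten that bound. Below is a minimal, self-contained sketch of just this accept/reject rule; the archid labels and timings are made up for illustration and are not part of the commit.

import math

def fear_filter(candidate_times, ratio_fastest_duration=1.2):
    # keep candidates whose time-to-threshold stays within
    # ratio_fastest_duration * (fastest time seen so far); reject the rest
    fastest = math.inf
    accepted, rejected = [], []
    for archid, cond_train_time in candidate_times:
        time_allowed = ratio_fastest_duration * fastest
        if cond_train_time >= time_allowed:
            rejected.append(archid)  # early reject, mirroring fear_early_rejects
            continue
        accepted.append(archid)
        fastest = min(fastest, cond_train_time)  # mirrors the fastest_cond_train update
    return accepted, rejected

# toy timings in seconds: 'a3' exceeds 1.2 * 90.0 and is rejected
print(fear_filter([('a1', 100.0), ('a2', 90.0), ('a3', 130.0), ('a4', 95.0)]))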

View file

@@ -0,0 +1,52 @@
from overrides import overrides
from typing import Optional, Type, Tuple
from archai.nas.exp_runner import ExperimentRunner
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.nas.arch_trainer import TArchTrainer
from archai.common import common
from archai.common import utils
from archai.common.config import Config
from archai.nas.evaluater import Evaluater, EvalResult
from archai.nas.searcher import Searcher, SearchResult
from archai.nas.finalizers import Finalizers
from archai.nas.random_finalizers import RandomFinalizers
from archai.algos.local_search_natsbench.local_search_natsbench_tss_fear import LocalSearchNatsbenchTSSFear

class LocalSearchNatsbenchTSSFearExpRunner(ExperimentRunner):
    ''' Runs local search using FEAR on Natsbench space '''

    @overrides
    def model_desc_builder(self)->Optional[ModelDescBuilder]:
        return None

    @overrides
    def trainer_class(self)->TArchTrainer:
        return None # no search trainer

    @overrides
    def run_search(self, conf_search:Config)->SearchResult:
        search = self.searcher()
        return search.search(conf_search)

    @overrides
    def run_eval(self, conf_eval:Config)->EvalResult:
        evaler = self.evaluater()
        return evaler.evaluate(conf_eval)

    @overrides
    def searcher(self)->Searcher:
        return LocalSearchNatsbenchTSSFear()

    @overrides
    def evaluater(self)->Evaluater:
        return None

    @overrides
    def copy_search_to_eval(self) -> None:
        return None

View file

@@ -0,0 +1,52 @@
from overrides import overrides
from typing import Optional, Type, Tuple
from archai.nas.exp_runner import ExperimentRunner
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.nas.arch_trainer import TArchTrainer
from archai.common import common
from archai.common import utils
from archai.common.config import Config
from archai.nas.evaluater import Evaluater, EvalResult
from archai.nas.searcher import Searcher, SearchResult
from archai.nas.finalizers import Finalizers
from archai.nas.random_finalizers import RandomFinalizers
from archai.algos.local_search_natsbench.local_search_natsbench_tss_reg import LocalSearchNatsbenchTSSReg

class LocalSearchNatsbenchTSSRegExpRunner(ExperimentRunner):
    ''' Runs local search using regular evaluation on Natsbench space '''

    @overrides
    def model_desc_builder(self)->Optional[ModelDescBuilder]:
        return None

    @overrides
    def trainer_class(self)->TArchTrainer:
        return None # no search trainer

    @overrides
    def run_search(self, conf_search:Config)->SearchResult:
        search = self.searcher()
        return search.search(conf_search)

    @overrides
    def run_eval(self, conf_eval:Config)->EvalResult:
        evaler = self.evaluater()
        return evaler.evaluate(conf_eval)

    @overrides
    def searcher(self)->Searcher:
        return LocalSearchNatsbenchTSSReg()

    @overrides
    def evaluater(self)->Evaluater:
        return None

    @overrides
    def copy_search_to_eval(self) -> None:
        return None
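Both runner classes are deliberately thin: the search loop itself lives in the LocalSearch base class, which is not part of this diff, and the searchers plug into it through the overridden hooks (_evaluate, _check_membership, _log_local_minima, _find_best_minimum). The toy sketch below is only a guess at that template-method contract, to make the division of labor concrete; every name in it is a hypothetical stand-in, not the archai API.

from typing import List, Optional

class ToyLocalSearch:
    # base-class loop: move to the best unvisited neighbor until no
    # neighbor improves or the model budget is exhausted
    def search(self, start: int, max_num_models: int) -> int:
        touched: List[int] = [start]
        curr, curr_score = start, self._evaluate(start)
        while len(touched) < max_num_models:
            best_nbr, best_score = None, curr_score
            for nbr in self._neighbors(curr):
                if self._check_membership(touched, nbr):
                    continue  # already visited (cf. _check_membership above)
                touched.append(nbr)
                score = self._evaluate(nbr)
                if score is not None and score > best_score:
                    best_nbr, best_score = nbr, score
            if best_nbr is None:
                break  # a local optimum; the searchers above log these as local minima
            curr, curr_score = best_nbr, best_score
        return curr

    # hooks a subclass would override
    def _neighbors(self, x: int) -> List[int]:
        return [x - 1, x + 1]

    def _check_membership(self, touched: List[int], x: int) -> bool:
        return x in touched

    def _evaluate(self, x: int) -> Optional[float]:
        return -(x - 7) ** 2  # toy objective peaking at x == 7

print(ToyLocalSearch().search(start=0, max_num_models=50))  # -> 7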

View file

@@ -84,7 +84,7 @@ nas:
       grad_clip: 5.0 # grads above this value are clipped
       logger_freq: 1000 # after every N updates dump loss and other metrics in logger
       title: 'arch_train'
-      epochs: 2
+      epochs: 20
       batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
       # additional vals for the derived class
       plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir

View file

@@ -0,0 +1,149 @@
__include__: "../datasets/cifar10.yaml" # default dataset settings are for cifar

common:
  experiment_name: 'throwaway' # you should supply from command line
  experiment_desc: 'throwaway'
  logdir: '~/logdir'
  log_prefix: 'log' # prefix for log files that will be created (log.log and log.yaml), no log files if ''
  log_level: 20 # logging.INFO
  backup_existing_log_file: False # should we overwrite existing log file without making a copy?
  yaml_log: True # if True, structured logs as yaml are also generated
  seed: 2.0
  tb_enable: False # if True then TensorBoard logging is enabled (may impact perf)
  tb_dir: '$expdir/tb' # path where tensorboard logs would be stored

  checkpoint:
    filename: '$expdir/checkpoint.pth'
    freq: 10

  # redis address of Ray cluster. Use None for single node run,
  # otherwise it should be something like host:6379. Make sure to run on head node:
  # "ray start --head --redis-port=6379"
  redis: null

  apex: # this is overridden in search and eval individually
    enabled: False # global switch to disable everything apex
    distributed_enabled: True # enable/disable distributed mode
    mixed_prec_enabled: True # switch to disable amp mixed precision
    gpus: '' # use GPU IDs specified here (comma separated), if '' then use all GPUs
    opt_level: 'O2' # optimization level for mixed precision
    bn_fp32: True # keep BN in fp32
    loss_scale: "dynamic" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
    sync_bn: False # should we replace BNs with sync BNs for distributed model
    scale_lr: True # scale learning rate with world size in distributed mode
    min_world_size: 0 # allows to confirm we are indeed in distributed setting
    detect_anomaly: False # if True, PyTorch code will run 6X slower
    seed: '_copy: /common/seed'

  ray:
    enabled: False # initialize ray. Note: ray cannot be used if apex distributed is enabled
    local_mode: False # if True then ray runs in serial mode

  smoke_test: False
  only_eval: False
  resume: True

dataset: {} # default dataset settings come from __include__ at the top

nas:
  search:
    use_fear: True
    max_num_models: 300
    ratio_fastest_duration: 1.2
    natsbench:
      natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
    finalizer: 'default' # options are 'random' or 'default'
    data_parallel: False
    checkpoint:
      _copy: '/common/checkpoint'
    resume: '_copy: /common/resume'
    full_desc_filename: '$expdir/full_model_desc.yaml' # arch before it was finalized
    final_desc_filename: '$expdir/final_model_desc.yaml' # final arch is saved in this file

    loader:
      apex:
        _copy: '../../trainer/apex'
      aug: '' # additional augmentations to use
      cutout: 0 # cutout length, use cutout augmentation when > 0
      load_train: True # load train split of dataset
      train_batch: 256
      freeze_loader:
        train_batch: 512 # batch size for freeze training
      train_workers: 4 # if null then gpu_count*4
      test_workers: '_copy: ../train_workers' # if null then 4
      load_test: False # load test split of dataset
      test_batch: 1024
      val_ratio: 0.0 # split portion for validation set, 0 to 1
      val_fold: 0 # fold number to use (0 to 4)
      cv_num: 5 # total number of folds available
      dataset:
        _copy: '/dataset'

    trainer:
      use_val: False
      top1_acc_threshold: 0.1 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0
      drop_path_prob: 0.2 # probability that a given edge will be dropped
      grad_clip: 5.0 # grads above this value are clipped
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'arch_train'
      epochs: 20
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      # additional vals for the derived class
      plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4
        momentum: 0.9 # pytorch default is 0
        nesterov: True
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate, this will be used in eta_min param of scheduler
        warmup: # increases LR from 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'search_val'
        logger_freq: 0
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'

    freeze_trainer:
      plotsdir: ''
      identifiers_to_unfreeze: ['classifier', 'lastact', 'cells.16', 'cells.15', 'cells.14', 'cells.13'] # last few layer names in natsbench: lastact, lastact.0, lastact.1: BN-Relu, global_pooling: global avg. pooling (doesn't get exposed as a named param though), classifier: linear layer
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0 # very important that this is 0.0 for freeze training
      drop_path_prob: 0.0 # very important that this is 0.0 for freeze training
      grad_clip: 5.0 # grads above this value are clipped
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'eval_train'
      epochs: 10
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4 # pytorch default is 0.0
        momentum: 0.9 # pytorch default is 0.0
        nesterov: True # pytorch default is False
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate to be set in eta_min param of scheduler
        warmup: # increases LR from 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'eval_test'
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        logger_freq: 0
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'
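One convention in these configs worth spelling out: values like '_copy: /common/seed' are resolved by archai's Config loader, which replaces the string with the value found at the given path (absolute paths from the root, relative paths like '../train_workers' from the current node). The snippet below is a toy approximation of the absolute-path case only, assuming PyYAML is available; the real resolver lives in archai/common/config.py and also handles relative paths and '$expdir'-style substitutions, which this sketch omits.

import yaml

doc = yaml.safe_load('''
common:
  seed: 2.0
  apex:
    seed: '_copy: /common/seed'
''')

def resolve(node, root):
    # walk the tree; replace '_copy: /a/b' strings with the value at that path
    if isinstance(node, dict):
        return {k: resolve(v, root) for k, v in node.items()}
    if isinstance(node, str) and node.startswith('_copy:'):
        val = root
        for part in node.split(':', 1)[1].strip().strip('/').split('/'):
            val = val[part]
        return val
    return node

print(resolve(doc, doc)['common']['apex']['seed'])  # -> 2.0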

View file

@@ -0,0 +1,113 @@
__include__: "../datasets/cifar10.yaml" # default dataset settings are for cifar

common:
  experiment_name: 'throwaway' # you should supply from command line
  experiment_desc: 'throwaway'
  logdir: '~/logdir'
  log_prefix: 'log' # prefix for log files that will be created (log.log and log.yaml), no log files if ''
  log_level: 20 # logging.INFO
  backup_existing_log_file: False # should we overwrite existing log file without making a copy?
  yaml_log: True # if True, structured logs as yaml are also generated
  seed: 2.0
  tb_enable: False # if True then TensorBoard logging is enabled (may impact perf)
  tb_dir: '$expdir/tb' # path where tensorboard logs would be stored

  checkpoint:
    filename: '$expdir/checkpoint.pth'
    freq: 10

  # redis address of Ray cluster. Use None for single node run,
  # otherwise it should be something like host:6379. Make sure to run on head node:
  # "ray start --head --redis-port=6379"
  redis: null

  apex: # this is overridden in search and eval individually
    enabled: False # global switch to disable everything apex
    distributed_enabled: True # enable/disable distributed mode
    mixed_prec_enabled: True # switch to disable amp mixed precision
    gpus: '' # use GPU IDs specified here (comma separated), if '' then use all GPUs
    opt_level: 'O2' # optimization level for mixed precision
    bn_fp32: True # keep BN in fp32
    loss_scale: "dynamic" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
    sync_bn: False # should we replace BNs with sync BNs for distributed model
    scale_lr: True # scale learning rate with world size in distributed mode
    min_world_size: 0 # allows to confirm we are indeed in distributed setting
    detect_anomaly: False # if True, PyTorch code will run 6X slower
    seed: '_copy: /common/seed'

  ray:
    enabled: False # initialize ray. Note: ray cannot be used if apex distributed is enabled
    local_mode: False # if True then ray runs in serial mode

  smoke_test: False
  only_eval: False
  resume: True

dataset: {} # default dataset settings come from __include__ at the top

nas:
  search:
    use_fear: True
    max_num_models: 300
    ratio_fastest_duration: 1.2
    natsbench:
      natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
    finalizer: 'default' # options are 'random' or 'default'
    data_parallel: False
    checkpoint:
      _copy: '/common/checkpoint'
    resume: '_copy: /common/resume'
    full_desc_filename: '$expdir/full_model_desc.yaml' # arch before it was finalized
    final_desc_filename: '$expdir/final_model_desc.yaml' # final arch is saved in this file

    loader:
      apex:
        _copy: '../../trainer/apex'
      aug: '' # additional augmentations to use
      cutout: 0 # cutout length, use cutout augmentation when > 0
      load_train: True # load train split of dataset
      train_batch: 256
      freeze_loader:
        train_batch: 512 # batch size for freeze training
      train_workers: 4 # if null then gpu_count*4
      test_workers: '_copy: ../train_workers' # if null then 4
      load_test: False # load test split of dataset
      test_batch: 1024
      val_ratio: 0.0 # split portion for validation set, 0 to 1
      val_fold: 0 # fold number to use (0 to 4)
      cv_num: 5 # total number of folds available
      dataset:
        _copy: '/dataset'

    trainer:
      use_val: False
      top1_acc_threshold: 0.1 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0
      drop_path_prob: 0.2 # probability that a given edge will be dropped
      grad_clip: 5.0 # grads above this value are clipped
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'arch_train'
      epochs: 20
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      # additional vals for the derived class
      plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4
        momentum: 0.9 # pytorch default is 0
        nesterov: True
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate, this will be used in eta_min param of scheduler
        warmup: # increases LR from 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'search_val'
        logger_freq: 0
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'

View file

@@ -34,6 +34,9 @@ from archai.algos.random_natsbench.random_natsbench_tss_reg_exp_runner import Ra
 from archai.algos.random_darts.random_dartsspace_reg_exp_runner import RandomDartsSpaceRegExpRunner
 from archai.algos.random_darts.random_dartsspace_far_exp_runner import RandomDartsSpaceFarExpRunner
 from archai.algos.local_search_natsbench.local_natsbench_tss_far_exp_runner import LocalNatsbenchTssFarExpRunner
+from archai.algos.local_search_natsbench.local_search_natsbench_tss_fear_exp_runner import LocalSearchNatsbenchTSSFearExpRunner
+from archai.algos.local_search_natsbench.local_search_natsbench_tss_reg_exp_runner import LocalSearchNatsbenchTSSRegExpRunner

 def main():
     runner_types:Dict[str, Type[ExperimentRunner]] = {
@@ -64,7 +67,9 @@ def main():
         'random_natsbench_tss_reg': RandomNatsbenchTssRegExpRunner,
         'random_dartsspace_reg': RandomDartsSpaceRegExpRunner,
         'random_dartsspace_far': RandomDartsSpaceFarExpRunner,
-        'local_natsbench_tss_far': LocalNatsbenchTssFarExpRunner
+        'local_natsbench_tss_far': LocalNatsbenchTssFarExpRunner,
+        'local_search_natsbench_tss_reg': LocalSearchNatsbenchTSSRegExpRunner,
+        'local_search_natsbench_tss_fear': LocalSearchNatsbenchTSSFearExpRunner
     }

     parser = argparse.ArgumentParser(description='NAS E2E Runs')
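With the two new entries registered in runner_types, the searchers should be selectable by key from the command line, presumably along the lines of `python scripts/main.py --algos local_search_natsbench_tss_fear`; the exact flag name depends on the argparse setup, which is truncated in this hunk.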