Mirror of https://github.com/microsoft/archai.git
Added dedicated experiment runners for local search.
Parent: 1505de5b14
Commit: 12c1902351
@@ -39,8 +39,7 @@ class LocalNatsbenchTssFarExpRunner(ExperimentRunner):
    @overrides
    def searcher(self)->Searcher:
        return LocalSearchNatsbenchTSSReg()
        #return LocalNatsbenchTssFarSearcher()
        return LocalNatsbenchTssFarSearcher()

    @overrides
    def evaluater(self)->Evaluater:
@@ -0,0 +1,168 @@
import os
from overrides.overrides import overrides
from typing import List, Tuple
import math as ma
from copy import deepcopy

from archai.nas.discrete_search_space import DiscreteSearchSpace
from archai.nas.searcher import Searcher, SearchResult
from archai.common.common import logger
from archai.common.config import Config
from archai.common.trainer import Trainer
from archai.algos.local_search.local_search import LocalSearch
from archai.nas.arch_meta import ArchWithMetaData
from archai.common import utils
from archai.algos.proxynas.conditional_trainer import ConditionalTrainer
from archai.algos.proxynas.freeze_trainer import FreezeTrainer
from archai.search_spaces.discrete_search_spaces.natsbench_tss_search_spaces.discrete_search_space_natsbench_tss import DiscreteSearchSpaceNatsbenchTSS


class LocalSearchNatsbenchTSSFear(LocalSearch):
    @overrides
    def search(self, conf_search:Config)->SearchResult:

        # region config vars
        self.max_num_models = conf_search['max_num_models']
        self.ratio_fastest_duration = conf_search['ratio_fastest_duration']
        self.dataroot = utils.full_path(conf_search['loader']['dataset']['dataroot'])
        self.dataset_name = conf_search['loader']['dataset']['name']
        self.natsbench_location = os.path.join(self.dataroot, 'natsbench', conf_search['natsbench']['natsbench_tss_fast'])
        self.conf_train = conf_search['trainer']
        self.conf_loader = conf_search['loader']
        self.conf_train_freeze = conf_search['freeze_trainer']
        # endregion

        # eval cache so that if local search visits
        # a network already evaluated then we don't
        # evaluate it again.
        self.eval_cache = {}

        # cache of fear early rejects
        self.fear_early_rejects = {}

        # keep track of the fastest to train to
        # threshold train/val accuracy
        self.fastest_cond_train = ma.inf

        super().search(conf_search)

    @overrides
    def get_search_space(self)->DiscreteSearchSpaceNatsbenchTSS:
        return DiscreteSearchSpaceNatsbenchTSS(self.dataset_name,
                                               self.natsbench_location)

    @overrides
    def get_max_num_models(self)->int:
        return self.max_num_models

    @overrides
    def _check_membership(self,
                          archs_touched: List[ArchWithMetaData],
                          arch: ArchWithMetaData) -> bool:
        is_member = False
        for archmeta in archs_touched:
            if archmeta.metadata['archid'] == arch.metadata['archid']:
                is_member = True
        return is_member

    @overrides
    def _log_local_minima(self, curr_arch: ArchWithMetaData,
                          curr_acc: float,
                          num_evaluated: int) -> None:
        logger.pushd(f'local_minima_{num_evaluated}')
        curr_archid = curr_arch.metadata['archid']
        info = self.search_space.api.get_more_info(curr_archid, self.dataset_name, hp=200, is_random=False)
        curr_test_acc = info['test-accuracy']
        local_minimum = (curr_archid, curr_acc, curr_test_acc)
        logger.info({'output': local_minimum})
        self.local_minima.append(local_minimum)
        logger.popd()

    @overrides
    def _find_best_minimum(self)->Tuple[int, float, float]:
        best_minimum = max(self.local_minima, key=lambda x:x[1])
        return best_minimum

    @overrides
    def _evaluate(self, arch:ArchWithMetaData)->float:

        # see if we have visited this arch before
        if arch.metadata['archid'] in self.eval_cache:
            logger.info(f"{arch.metadata['archid']} is in cache! Returning from cache.")
            return self.eval_cache[arch.metadata['archid']].metadata['train_top1']

        if arch.metadata['archid'] in self.fear_early_rejects:
            logger.info(f"{arch.metadata['archid']} has already been early rejected!")
            return

        # if not in cache actually evaluate it
        # -------------------------------------
        # NOTE: we don't pass checkpoint to the trainers
        # as it creates complications and we don't need it
        # as these trainers are quite fast
        checkpoint = None

        # if during conditional training it
        # starts exceeding fastest time to
        # reach threshold by a ratio then early
        # terminate it
        logger.pushd(f"conditional_training_{arch.metadata['archid']}")
        data_loaders = self.get_data(self.conf_loader)
        time_allowed = self.ratio_fastest_duration * self.fastest_cond_train
        cond_trainer = ConditionalTrainer(self.conf_train, arch.arch, checkpoint, time_allowed)
        cond_trainer_metrics = cond_trainer.fit(data_loaders)
        cond_train_time = cond_trainer_metrics.total_training_time()

        if cond_train_time >= time_allowed:
            # this arch exceeded time to reach threshold
            # cut losses and move to next one
            logger.info(f"{arch.metadata['archid']} exceeded time allowed. Terminating and ignoring.")
            self.fear_early_rejects[arch.metadata['archid']] = arch
            logger.popd()
            return

        if cond_train_time < self.fastest_cond_train:
            self.fastest_cond_train = cond_train_time
            logger.info(f'fastest condition train till now: {self.fastest_cond_train} seconds!')
        logger.popd()

        # if we did not early terminate in conditional
        # training then freeze train
        # get data with new batch size for freeze training
        conf_loader_freeze = deepcopy(self.conf_loader)
        conf_loader_freeze['train_batch'] = self.conf_loader['freeze_loader']['train_batch']

        logger.pushd(f"freeze_training_{arch.metadata['archid']}")
        data_loaders = self.get_data(conf_loader_freeze, to_cache=False)
        # now just finetune the last few layers
        checkpoint = None
        trainer = FreezeTrainer(self.conf_train_freeze, arch.arch, checkpoint)
        freeze_train_metrics = trainer.fit(data_loaders)
        logger.popd()

        train_top1 = freeze_train_metrics.best_train_top1()
        arch.metadata['train_top1'] = train_top1
        # cache it
        self.eval_cache[arch.metadata['archid']] = arch
        return train_top1
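The heart of the FEAR evaluation above is a moving time budget: the fastest time any architecture has needed to reach the top-1 threshold, scaled by ratio_fastest_duration, caps every later candidate, and a candidate that blows past the cap is rejected before freeze training. A minimal, self-contained sketch of just that bookkeeping (the names below are invented for illustration and are not Archai APIs):

import math

# hypothetical helper mirroring the early-rejection bookkeeping in _evaluate() above
class FearBudget:
    def __init__(self, ratio_fastest_duration: float = 1.2):
        self.ratio = ratio_fastest_duration
        self.fastest_cond_train = math.inf  # fastest time to reach the accuracy threshold so far

    def time_allowed(self) -> float:
        # budget for the next candidate; infinite until the first architecture finishes
        return self.ratio * self.fastest_cond_train

    def update(self, cond_train_time: float) -> bool:
        # True means the candidate stayed within budget and should go on to freeze training
        if cond_train_time >= self.time_allowed():
            return False  # early reject, the analogue of fear_early_rejects
        self.fastest_cond_train = min(self.fastest_cond_train, cond_train_time)
        return True

budget = FearBudget(ratio_fastest_duration=1.2)
print(budget.update(120.0))  # True: first candidate always fits, budget is still infinite
print(budget.update(200.0))  # False: 200 >= 1.2 * 120, so it is early rejected

The first candidate always fits because the budget starts at infinity, which is exactly why fastest_cond_train is initialized to ma.inf in search() above.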
@@ -0,0 +1,52 @@
from overrides import overrides
from typing import Optional, Type, Tuple

from archai.nas.exp_runner import ExperimentRunner
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.nas.arch_trainer import TArchTrainer
from archai.common import common
from archai.common import utils
from archai.common.config import Config
from archai.nas.evaluater import Evaluater, EvalResult
from archai.nas.searcher import Searcher, SearchResult
from archai.nas.finalizers import Finalizers
from archai.nas.random_finalizers import RandomFinalizers
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.algos.local_search_natsbench.local_search_natsbench_tss_fear import LocalSearchNatsbenchTSSFear


class LocalSearchNatsbenchTSSFearExpRunner(ExperimentRunner):
    ''' Runs local search using FEAR on Natsbench space '''

    @overrides
    def model_desc_builder(self)->Optional[ModelDescBuilder]:
        return None

    @overrides
    def trainer_class(self)->TArchTrainer:
        return None # no search trainer

    @overrides
    def run_search(self, conf_search:Config)->SearchResult:
        search = self.searcher()
        return search.search(conf_search)

    @overrides
    def run_eval(self, conf_eval:Config)->EvalResult:
        evaler = self.evaluater()
        return evaler.evaluate(conf_eval)

    @overrides
    def searcher(self)->Searcher:
        return LocalSearchNatsbenchTSSFear()

    @overrides
    def evaluater(self)->Evaluater:
        return None

    @overrides
    def copy_search_to_eval(self) -> None:
        return None
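All of these runner methods rely on the overrides package to guard the subclassing contract. A tiny self-contained example of what the decorator checks (the class names below are invented for illustration):

from overrides import overrides

class BaseRunner:
    def searcher(self):
        raise NotImplementedError

class ChildRunner(BaseRunner):
    @overrides
    def searcher(self):
        # accepted: BaseRunner defines searcher(), so this really is an override
        return 'local_search'

print(ChildRunner().searcher())  # -> local_search

If a decorated method does not actually override anything in a base class, the decorator raises an error when the class is defined, which catches renamed or misspelled hook methods early.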
@@ -0,0 +1,52 @@
from overrides import overrides
from typing import Optional, Type, Tuple

from archai.nas.exp_runner import ExperimentRunner
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.nas.arch_trainer import TArchTrainer
from archai.common import common
from archai.common import utils
from archai.common.config import Config
from archai.nas.evaluater import Evaluater, EvalResult
from archai.nas.searcher import Searcher, SearchResult
from archai.nas.finalizers import Finalizers
from archai.nas.random_finalizers import RandomFinalizers
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.algos.local_search_natsbench.local_search_natsbench_tss_reg import LocalSearchNatsbenchTSSReg


class LocalSearchNatsbenchTSSRegExpRunner(ExperimentRunner):
    ''' Runs local search using regular evaluation on Natsbench space '''

    @overrides
    def model_desc_builder(self)->Optional[ModelDescBuilder]:
        return None

    @overrides
    def trainer_class(self)->TArchTrainer:
        return None # no search trainer

    @overrides
    def run_search(self, conf_search:Config)->SearchResult:
        search = self.searcher()
        return search.search(conf_search)

    @overrides
    def run_eval(self, conf_eval:Config)->EvalResult:
        evaler = self.evaluater()
        return evaler.evaluate(conf_eval)

    @overrides
    def searcher(self)->Searcher:
        return LocalSearchNatsbenchTSSReg()

    @overrides
    def evaluater(self)->Evaluater:
        return None

    @overrides
    def copy_search_to_eval(self) -> None:
        return None
@@ -84,7 +84,7 @@ nas:
      grad_clip: 5.0 # grads above this value are clipped
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'arch_train'
      epochs: 2
      epochs: 20
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      # additional vals for the derived class
      plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
@@ -0,0 +1,149 @@
__include__: "../datasets/cifar10.yaml" # default dataset settings are for cifar

common:
  experiment_name: 'throwaway' # you should supply from command line
  experiment_desc: 'throwaway'
  logdir: '~/logdir'
  log_prefix: 'log' # prefix for log files that will be created (log.log and log.yaml), no log files if ''
  log_level: 20 # logging.INFO
  backup_existing_log_file: False # should we overwrite existing log file without making a copy?
  yaml_log: True # if True, structured logs as yaml are also generated
  seed: 2.0
  tb_enable: False # if True then TensorBoard logging is enabled (may impact perf)
  tb_dir: '$expdir/tb' # path where tensorboard logs would be stored
  checkpoint:
    filename: '$expdir/checkpoint.pth'
    freq: 10

  # redis address of Ray cluster. Use None for single node run
  # otherwise it should be something like host:6379. Make sure to run on head node:
  # "ray start --head --redis-port=6379"
  redis: null
  apex: # this is overridden in search and eval individually
    enabled: False # global switch to disable everything apex
    distributed_enabled: True # enable/disable distributed mode
    mixed_prec_enabled: True # switch to disable amp mixed precision
    gpus: '' # use GPU IDs specified here (comma separated), if '' then use all GPUs
    opt_level: 'O2' # optimization level for mixed precision
    bn_fp32: True # keep BN in fp32
    loss_scale: "dynamic" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
    sync_bn: False # should we replace BNs with sync BNs for distributed model
    scale_lr: True # scale learning rate in distributed mode
    min_world_size: 0 # allows to confirm we are indeed in distributed setting
    detect_anomaly: False # if True, PyTorch code will run 6X slower
    seed: '_copy: /common/seed'
    ray:
      enabled: False # initialize ray. Note: ray cannot be used if apex distributed is enabled
      local_mode: False # if True then ray runs in serial mode

  smoke_test: False
  only_eval: False
  resume: True

dataset: {} # default dataset settings come from __include__ on the top

nas:
  search:
    use_fear: True
    max_num_models: 300
    ratio_fastest_duration: 1.2
    natsbench:
      natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
    finalizer: 'default' # options are 'random' or 'default'
    data_parallel: False
    checkpoint:
      _copy: '/common/checkpoint'
    resume: '_copy: /common/resume'
    full_desc_filename: '$expdir/full_model_desc.yaml' # arch before it was finalized
    final_desc_filename: '$expdir/final_model_desc.yaml' # final arch is saved in this file
    loader:
      apex:
        _copy: '../../trainer/apex'
      aug: '' # additional augmentations to use
      cutout: 0 # cutout length, use cutout augmentation when > 0
      load_train: True # load train split of dataset
      train_batch: 256
      freeze_loader:
        train_batch: 512 # batch size for freeze training
      train_workers: 4 # if null then gpu_count*4
      test_workers: '_copy: ../train_workers' # if null then 4
      load_test: False # load test split of dataset
      test_batch: 1024
      val_ratio: 0.0 # split portion for test set, 0 to 1
      val_fold: 0 # fold number to use (0 to 4)
      cv_num: 5 # total number of folds available
      dataset:
        _copy: '/dataset'
    trainer:
      use_val: False
      top1_acc_threshold: 0.1 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0
      drop_path_prob: 0.2 # probability that given edge will be dropped
      grad_clip: 5.0 # grads above this value are clipped
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'arch_train'
      epochs: 20
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      # additional vals for the derived class
      plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4
        momentum: 0.9 # pytorch default is 0
        nesterov: True
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate, this will be used in eta_min param of scheduler
        warmup: # increases LR for 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'search_val'
        logger_freq: 0
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'

    freeze_trainer:
      plotsdir: ''
      identifiers_to_unfreeze: ['classifier', 'lastact', 'cells.16', 'cells.15', 'cells.14', 'cells.13'] # last few layer names in natsbench: lastact, lastact.0, lastact.1: BN-Relu, global_pooling: global avg. pooling (doesn't get exposed as a named param though), classifier: linear layer
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0 # very important that this is 0.0 for freeze training
      drop_path_prob: 0.0 # very important that this is 0.0 for freeze training
      grad_clip: 5.0 # grads above this value are clipped
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'eval_train'
      epochs: 10
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4 # pytorch default is 0.0
        momentum: 0.9 # pytorch default is 0.0
        nesterov: True # pytorch default is False
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate to be set in eta_min param of scheduler
        warmup: # increases LR for 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'eval_test'
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        logger_freq: 0
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'
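The nas.search subtree above is what LocalSearchNatsbenchTSSFear.search() reads at startup. A rough sketch of those lookups against a plain dict (the dataset values are placeholders; in a real run they arrive through __include__ and the _copy references):

import os

# stand-in for the resolved nas.search subtree; dataset entries are placeholders,
# normally filled in from ../datasets/cifar10.yaml via _copy
conf_search = {
    'max_num_models': 300,
    'ratio_fastest_duration': 1.2,
    'natsbench': {'natsbench_tss_fast': 'NATS-tss-v1_0-3ffb9-simple'},
    'loader': {'dataset': {'dataroot': '~/dataroot', 'name': 'cifar10'},
               'freeze_loader': {'train_batch': 512},
               'train_batch': 256},
    'trainer': {'epochs': 20},
    'freeze_trainer': {'epochs': 10},
}

dataroot = os.path.expanduser(conf_search['loader']['dataset']['dataroot'])
natsbench_location = os.path.join(dataroot, 'natsbench',
                                  conf_search['natsbench']['natsbench_tss_fast'])
print(natsbench_location)  # .../natsbench/NATS-tss-v1_0-3ffb9-simple

The searcher itself resolves the path with utils.full_path; os.path.expanduser is used here only to keep the sketch dependency-free.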
@@ -0,0 +1,113 @@
__include__: "../datasets/cifar10.yaml" # default dataset settings are for cifar

common:
  experiment_name: 'throwaway' # you should supply from command line
  experiment_desc: 'throwaway'
  logdir: '~/logdir'
  log_prefix: 'log' # prefix for log files that will be created (log.log and log.yaml), no log files if ''
  log_level: 20 # logging.INFO
  backup_existing_log_file: False # should we overwrite existing log file without making a copy?
  yaml_log: True # if True, structured logs as yaml are also generated
  seed: 2.0
  tb_enable: False # if True then TensorBoard logging is enabled (may impact perf)
  tb_dir: '$expdir/tb' # path where tensorboard logs would be stored
  checkpoint:
    filename: '$expdir/checkpoint.pth'
    freq: 10

  # redis address of Ray cluster. Use None for single node run
  # otherwise it should be something like host:6379. Make sure to run on head node:
  # "ray start --head --redis-port=6379"
  redis: null
  apex: # this is overridden in search and eval individually
    enabled: False # global switch to disable everything apex
    distributed_enabled: True # enable/disable distributed mode
    mixed_prec_enabled: True # switch to disable amp mixed precision
    gpus: '' # use GPU IDs specified here (comma separated), if '' then use all GPUs
    opt_level: 'O2' # optimization level for mixed precision
    bn_fp32: True # keep BN in fp32
    loss_scale: "dynamic" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
    sync_bn: False # should we replace BNs with sync BNs for distributed model
    scale_lr: True # scale learning rate in distributed mode
    min_world_size: 0 # allows to confirm we are indeed in distributed setting
    detect_anomaly: False # if True, PyTorch code will run 6X slower
    seed: '_copy: /common/seed'
    ray:
      enabled: False # initialize ray. Note: ray cannot be used if apex distributed is enabled
      local_mode: False # if True then ray runs in serial mode

  smoke_test: False
  only_eval: False
  resume: True

dataset: {} # default dataset settings come from __include__ on the top

nas:
  search:
    use_fear: True
    max_num_models: 300
    ratio_fastest_duration: 1.2
    natsbench:
      natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
    finalizer: 'default' # options are 'random' or 'default'
    data_parallel: False
    checkpoint:
      _copy: '/common/checkpoint'
    resume: '_copy: /common/resume'
    full_desc_filename: '$expdir/full_model_desc.yaml' # arch before it was finalized
    final_desc_filename: '$expdir/final_model_desc.yaml' # final arch is saved in this file
    loader:
      apex:
        _copy: '../../trainer/apex'
      aug: '' # additional augmentations to use
      cutout: 0 # cutout length, use cutout augmentation when > 0
      load_train: True # load train split of dataset
      train_batch: 256
      freeze_loader:
        train_batch: 512 # batch size for freeze training
      train_workers: 4 # if null then gpu_count*4
      test_workers: '_copy: ../train_workers' # if null then 4
      load_test: False # load test split of dataset
      test_batch: 1024
      val_ratio: 0.0 # split portion for test set, 0 to 1
      val_fold: 0 # fold number to use (0 to 4)
      cv_num: 5 # total number of folds available
      dataset:
        _copy: '/dataset'
    trainer:
      use_val: False
      top1_acc_threshold: 0.1 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: 0.0
      drop_path_prob: 0.2 # probability that given edge will be dropped
      grad_clip: 5.0 # grads above this value are clipped
      logger_freq: 1000 # after every N updates dump loss and other metrics in logger
      title: 'arch_train'
      epochs: 20
      batch_chunks: 1 # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
      # additional vals for the derived class
      plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
      l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
      lossfn:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
        lr: 0.1 # init learning rate
        decay: 5.0e-4
        momentum: 0.9 # pytorch default is 0
        nesterov: True
        decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
      lr_schedule:
        type: 'cosine'
        min_lr: 0.000 # min learning rate, this will be used in eta_min param of scheduler
        warmup: # increases LR for 0 to current in specified epochs and then hands over to main scheduler
          multiplier: 1
          epochs: 0 # 0 disables warmup
      validation:
        title: 'search_val'
        logger_freq: 0
        batch_chunks: '_copy: ../../batch_chunks' # split batch into these many chunks and accumulate gradients so we can support GPUs with lower RAM
        freq: 1 # perform validation only every N epochs
        lossfn:
          type: 'CrossEntropyLoss'
@@ -34,6 +34,9 @@ from archai.algos.random_natsbench.random_natsbench_tss_reg_exp_runner import Ra
from archai.algos.random_darts.random_dartsspace_reg_exp_runner import RandomDartsSpaceRegExpRunner
from archai.algos.random_darts.random_dartsspace_far_exp_runner import RandomDartsSpaceFarExpRunner
from archai.algos.local_search_natsbench.local_natsbench_tss_far_exp_runner import LocalNatsbenchTssFarExpRunner
from archai.algos.local_search_natsbench.local_search_natsbench_tss_fear_exp_runner import LocalSearchNatsbenchTSSFearExpRunner
from archai.algos.local_search_natsbench.local_search_natsbench_tss_reg_exp_runner import LocalSearchNatsbenchTSSRegExpRunner


def main():
    runner_types:Dict[str, Type[ExperimentRunner]] = {
@@ -64,7 +67,9 @@ def main():
        'random_natsbench_tss_reg': RandomNatsbenchTssRegExpRunner,
        'random_dartsspace_reg': RandomDartsSpaceRegExpRunner,
        'random_dartsspace_far': RandomDartsSpaceFarExpRunner,
        'local_natsbench_tss_far': LocalNatsbenchTssFarExpRunner
        'local_natsbench_tss_far': LocalNatsbenchTssFarExpRunner,
        'local_search_natsbench_tss_reg': LocalSearchNatsbenchTSSRegExpRunner,
        'local_search_natsbench_tss_fear': LocalSearchNatsbenchTSSFearExpRunner
    }

    parser = argparse.ArgumentParser(description='NAS E2E Runs')
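With these entries registered, the new runners are reachable through the same name-to-class dispatch as the existing algorithms. A stripped-down, runnable sketch of that pattern (stub classes stand in for the real ExperimentRunner subclasses; the command-line flag name and the runner constructor arguments are not shown in this hunk, so they are elided here):

from typing import Dict, Type

# self-contained stand-ins so the dispatch runs without archai installed;
# the real dict maps names to ExperimentRunner subclasses such as
# LocalSearchNatsbenchTSSFearExpRunner registered above
class StubRunner:
    name = 'base'
    def run_search(self, conf_search: dict) -> str:
        return f'{self.name} searched with {conf_search}'

class StubFearRunner(StubRunner):
    name = 'local_search_natsbench_tss_fear'

class StubRegRunner(StubRunner):
    name = 'local_search_natsbench_tss_reg'

runner_types: Dict[str, Type[StubRunner]] = {
    'local_search_natsbench_tss_fear': StubFearRunner,
    'local_search_natsbench_tss_reg': StubRegRunner,
}

algo = 'local_search_natsbench_tss_fear'  # normally parsed from the command line
runner = runner_types[algo]()             # constructor arguments elided in this sketch
print(runner.run_search({'max_num_models': 300}))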