зеркало из https://github.com/microsoft/archai.git
Getting ready to run large batch cell13 natsbench without augmentation.
This commit is contained in:
Родитель
247a2764b9
Коммит
ef0d9e1e03
|
@ -469,7 +469,7 @@
|
|||
"request": "launch",
|
||||
"program": "${cwd}/scripts/reports/analysis_freeze_natsbench_space.py",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxynas_nb_cell13_freeze0.8",
|
||||
"args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxy_nb_fast_cell13_freeze0.6_60",
|
||||
"--out-dir", "C:\\Users\\dedey\\archai_experiment_reports"]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -60,7 +60,8 @@ class FreezeNatsbenchExperimentRunner(ExperimentRunner):
|
|||
logger.pushd('regular_evaluate')
|
||||
arch_id = conf_eval['natsbench']['arch_index']
|
||||
dataroot = utils.full_path(conf_eval['loader']['dataset']['dataroot'])
|
||||
natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast'])
|
||||
natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast'])
|
||||
logger.info(natsbench_location)
|
||||
dataset_name = conf_eval['loader']['dataset']['name']
|
||||
|
||||
api = create(natsbench_location, 'tss', fast_mode=True, verbose=True)
|
||||
|
|
|
@ -13,8 +13,8 @@ nas:
|
|||
model_desc:
|
||||
num_edges_to_sample: 2
|
||||
loader:
|
||||
train_batch: 2048 # natsbench uses 256
|
||||
aug: 'fa_reduced_cifar10' # In the natsbench paper they use random flip and crop; we are doing a lot more here
|
||||
train_batch: 1024 # natsbench uses 256
|
||||
aug: '' # random flip and crop are already there in default params
|
||||
trainer: # matching natsbench paper closely
|
||||
plotsdir: ''
|
||||
apex:
|
||||
|
|
|
@ -8,7 +8,7 @@ nas:
|
|||
|
||||
eval:
|
||||
nasbench101:
|
||||
arch_index: 1891
|
||||
arch_index: 6758
|
||||
model_desc:
|
||||
num_edges_to_sample: 2
|
||||
loader:
|
||||
|
@ -20,7 +20,7 @@ nas:
|
|||
train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
|
||||
trainer:
|
||||
plotsdir: ''
|
||||
val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
|
||||
val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
|
||||
apex:
|
||||
_copy: '/common/apex'
|
||||
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
|
||||
|
|
|
@ -8,20 +8,20 @@ nas:
|
|||
|
||||
eval:
|
||||
natsbench:
|
||||
arch_index: 1891
|
||||
arch_index: 6758
|
||||
natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
|
||||
model_desc:
|
||||
num_edges_to_sample: 2
|
||||
loader:
|
||||
train_batch: 1024 # 1024 works reliably on V100. 2048 causes issues.
|
||||
aug: 'fa_reduced_cifar10' # in the natsbench paper they use random flip and crop; we are doing a lot more here
|
||||
train_batch: 1024 # 1024 and 2048 may be causing hang issues on cluster!
|
||||
aug: '' # in the natsbench paper they use random flip and crop, which are part of the regular transforms
|
||||
naswotrain:
|
||||
train_batch: 256 # batch size for computing trainingless score
|
||||
freeze_loader:
|
||||
train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
|
||||
trainer:
|
||||
plotsdir: ''
|
||||
val_top1_acc_threshold: 0.4 # after some accuracy we will shift into training only the last 'n' layers
|
||||
val_top1_acc_threshold: 0.6 # after some accuracy we will shift into training only the last 'n' layers
|
||||
apex:
|
||||
_copy: '/common/apex'
|
||||
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
|
||||
|
@ -36,7 +36,7 @@ nas:
|
|||
type: 'CrossEntropyLoss'
|
||||
optimizer:
|
||||
type: 'sgd'
|
||||
lr: 0.1 # init learning rate
|
||||
lr: 0.5 # init learning rate
|
||||
decay: 5.0e-4 # pytorch default is 0.0
|
||||
momentum: 0.9 # pytorch default is 0.0
|
||||
nesterov: True # pytorch default is False
|
||||
|
|
|
@ -93,31 +93,7 @@ def main():
|
|||
for key, data in a:
|
||||
logs[key] = data
|
||||
|
||||
# # single process parsing of yaml logs
|
||||
# for job_dir in tqdm(results_dir.iterdir()):
|
||||
# if job_dir.is_dir():
|
||||
# for subdir in job_dir.iterdir():
|
||||
# if not subdir.is_dir():
|
||||
# continue
|
||||
# # currently we expect that each job was ExperimentRunner job which should have
|
||||
# # _search or _eval folders
|
||||
# if subdir.stem.endswith('_search'):
|
||||
# sub_job = 'search'
|
||||
# elif subdir.stem.endswith('_eval'):
|
||||
# sub_job = 'eval'
|
||||
# else:
|
||||
# raise RuntimeError(f'Sub directory "{subdir}" in job "{job_dir}" must '
|
||||
# 'end with either _search or _eval which '
|
||||
# 'should be the case if ExperimentRunner was used.')
|
||||
|
||||
# logs_filepath = os.path.join(str(subdir), 'log.yaml')
|
||||
# if os.path.isfile(logs_filepath):
|
||||
# fix_yaml(logs_filepath)
|
||||
# with open(logs_filepath, 'r') as f:
|
||||
# key = job_dir.name + ':' + sub_job
|
||||
# logs[key] = yaml.load(f, Loader=yaml.Loader)
|
||||
|
||||
|
||||
|
||||
# examples of accessing logs
|
||||
# logs['proxynas_blahblah:eval']['naswotrain_evaluate']['eval_arch']['eval_train']['naswithouttraining']
|
||||
# logs['proxynas_blahblah:eval']['regular_evaluate']['regtrainingtop1']
|
||||
|
@ -140,7 +116,12 @@ def main():
|
|||
if 'eval' in key:
|
||||
try:
|
||||
|
||||
# TODO: if at the end of conditional training val accuracy has not gone above target then don't consider it
|
||||
# if val accuracy is still below the target at the end of conditional training, skip this architecture
|
||||
last_cond_epoch_key = list(logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'].keys())[-1]
|
||||
val_end_cond = logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'][last_cond_epoch_key]['val']['top1']
|
||||
if val_end_cond < 0.6:
|
||||
print('Found arch which did not reach condition at training')
|
||||
continue
|
||||
|
||||
# freeze evaluation
|
||||
#--------------------
|
||||
|
|
Загрузка…
Ссылка в новой задаче