From ef0d9e1e0382e2da2dfee08b42735ca4b4ec3d9b Mon Sep 17 00:00:00 2001 From: Debadeepta Dey Date: Fri, 22 Jan 2021 14:35:55 -0800 Subject: [PATCH] Getting ready to run large batch cell13 natsbench without augmentation. --- .vscode/launch.json | 2 +- .../freeze_natsbench_experiment_runner.py | 3 +- confs/algos/natsbench_regular_eval.yaml | 4 +-- confs/algos/proxynas_nasbench101_space.yaml | 4 +-- confs/algos/proxynas_natsbench_space.yaml | 10 +++--- .../analysis_freeze_natsbench_space.py | 33 ++++--------------- 6 files changed, 19 insertions(+), 37 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 27ca9277..c180f6be 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -469,7 +469,7 @@ "request": "launch", "program": "${cwd}/scripts/reports/analysis_freeze_natsbench_space.py", "console": "integratedTerminal", - "args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxynas_nb_cell13_freeze0.8", + "args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxy_nb_fast_cell13_freeze0.6_60", "--out-dir", "C:\\Users\\dedey\\archai_experiment_reports"] }, { diff --git a/archai/algos/proxynas/freeze_natsbench_experiment_runner.py b/archai/algos/proxynas/freeze_natsbench_experiment_runner.py index 9887e1ff..61a0718e 100644 --- a/archai/algos/proxynas/freeze_natsbench_experiment_runner.py +++ b/archai/algos/proxynas/freeze_natsbench_experiment_runner.py @@ -60,7 +60,8 @@ class FreezeNatsbenchExperimentRunner(ExperimentRunner): logger.pushd('regular_evaluate') arch_id = conf_eval['natsbench']['arch_index'] dataroot = utils.full_path(conf_eval['loader']['dataset']['dataroot']) - natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast']) + natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast']) + logger.info(natsbench_location) dataset_name = conf_eval['loader']['dataset']['name'] api = create(natsbench_location, 'tss', fast_mode=True, verbose=True) diff --git a/confs/algos/natsbench_regular_eval.yaml b/confs/algos/natsbench_regular_eval.yaml index 9612e03b..28c406f8 100644 --- a/confs/algos/natsbench_regular_eval.yaml +++ b/confs/algos/natsbench_regular_eval.yaml @@ -13,8 +13,8 @@ nas: model_desc: num_edges_to_sample: 2 loader: - train_batch: 2048 # natsbench uses 256 - aug: 'fa_reduced_cifar10' # In natsbench paper they use random flip and crop, we are doing lot more here + train_batch: 1024 # natsbench uses 256 + aug: '' # random flip and crop are already there in default params trainer: # matching natsbench paper closely plotsdir: '' apex: diff --git a/confs/algos/proxynas_nasbench101_space.yaml b/confs/algos/proxynas_nasbench101_space.yaml index 2eb4d171..f5662283 100644 --- a/confs/algos/proxynas_nasbench101_space.yaml +++ b/confs/algos/proxynas_nasbench101_space.yaml @@ -8,7 +8,7 @@ nas: eval: nasbench101: - arch_index: 1891 + arch_index: 6758 model_desc: num_edges_to_sample: 2 loader: @@ -20,7 +20,7 @@ nas: train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti. 
     trainer:
       plotsdir: ''
-      val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+      val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
       apex:
         _copy: '/common/apex'
       aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
diff --git a/confs/algos/proxynas_natsbench_space.yaml b/confs/algos/proxynas_natsbench_space.yaml
index 8650d4d8..57775740 100644
--- a/confs/algos/proxynas_natsbench_space.yaml
+++ b/confs/algos/proxynas_natsbench_space.yaml
@@ -8,20 +8,20 @@ nas:
 
   eval:
     natsbench:
-      arch_index: 1891
+      arch_index: 6758
       natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
     model_desc:
       num_edges_to_sample: 2
     loader:
-      train_batch: 1024 # 1024 works reliably on V100. 2048 causes issues.
-      aug: 'fa_reduced_cifar10' # in natsbench paper they use random flip and crop, we are doing lot more here
+      train_batch: 1024 # 1024 and 2048 may be causing hang issues on cluster!
+      aug: '' # in natsbench paper they use random flip and crop, which are part of the regular transforms
       naswotrain:
         train_batch: 256 # batch size for computing trainingless score
       freeze_loader:
         train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
     trainer:
       plotsdir: ''
-      val_top1_acc_threshold: 0.4 # after some accuracy we will shift into training only the last 'n' layers
+      val_top1_acc_threshold: 0.6 # after some accuracy we will shift into training only the last 'n' layers
       apex:
         _copy: '/common/apex'
       aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
@@ -36,7 +36,7 @@ nas:
         type: 'CrossEntropyLoss'
       optimizer:
         type: 'sgd'
-        lr: 0.1 # init learning rate
+        lr: 0.5 # init learning rate
         decay: 5.0e-4 # pytorch default is 0.0
         momentum: 0.9 # pytorch default is 0.0
         nesterov: True # pytorch default is False
diff --git a/scripts/reports/analysis_freeze_natsbench_space.py b/scripts/reports/analysis_freeze_natsbench_space.py
index fc6abd74..8d94cd1c 100644
--- a/scripts/reports/analysis_freeze_natsbench_space.py
+++ b/scripts/reports/analysis_freeze_natsbench_space.py
@@ -93,31 +93,7 @@ def main():
         for key, data in a:
             logs[key] = data
 
-    # # single process parsing of yaml logs
-    # for job_dir in tqdm(results_dir.iterdir()):
-    #     if job_dir.is_dir():
-    #         for subdir in job_dir.iterdir():
-    #             if not subdir.is_dir():
-    #                 continue
-    #             # currently we expect that each job was ExperimentRunner job which should have
-    #             # _search or _eval folders
-    #             if subdir.stem.endswith('_search'):
-    #                 sub_job = 'search'
-    #             elif subdir.stem.endswith('_eval'):
-    #                 sub_job = 'eval'
-    #             else:
-    #                 raise RuntimeError(f'Sub directory "{subdir}" in job "{job_dir}" must '
-    #                                 'end with either _search or _eval which '
-    #                                 'should be the case if ExperimentRunner was used.')
-
-    #             logs_filepath = os.path.join(str(subdir), 'log.yaml')
-    #             if os.path.isfile(logs_filepath):
-    #                 fix_yaml(logs_filepath)
-    #                 with open(logs_filepath, 'r') as f:
-    #                     key = job_dir.name + ':' + sub_job
-    #                     logs[key] = yaml.load(f, Loader=yaml.Loader)
-
-
+    # examples of accessing logs
     # logs['proxynas_blahblah:eval']['naswotrain_evaluate']['eval_arch']['eval_train']['naswithouttraining']
     # logs['proxynas_blahblah:eval']['regular_evaluate']['regtrainingtop1']
 
@@ -140,7 +116,12 @@ def main():
 
             if 'eval' in key:
                 try:
-                    # TODO: if at the end of conditional training val accuracy has not gone above target then don't consider it
+                    # if at the end of conditional training val accuracy has not gone above target then don't consider it
+                    last_cond_epoch_key = list(logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'].keys())[-1]
+                    val_end_cond = logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'][last_cond_epoch_key]['val']['top1']
+                    if val_end_cond < 0.6:
+                        print('Found arch which did not reach condition at training')
+                        continue
 
                     # freeze evaluation 
                     #--------------------
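
For reference, the conditional-training check added in the last hunk can be read as a small standalone helper. The sketch below is illustrative only and is not part of the patch: the helper name is hypothetical, the log keys mirror the hunk above, and the 0.6 cutoff follows the val_top1_acc_threshold setting in the yaml configs.

    # Illustrative sketch only, not part of the commit.
    # Assumes `eval_log` is one parsed log entry (logs[key]) with the structure
    # read by analysis_freeze_natsbench_space.py; the 0.6 default mirrors
    # val_top1_acc_threshold in the configs above.
    from typing import Any, Dict

    def passed_conditional_training(eval_log: Dict[str, Any], threshold: float = 0.6) -> bool:
        epochs = eval_log['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs']
        last_epoch_key = list(epochs.keys())[-1]
        # an arch counts only if its last conditional-training epoch reached the target val top-1
        return epochs[last_epoch_key]['val']['top1'] >= threshold

    # usage, mirroring the loop in the script:
    # if 'eval' in key and not passed_conditional_training(logs[key]):
    #     continue  # skip archs that never reached the conditional-training target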