Getting ready to run large batch cell13 natsbench without augmentation.

2021-01-22 14:35:55 -08:00 · 2021-01-22 14:35:55 -08:00 · ef0d9e1e03
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -469,7 +469,7 @@
            "request": "launch",
            "program": "${cwd}/scripts/reports/analysis_freeze_natsbench_space.py",
            "console": "integratedTerminal",
-            "args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxynas_nb_cell13_freeze0.8",
+            "args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxy_nb_fast_cell13_freeze0.6_60",
            "--out-dir", "C:\\Users\\dedey\\archai_experiment_reports"]
        },
        {
--- a/archai/algos/proxynas/freeze_natsbench_experiment_runner.py
+++ b/archai/algos/proxynas/freeze_natsbench_experiment_runner.py
@ -60,7 +60,8 @@ class FreezeNatsbenchExperimentRunner(ExperimentRunner):
        logger.pushd('regular_evaluate')
        arch_id = conf_eval['natsbench']['arch_index']
        dataroot = utils.full_path(conf_eval['loader']['dataset']['dataroot'])    
-        natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast'])        
+        natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast'])
+        logger.info(natsbench_location)        
        dataset_name = conf_eval['loader']['dataset']['name']

        api = create(natsbench_location, 'tss', fast_mode=True, verbose=True)
--- a/confs/algos/natsbench_regular_eval.yaml
+++ b/confs/algos/natsbench_regular_eval.yaml
@ -13,8 +13,8 @@ nas:
    model_desc:
      num_edges_to_sample: 2
    loader:
-      train_batch: 2048 # natsbench uses 256
-      aug: 'fa_reduced_cifar10' # In natsbench paper they use random flip and crop, we are doing lot more here     
+      train_batch: 1024 # natsbench uses 256
+      aug: '' # random flip and crop are already there in default params
    trainer: # matching natsbench paper closely
      plotsdir: ''
      apex:
--- a/confs/algos/proxynas_nasbench101_space.yaml
+++ b/confs/algos/proxynas_nasbench101_space.yaml
@ -8,7 +8,7 @@ nas:

  eval:
    nasbench101:
-      arch_index: 1891
+      arch_index: 6758
    model_desc:
      num_edges_to_sample: 2
    loader:
@ -20,7 +20,7 @@ nas:
        train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
    trainer:
      plotsdir: ''
-      val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+      val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
--- a/confs/algos/proxynas_natsbench_space.yaml
+++ b/confs/algos/proxynas_natsbench_space.yaml
@ -8,20 +8,20 @@ nas:

  eval:
    natsbench:
-      arch_index: 1891
+      arch_index: 6758
      natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
    model_desc:
      num_edges_to_sample: 2
    loader:
-      train_batch: 1024 # 1024 works reliably on V100. 2048 causes issues.
-      aug: 'fa_reduced_cifar10' # in natsbench paper they use random flip and crop, we are doing lot more here
+      train_batch: 1024 # 1024 and 2048 may be causing hang issues on cluster!
+      aug: '' # in natsbench paper they use random flip and crop, which are part of the regular transforms
      naswotrain:
        train_batch: 256 # batch size for computing trainingless score
      freeze_loader:
        train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
    trainer:
      plotsdir: ''
-      val_top1_acc_threshold: 0.4 # after some accuracy we will shift into training only the last 'n' layers
+      val_top1_acc_threshold: 0.6 # after some accuracy we will shift into training only the last 'n' layers
      apex:
        _copy: '/common/apex'
      aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
@ -36,7 +36,7 @@ nas:
        type: 'CrossEntropyLoss'
      optimizer:
        type: 'sgd'
-        lr: 0.1 # init learning rate
+        lr: 0.5 # init learning rate
        decay: 5.0e-4 # pytorch default is 0.0
        momentum: 0.9 # pytorch default is 0.0
        nesterov: True # pytorch default is False
--- a/scripts/reports/analysis_freeze_natsbench_space.py
+++ b/scripts/reports/analysis_freeze_natsbench_space.py
@ -93,31 +93,7 @@ def main():
    for key, data in a:
        logs[key] = data

-    # # single process parsing of yaml logs
-    # for job_dir in tqdm(results_dir.iterdir()):
-    #     if job_dir.is_dir():
-    #         for subdir in job_dir.iterdir():
-    #             if not subdir.is_dir():
-    #                 continue
-    #             # currently we expect that each job was ExperimentRunner job which should have
-    #             # _search or _eval folders
-    #             if subdir.stem.endswith('_search'):
-    #                 sub_job = 'search'
-    #             elif subdir.stem.endswith('_eval'):
-    #                 sub_job = 'eval'
-    #             else:
-    #                 raise RuntimeError(f'Sub directory "{subdir}" in job "{job_dir}" must '
-    #                                 'end with either _search or _eval which '
-    #                                 'should be the case if ExperimentRunner was used.')
-
-    #             logs_filepath = os.path.join(str(subdir), 'log.yaml')
-    #             if os.path.isfile(logs_filepath):
-    #                 fix_yaml(logs_filepath)
-    #                 with open(logs_filepath, 'r') as f:
-    #                     key = job_dir.name + ':' + sub_job
-    #                     logs[key] = yaml.load(f, Loader=yaml.Loader)
-                        
-
+                   
    # examples of accessing logs
    # logs['proxynas_blahblah:eval']['naswotrain_evaluate']['eval_arch']['eval_train']['naswithouttraining']
    # logs['proxynas_blahblah:eval']['regular_evaluate']['regtrainingtop1']
@ -140,7 +116,12 @@ def main():
        if 'eval' in key:
            try:

-                # TODO: if at the end of conditional training val accuracy has not gone above target then don't consider it
+                # skip this arch if its val top1 accuracy at the end of conditional training is still below the target threshold
+                last_cond_epoch_key = list(logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'].keys())[-1]
+                val_end_cond = logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'][last_cond_epoch_key]['val']['top1']
+                if val_end_cond < 0.6:
+                    print('Found arch which did not reach condition at training')
+                    continue                

                # freeze evaluation 
                #--------------------