Getting ready to run large-batch cell13 natsbench without augmentation.

Debadeepta Dey 2021-01-22 14:35:55 -08:00 committed by Gustavo Rosa
Parent 247a2764b9
Commit ef0d9e1e03
6 changed files: 19 additions and 37 deletions

.vscode/launch.json (vendored)
View file

@@ -469,7 +469,7 @@
"request": "launch",
"program": "${cwd}/scripts/reports/analysis_freeze_natsbench_space.py",
"console": "integratedTerminal",
"args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxynas_nb_cell13_freeze0.8",
"args": ["--results-dir", "C:\\Users\\dedey\\Documents\\archaiphilly\\phillytools\\proxy_nb_fast_cell13_freeze0.6_60",
"--out-dir", "C:\\Users\\dedey\\archai_experiment_reports"]
},
{

View file

@@ -60,7 +60,8 @@ class FreezeNatsbenchExperimentRunner(ExperimentRunner):
logger.pushd('regular_evaluate')
arch_id = conf_eval['natsbench']['arch_index']
dataroot = utils.full_path(conf_eval['loader']['dataset']['dataroot'])
natsbench_location = os.path.join(dataroot, 'natsbench', conf_eval['natsbench']['natsbench_tss_fast'])
+ logger.info(natsbench_location)
dataset_name = conf_eval['loader']['dataset']['name']
api = create(natsbench_location, 'tss', fast_mode=True, verbose=True)
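
For context, create(natsbench_location, 'tss', fast_mode=True, verbose=True) is the standard nats_bench entry point for the topology search space. A minimal sketch of creating and querying the API, assuming the benchmark folder name and arch index 6758 from the configs in this commit, and cifar10 as the dataset:

    from nats_bench import create

    # fast-mode tss API over the simplified benchmark folder
    natsbench_location = 'path/to/natsbench/NATS-tss-v1_0-3ffb9-simple'
    api = create(natsbench_location, 'tss', fast_mode=True, verbose=True)

    # final accuracy of one architecture; hp='200' selects the full 200-epoch training budget
    info = api.get_more_info(6758, 'cifar10', hp='200', is_random=False)
    print(info['test-accuracy'])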

View file

@@ -13,8 +13,8 @@ nas:
model_desc:
num_edges_to_sample: 2
loader:
- train_batch: 2048 # natsbench uses 256
- aug: 'fa_reduced_cifar10' # the natsbench paper uses only random flip and crop; we are doing a lot more here
+ train_batch: 1024 # natsbench uses 256
+ aug: '' # random flip and crop are already part of the default transforms
trainer: # matching natsbench paper closely
plotsdir: ''
apex:
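
The aug change relies on the loader's defaults already containing random crop and flip. For reference, the standard CIFAR-10 pipeline the comment alludes to looks roughly like this torchvision sketch (not Archai's actual loader code; the mean/std are the usual CIFAR-10 statistics):

    import torchvision.transforms as T

    # random crop with 4-pixel padding plus horizontal flip: the "random flip
    # and crop" the natsbench paper uses and the default params already provide
    MEAN, STD = (0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)
    train_transform = T.Compose([
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(MEAN, STD),
    ])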

View file

@@ -8,7 +8,7 @@ nas:
eval:
nasbench101:
- arch_index: 1891
+ arch_index: 6758
model_desc:
num_edges_to_sample: 2
loader:
@@ -20,7 +20,7 @@ nas:
train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
trainer:
plotsdir: ''
- val_top1_acc_threshold: 0.10 # once val accuracy crosses this threshold we shift to training only the last 'n' layers
+ val_top1_acc_threshold: 0.60 # once val accuracy crosses this threshold we shift to training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
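
Raising val_top1_acc_threshold from 0.10 to 0.60 delays the handoff: the model trains normally until validation top-1 crosses the threshold, after which only the last layers stay trainable. A rough PyTorch sketch of that freezing step (illustrative only, not Archai's FreezeTrainer; counting top-level children via n_unfrozen is an assumption):

    import torch.nn as nn

    def freeze_all_but_last(model: nn.Module, n_unfrozen: int = 2) -> None:
        # freeze every parameter, then re-enable the last n_unfrozen
        # top-level child modules so only they receive gradient updates
        for p in model.parameters():
            p.requires_grad = False
        for child in list(model.children())[-n_unfrozen:]:
            for p in child.parameters():
                p.requires_grad = True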

View file

@@ -8,20 +8,20 @@ nas:
eval:
natsbench:
- arch_index: 1891
+ arch_index: 6758
natsbench_tss_fast: 'NATS-tss-v1_0-3ffb9-simple' # folder name in dataroot/natsbench that contains the tss fast mode folder
model_desc:
num_edges_to_sample: 2
loader:
- train_batch: 1024 # 1024 works reliably on V100; 2048 causes issues
- aug: 'fa_reduced_cifar10' # the natsbench paper uses only random flip and crop; we are doing a lot more here
+ train_batch: 1024 # 1024 and 2048 may be causing hang issues on the cluster!
+ aug: '' # the natsbench paper uses random flip and crop, which are already part of the regular transforms
naswotrain:
train_batch: 256 # batch size for computing trainingless score
freeze_loader:
train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
trainer:
plotsdir: ''
- val_top1_acc_threshold: 0.4 # once val accuracy crosses this threshold we shift to training only the last 'n' layers
+ val_top1_acc_threshold: 0.6 # once val accuracy crosses this threshold we shift to training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
@@ -36,7 +36,7 @@ nas:
type: 'CrossEntropyLoss'
optimizer:
type: 'sgd'
- lr: 0.1 # init learning rate
+ lr: 0.5 # init learning rate
decay: 5.0e-4 # pytorch default is 0.0
momentum: 0.9 # pytorch default is 0.0
nesterov: True # pytorch default is False
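
The optimizer block maps directly onto torch.optim.SGD; a minimal sketch with the values above (the one-layer model is only a stand-in for illustration):

    import torch

    model = torch.nn.Linear(10, 10)  # stand-in model
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.5,             # init learning rate, raised from 0.1 in this commit
        weight_decay=5e-4,  # 'decay' in the config
        momentum=0.9,
        nesterov=True,
    )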

View file

@@ -93,31 +93,7 @@ def main():
for key, data in a:
logs[key] = data
- # # single process parsing of yaml logs
- # for job_dir in tqdm(results_dir.iterdir()):
- #     if job_dir.is_dir():
- #         for subdir in job_dir.iterdir():
- #             if not subdir.is_dir():
- #                 continue
- #             # currently we expect that each job was an ExperimentRunner job which should have
- #             # _search or _eval folders
- #             if subdir.stem.endswith('_search'):
- #                 sub_job = 'search'
- #             elif subdir.stem.endswith('_eval'):
- #                 sub_job = 'eval'
- #             else:
- #                 raise RuntimeError(f'Sub directory "{subdir}" in job "{job_dir}" must '
- #                                    'end with either _search or _eval which '
- #                                    'should be the case if ExperimentRunner was used.')
- #             logs_filepath = os.path.join(str(subdir), 'log.yaml')
- #             if os.path.isfile(logs_filepath):
- #                 fix_yaml(logs_filepath)
- #                 with open(logs_filepath, 'r') as f:
- #                     key = job_dir.name + ':' + sub_job
- #                     logs[key] = yaml.load(f, Loader=yaml.Loader)
# examples of accessing logs
# logs['proxynas_blahblah:eval']['naswotrain_evaluate']['eval_arch']['eval_train']['naswithouttraining']
# logs['proxynas_blahblah:eval']['regular_evaluate']['regtrainingtop1']
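
The deleted block above was the single-process fallback; the surviving for key, data in a: loop collects results produced in parallel. A hedged sketch of what that parallel parse can look like (the worker and the work-item construction are assumptions, not the script's actual code):

    from multiprocessing import Pool

    import yaml

    def parse_one_job(item):
        # hypothetical worker: load one log.yaml and return (key, parsed dict)
        logs_filepath, key = item
        with open(logs_filepath, 'r') as f:
            return key, yaml.load(f, Loader=yaml.Loader)

    work_items = []  # [(path to log.yaml, 'jobname:search' or 'jobname:eval'), ...]
    logs = {}
    with Pool() as pool:
        for key, data in pool.map(parse_one_job, work_items):
            logs[key] = data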
@@ -140,7 +116,12 @@ def main():
if 'eval' in key:
try:
- # TODO: if at the end of conditional training val accuracy has not gone above the target then don't consider it
+ # if at the end of conditional training val accuracy has not gone above the target then don't consider it
+ last_cond_epoch_key = list(logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'].keys())[-1]
+ val_end_cond = logs[key]['freeze_evaluate']['eval_arch']['conditional_training']['eval_train']['epochs'][last_cond_epoch_key]['val']['top1']
+ if val_end_cond < 0.6:
+     print('Found an arch which did not reach the conditional training threshold')
+     continue
# freeze evaluation
#--------------------
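
The new guard reads the final conditional-training epoch's validation top-1 and skips architectures that never crossed 0.6. A toy illustration of that lookup on the epochs dict shape the logs use (made-up numbers; real values come from log.yaml):

    epochs = {0: {'val': {'top1': 0.31}}, 1: {'val': {'top1': 0.55}}, 2: {'val': {'top1': 0.63}}}
    last_cond_epoch_key = list(epochs.keys())[-1]             # final conditional-training epoch
    print(epochs[last_cond_epoch_key]['val']['top1'] >= 0.6)  # True -> keep this arch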