Cleaned up divnas conf. Added more debug information logging to divnas finalizers.

This commit is contained in:
Debadeepta Dey 2020-05-18 14:52:15 -07:00 committed by Gustavo Rosa
Parent 0f5270c941
Commit 982a5ad9b8
3 changed files: 29 additions and 284 deletions

View file

@@ -5,8 +5,12 @@ import torch
from torch import nn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from archai.common.common import get_conf
from archai.common.common import get_expdir
from archai.common.common import logger
from archai.datasets.data import get_data
from archai.nas.model import Model
@@ -25,7 +29,6 @@ class DivnasFinalizers(Finalizers):
logger.pushd('finalize')
# get config and train data loader
# TODO: confirm this is correct in case you get silent bugs
conf = get_conf()
conf_loader = conf['nas']['search']['loader']
train_dl, val_dl, test_dl = get_data(conf_loader)
@@ -53,8 +56,7 @@ class DivnasFinalizers(Finalizers):
for _ in range(1):
for _, (x, _) in enumerate(train_dl):
_, _ = model(x), None
# now you can go through and update the
# node covariances in every cell
#update the node covariances in all cells
for dcell in self._divnas_cells.values():
dcell.update_covs()
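The hunk above drives one pass of the search model over the training loader purely for its side effects and then refreshes each cell's covariance statistics. A minimal sketch of that pattern, assuming only that model is an nn.Module whose ops record activations via hooks and that train_dl yields (x, y) batches; the eval()/no_grad() calls are assumptions added here, not part of the diff:

import torch
from torch import nn

def collect_activation_stats(model: nn.Module, train_dl, n_passes: int = 1) -> None:
    # Outputs are discarded; forward hooks registered on the ops are expected
    # to accumulate the activation statistics as a side effect.
    model.eval()           # assumption: statistics are collected in eval mode
    with torch.no_grad():  # assumption: gradients are not needed for collection
        for _ in range(n_passes):
            for x, _ in train_dl:
                model(x)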
@@ -66,12 +68,14 @@ class DivnasFinalizers(Finalizers):
@overrides
def finalize_cell(self, cell:Cell, *args, **kwargs)->CellDesc:
# first finalize each node; we will need to recreate the node desc with the final version
logger.info(f'cell id {cell.desc.id}')
node_descs:List[NodeDesc] = []
dcell = self._divnas_cells[id(cell)]
assert len(cell.dag) == len(list(dcell.node_covs.values()))
for node in cell.dag:
for i, node in enumerate(cell.dag):
node_cov = dcell.node_covs[id(node)]
node_desc = self.finalize_node(node, cell.desc.max_final_edges, node_cov)
logger.info(f'node {i}')
node_desc = self.finalize_node(node, cell.desc.max_final_edges, node_cov, cell.desc.id, i)
node_descs.append(node_desc)
# (optional) clear out all activation collection information
@@ -92,7 +96,7 @@ class DivnasFinalizers(Finalizers):
@overrides
def finalize_node(self, node:nn.ModuleList, max_final_edges:int, cov:np.array, *args, **kwargs)->NodeDesc:
def finalize_node(self, node:nn.ModuleList, max_final_edges:int, cov:np.array, cell_id, node_id, *args, **kwargs)->NodeDesc:
# node is a list of edges
assert len(node) >= max_final_edges
@@ -125,10 +129,17 @@ class DivnasFinalizers(Finalizers):
for ind in max_subset:
edge_ind, op_ind = edge_num_and_op_ind[ind]
op_desc = node[edge_ind]._op.get_valid_op_desc(op_ind)
logger.info(f'selected edge: {edge_ind}, op: {op_desc.name}')
new_edge = EdgeDesc(op_desc, node[edge_ind].input_ids)
selected_edges.append(new_edge)
# for edge in selected_edges:
# self.finalize_edge(edge)
# save diagnostic information to disk
expdir = get_expdir()
sns.heatmap(cov, annot=True, fmt='.1g', cmap='coolwarm')
savename = os.path.join(expdir, f'cell_{cell_id}_node_{node_id}_cov.png')
plt.savefig(savename)
logger.info('')
return NodeDesc(selected_edges)
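For reference, a standalone sketch of the heatmap diagnostic added above, using a random matrix as a stand-in for the node covariance. The Agg backend and the plt.clf() call are assumptions added here so repeated saves do not overlay; they are not part of the diff:

import os
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend so this runs without a display (assumption)
import matplotlib.pyplot as plt
import seaborn as sns

def save_cov_heatmap(cov: np.ndarray, expdir: str, cell_id: int, node_id: int) -> str:
    # Same plotting call and file naming scheme as in the diff above.
    sns.heatmap(cov, annot=True, fmt='.1g', cmap='coolwarm')
    savename = os.path.join(expdir, f'cell_{cell_id}_node_{node_id}_cov.png')
    plt.savefig(savename)
    plt.clf()  # clear the figure between nodes (not in the diff)
    return savename

if __name__ == '__main__':
    fake_cov = np.cov(np.random.randn(8, 100))  # 8x8 stand-in covariance
    print(save_cov_heatmap(fake_cov, '.', cell_id=0, node_id=0))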

View file

@@ -129,6 +129,13 @@ class DivOp(Op):
def ops(self)->Iterator['Op']:
return iter(self._ops) # type: ignore
def get_valid_op_desc(self, index:int)->OpDesc:
''' index: index in the valid index list '''
assert index < self.num_valid_div_ops
orig_index = self._valid_to_orig[index]
desc, _ = self._ops[orig_index].finalize()
return desc
@overrides
def can_drop_path(self) -> bool:
return False
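get_valid_op_desc above maps an index into the list of "valid" ops back to an index into the full op list via _valid_to_orig. A toy sketch of that mapping (this is not archai's DivOp; the op names and the exclusion of 'none' are illustrative assumptions):

from typing import List

class ToyDivOp:
    def __init__(self, op_names: List[str], excluded: List[str]) -> None:
        self._op_names = op_names
        # positions of the ops that count as "valid" for diversity selection
        self._valid_to_orig = [i for i, name in enumerate(op_names) if name not in excluded]

    @property
    def num_valid_div_ops(self) -> int:
        return len(self._valid_to_orig)

    def get_valid_op_name(self, index: int) -> str:
        # index is a position in the valid list, not in the full op list
        assert 0 <= index < self.num_valid_div_ops
        return self._op_names[self._valid_to_orig[index]]

op = ToyDivOp(['none', 'skip_connect', 'sep_conv_3x3'], excluded=['none'])
assert op.get_valid_op_name(0) == 'skip_connect'
assert op.get_valid_op_name(1) == 'sep_conv_3x3'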

View file

@@ -1,285 +1,12 @@
__include__: "../datasets/cifar10.yaml" # default dataset settings are for cifar
# TODO: dey: inherit from darts instead of copy paste, so it's easy to see what changed
common:
experiment_name: 'throwaway' # you should supply from command line
experiment_desc: 'throwaway'
logdir: '~/logdir'
seed: 2.0
tb_enable: False # if True then TensorBoard logging is enabled (may impact perf)
tb_dir: '$expdir/tb' # path where tensorboard logs would be stored
checkpoint:
filename: '$expdir/checkpoint.pth'
freq: 10
toy_mode: # this section will be used by toy.yaml to setup the toy mode
max_batches: 4
train_batch: 32
test_batch: 64
seed_train_epochs: 0 # number of epochs in toy mode for seed model training
post_train_epochs: 0 # number of epochs for model generated by search in toy mode
# TODO: workers setting
# redis address of Ray cluster. Use None for single node run
# otherwise it should be something like host:6379. Make sure to run on head node:
# "ray start --head --redis-port=6379"
redis: null
apex: # this is overridden in search and eval individually
enabled: False # global switch to disable everything apex
distributed_enabled: True # enable/disable distributed mode
mixed_prec_enabled: True # switch to disable amp mixed precision
gpus: '' # use GPU IDs specified here (comma separated), if '' then use all GPUs
opt_level: 'O2' # optimization level for mixed precision
bn_fp32: True # keep BN in fp32
loss_scale: "dynamic" # loss scaling mode for mixed prec, must be a string representing a float or "dynamic"
sync_bn: False # whether to replace BNs with sync BNs for the distributed model
scale_lr: True # enable/disable scaling of the learning rate in distributed mode
min_world_size: 0 # allows confirming we are indeed in a distributed setting
detect_anomaly: False # if True, PyTorch code will run 6X slower
seed: '_copy: /common/seed'
smoke_test: False
only_eval: False
resume: True
dataset: {} # default dataset settings comes from __include__ on the top
__include__: "darts.yaml" # just use darts defaults
nas:
eval:
full_desc_filename: '$expdir/full_model_desc.yaml' # model desc used for building model for evaluation
final_desc_filename: '$expdir/final_model_desc.yaml' # model desc used as template to construct cells
# If below is specified then final_desc_filename is ignored and model is created through factory function instead.
# This is useful for running eval for manually designed models such as resnet-50.
# The value is string of form 'some.namespace.module.function'. The function returns nn.Module and no required args.
final_model_factory: ''
metric_filename: '$expdir/eval_train_metrics.yaml'
model_filename: '$expdir/model.pt' # file to which trained model will be saved
data_parallel: False
checkpoint:
_copy: '/common/checkpoint'
resume: '_copy: /common/resume'
model_desc:
dataset:
_copy: '/dataset'
max_final_edges: 2 # max edges that can be in the final arch per node
cell_post_op: 'concate_channels'
model_stem0_op: 'stem_conv3x3'
model_stem1_op: 'stem_conv3x3'
model_post_op: 'pool_adaptive_avg2d'
aux_tower_stride: 3 # stride that aux tower should use, 3 is good for 32x32 images, 2 for imagenet
stem_multiplier: 3 # output channels multiplier for the stem
params: {}
n_nodes: 4 # number of nodes in a cell
n_reductions: 2 # number of reductions to be applied
init_node_ch: 36 # num of input/output channels for nodes in 1st cell
n_cells: 20 # number of cells
aux_weight: 0.4 # weight for loss from auxiliary towers in test time arch
loader:
apex:
_copy: '../../trainer/apex'
aug: '' # additional augmentations to use
cutout: 16 # cutout length, use cutout augmentation when > 0
load_train: True # load train split of dataset
train_batch: 96
train_workers: 4
test_workers: '_copy: ../train_workers' # if null then 4
load_test: True # load test split of dataset
test_batch: 1024
val_ratio: 0.0 #split portion for test set, 0 to 1
val_fold: 0 #Fold number to use (0 to 4)
cv_num: 5 # total number of folds available
dataset:
_copy: '/dataset'
trainer:
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
drop_path_prob: 0.2 # probability that given edge will be dropped
grad_clip: 5.0 # grads above this value are clipped
l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
logger_freq: 1000 # after every N updates dump loss and other metrics in logger
title: 'eval_train'
epochs: 600
batch_chunks: 1 # split batch into this many chunks and accumulate gradients so we can support GPUs with less RAM
lossfn:
type: 'CrossEntropyLoss'
optimizer:
type: 'sgd'
lr: 0.025 # init learning rate
decay: 3.0e-4 # pytorch default is 0.0
momentum: 0.9 # pytorch default is 0.0
nesterov: False # pytorch default is False
decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
lr_schedule:
type: 'cosine'
min_lr: 0.001 # min learning rate to be set in eta_min param of scheduler
warmup: # increases LR from 0 to current over the specified epochs and then hands over to the main scheduler
multiplier: 1
epochs: 0 # 0 disables warmup
validation:
title: 'eval_test'
batch_chunks: '_copy: ../../batch_chunks' # split batch into this many chunks and accumulate gradients so we can support GPUs with less RAM
logger_freq: 0
freq: 1 # perform validation only every N epochs
lossfn:
type: 'CrossEntropyLoss'
search:
finalizer: 'random' # options are mutual information based 'mi' or 'random' or 'default'. NOTE: 'default' is not compatible with 'noalpha' trainer as 'default' uses the darts finalizer and needs alphas
finalizer: 'mi' # options are mutual information based 'mi' or 'random' or 'default'. NOTE: 'default' is not compatible with 'noalpha' trainer as 'default' uses the darts finalizer and needs alphas
divnas:
sigma: 168
archtrainer: 'bilevel' # options are 'bilevel', 'noalpha'
data_parallel: False
checkpoint:
_copy: '/common/checkpoint'
resume: '_copy: /common/resume'
search_iters: 1
collect_activations: True
full_desc_filename: '$expdir/full_model_desc.yaml' # arch before it was finalized
final_desc_filename: '$expdir/final_model_desc.yaml' # final arch is saved in this file
metrics_dir: '$expdir/models/{reductions}/{cells}/{nodes}/{search_iter}' # where metrics and model stats would be saved from each pareto iteration
seed_train:
trainer:
_copy: '/nas/eval/trainer'
title: 'seed_train'
epochs: 0 # number of epochs model will be trained before search
aux_weight: 0.0
drop_path_prob: 0.0
loader:
_copy: '/nas/eval/loader'
train_batch: 128
val_ratio: 0.1 #split portion for test set, 0 to 1
post_train:
trainer:
_copy: '/nas/eval/trainer'
title: 'post_train'
epochs: 0 # number of epochs model will be trained after search
aux_weight: 0.0
drop_path_prob: 0.0
loader:
_copy: '/nas/eval/loader'
train_batch: 128
val_ratio: 0.1 #split portion for test set, 0 to 1
pareto:
# default parameters are set so there is exactly one search iteration
max_cells: 8
max_reductions: 2
max_nodes: 4
enabled: False
summary_filename: '$expdir/perito.tsv' # for each iteration of macro, we save model and perf summary
model_desc:
# we avoid copying from eval node because dataset settings
# may override eval.model_desc with different stems, pool etc
dataset:
_copy: '/dataset'
max_final_edges: 2 # max edges that can be in the final arch per node
cell_post_op: 'concate_channels'
model_stem0_op: 'stem_conv3x3'
model_stem1_op: 'stem_conv3x3'
model_post_op: 'pool_adaptive_avg2d'
aux_tower_stride: 3 # stride that aux tower should use, 3 is good for 32x32 images, 2 for imagenet
stem_multiplier: 3 # output channels multiplier for the stem
params: {}
n_nodes: 4 # number of nodes in a cell
n_reductions: 2 # number of reductions to be applied
init_node_ch: 16 # num of input/output channels for nodes in 1st cell
n_cells: 8 # number of cells
aux_weight: 0.0 # weight for loss from auxiliary towers in test time arch
loader:
apex:
_copy: '../../trainer/apex'
aug: '' # additional augmentations to use
cutout: 0 # cutout length, use cutout augmentation when > 0
load_train: True # load train split of dataset
train_batch: 64
train_workers: 4 # if null then gpu_count*4
test_workers: '_copy: ../train_workers' # if null then 4
load_test: False # load test split of dataset
test_batch: 1024
val_ratio: 0.5 #split portion for test set, 0 to 1
val_fold: 0 #Fold number to use (0 to 4)
cv_num: 5 # total number of folds available
dataset:
_copy: '/dataset'
trainer:
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/search/model_desc/aux_weight'
drop_path_prob: 0.0 # probability that given edge will be dropped
grad_clip: 5.0 # grads above this value are clipped
logger_freq: 1000 # after every N updates dump loss and other metrics in logger
title: 'arch_train'
epochs: 50
batch_chunks: 1 # split batch into this many chunks and accumulate gradients so we can support GPUs with less RAM
# additional vals for the derived class
plotsdir: '' # empty string means no plots, otherwise plots are generated for each epoch in this dir
l1_alphas: 0.0 # weight to be applied to sum(abs(alphas)) to loss term
lossfn:
type: 'CrossEntropyLoss'
optimizer:
type: 'sgd'
lr: 0.025 # init learning rate
decay: 3.0e-4
momentum: 0.9 # pytorch default is 0
nesterov: False
decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
alpha_optimizer:
type: 'adam'
lr: 3.0e-4
decay: 1.0e-3
betas: [0.5, 0.999]
decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
lr_schedule:
type: 'cosine'
min_lr: 0.001 # min learning rate, this will be used in eta_min param of scheduler
warmup: null
validation:
title: 'search_val'
logger_freq: 0
batch_chunks: '_copy: ../../batch_chunks' # split batch into this many chunks and accumulate gradients so we can support GPUs with less RAM
freq: 1 # perform validation only every N epochs
lossfn:
type: 'CrossEntropyLoss'
epochs: 1
autoaug:
num_op: 2
num_policy: 5
num_search: 200
num_result_per_cv: 10 # after conducting N trials, we will choose the top num_result_per_cv results
loader:
apex:
_copy: '/common/apex'
aug: '' # additional augmentations to use
cutout: 16 # cutout length, use cutout augmentation when > 0
epochs: 50
load_train: True # load train split of dataset
train_batch: 64
train_workers: 4 # if null then gpu_count*4
test_workers: '_copy: ../train_workers' # if null then 4
load_test: True # load test split of dataset
test_batch: 1024
val_ratio: 0.4 #split portion for test set, 0 to 1
val_fold: 0 #Fold number to use (0 to 4)
cv_num: 5 # total number of folds available
dataset:
_copy: '/dataset'
optimizer:
type: 'sgd'
lr: 0.025 # init learning rate
decay: 3.0e-4 # pytorch default is 0.0
momentum: 0.9 # pytorch default is 0.0
nesterov: False # pytorch default is False
clip: 5.0 # grads above this value are clipped # TODO: Why is this also in trainer?
decay_bn: .NaN # if NaN then same as decay otherwise apply different decay to BN layers
#betas: [0.9, 0.999] # PyTorch default betas for Adam
lr_schedule:
type: 'cosine'
min_lr: 0.0 # min learning rate, this will be used in eta_min param of scheduler
warmup: null
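The config above leans on archai's own loader conventions: __include__ to pull in darts.yaml, $expdir expansion, and _copy references such as seed: '_copy: /common/seed'. Plain yaml.safe_load leaves all of those as literal strings. Below is a toy sketch of resolving only the absolute, string-valued _copy form; relative paths like '../train_workers', the _copy mapping-key form, and __include__ merging are omitted, and this is not archai's implementation:

import yaml

def resolve_abs_copies(node, root):
    # Recursively replace string values of the form "_copy: /a/b" with the
    # value found at that absolute path in the root mapping.
    if isinstance(node, dict):
        return {k: resolve_abs_copies(v, root) for k, v in node.items()}
    if isinstance(node, list):
        return [resolve_abs_copies(v, root) for v in node]
    if isinstance(node, str) and node.startswith('_copy: /'):
        target = root
        for part in node[len('_copy: /'):].split('/'):
            target = target[part]
        return target
    return node

doc = yaml.safe_load("""
common:
  seed: 2.0
  resume: True
nas:
  eval:
    seed: '_copy: /common/seed'
    resume: '_copy: /common/resume'
""")
resolved = resolve_abs_copies(doc, doc)
assert resolved['nas']['eval']['seed'] == 2.0
assert resolved['nas']['eval']['resume'] is True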