Nominal implementation of darcyflow; the target decoder is currently missing in the training loop.

This commit is contained in:
Debadeepta Dey 2021-12-06 18:14:11 -08:00 committed by Gustavo Rosa
Parent e5d2c8afc6
Commit 0f6d842165
10 changed files with 308 additions and 3 deletions

8
.vscode/launch.json vendored
View File

@@ -222,6 +222,14 @@
"console": "integratedTerminal",
"args": ["--full", "--algos", "darts_space_constant_random_archs", "--datasets", "cifar100"]
},
{
"name": "Darts Space Constant Random Archs Darcyflow",
"type": "python",
"request": "launch",
"program": "${cwd}/scripts/main.py",
"console": "integratedTerminal",
"args": ["--full", "--algos", "darts_space_constant_random_archs_darcyflow", "--datasets", "darcyflow"]
},
{
"name": "Proxynas-Darts-Space-Full",
"type": "python",

View File

@@ -25,7 +25,7 @@ from archai.common.checkpoint import CheckPoint
from archai.common.ml_utils import set_optim_lr
from archai.datasets import data
-TFreezeTrainer = Optional[Type['ConditionalTrainer']]
+TConditionalTrainer = Optional[Type['ConditionalTrainer']]
class ConditionalTrainer(ArchTrainer, EnforceOverrides):

View File

@@ -0,0 +1,54 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from archai.nas.evaluater import EvalResult
from typing import Type
from copy import deepcopy
from overrides import overrides
from archai.common.config import Config
from archai.nas import nas_utils
from archai.nas.exp_runner import ExperimentRunner
from archai.nas.arch_trainer import ArchTrainer, TArchTrainer
from archai.nas.evaluater import Evaluater, EvalResult
from archai.common.common import get_expdir, logger
from archai.algos.random_sample_darts_space.random_model_desc_builder import RandomModelDescBuilder
from archai.algos.proxynas.freeze_manual_searcher import ManualFreezeSearcher
from archai.algos.random_sample_darts_space.darts_space_darcyflow_evaluater import DartsSpaceDarcyflowEvaluater
class DartsSpaceConstantRandomArchsDarcyflowExpRunner(ExperimentRunner):
    ''' Samples a reproducible random architecture from the
    DARTS search space and trains it. Specific to the DarcyFlow
    dataset, which is a 2D-input, dense-2D-output task '''
@overrides
def model_desc_builder(self)->RandomModelDescBuilder:
return RandomModelDescBuilder()
@overrides
def trainer_class(self)->TArchTrainer:
return None
@overrides
def searcher(self)->ManualFreezeSearcher:
return ManualFreezeSearcher() # no searcher basically
@overrides
def copy_search_to_eval(self)->None:
pass
@overrides
def run_eval(self, conf_eval:Config)->EvalResult:
# regular evaluation of the architecture
# this is expensive
# --------------------------------------
logger.pushd('regular_evaluate')
evaler = DartsSpaceDarcyflowEvaluater()
conf_eval_reg = deepcopy(conf_eval)
reg_eval_result = evaler.evaluate(conf_eval_reg, model_desc_builder=self.model_desc_builder())
logger.popd()
return reg_eval_result
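Note: this runner is registered in scripts/main.py (see the last file in this diff) and can be launched with the same arguments as the new launch.json entry above:

    python scripts/main.py --full --algos darts_space_constant_random_archs_darcyflow --datasets darcyflow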

View File

@@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from copy import deepcopy
from typing import Optional
import importlib
import sys
import string
import os
import torch
from torch import nn
from overrides import overrides, EnforceOverrides
from archai.common.trainer import Trainer
from archai.common.config import Config
from archai.common.common import logger
from archai.datasets import data
from archai.nas.model_desc import ModelDesc
from archai.nas.model_desc_builder import ModelDescBuilder
from archai.nas.nas_utils import create_nb301_genotype_from_desc
from archai.nas import nas_utils
from archai.common import ml_utils, utils
from archai.common.metrics import EpochMetrics, Metrics
from archai.nas.model import Model
from archai.common.checkpoint import CheckPoint
from archai.nas.evaluater import Evaluater
from archai.algos.proxynas.freeze_trainer import FreezeTrainer
from archai.algos.proxynas.conditional_trainer import ConditionalTrainer
from archai.algos.random_sample_darts_space.constant_darts_space_sampler import ConstantDartsSpaceSampler
from archai.algos.random_sample_darts_space.random_model_desc_builder import RandomModelDescBuilder
from archai.algos.random_sample_darts_space.darts_space_evaluater import DartsSpaceEvaluater
from archai.nas.darcyflow_trainer import DarcyflowTrainer
class DartsSpaceDarcyflowEvaluater(DartsSpaceEvaluater):
@overrides
def train_model(self, conf_train:Config, model:nn.Module,
checkpoint:Optional[CheckPoint])->Metrics:
conf_loader = conf_train['loader']
conf_train = conf_train['trainer']
# only darcyflow works with this evaluater
        if conf_loader['dataset']['name'] != 'darcyflow':
            raise TypeError('DartsSpaceDarcyflowEvaluater only supports the darcyflow dataset')
# get data
data_loaders = self.get_data(conf_loader)
# the trainer class is the only difference
trainer = DarcyflowTrainer(conf_train, model, checkpoint)
train_metrics = trainer.fit(data_loaders)
return train_metrics

View File

@@ -101,4 +101,53 @@ class LabelSmoothing(nn.Module):
nll_loss = nll_loss.squeeze(1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
# Credits: https://github.com/rtu715/NAS-Bench-360/blob/d075006848c664371855c34082b0a00cda62be67/darts/gaea-dense/utils.py#L126
class LpLoss(object):
    ''' Loss function with relative/absolute Lp loss '''
def __init__(self, d=2, p=2, size_average=True, reduction=True):
super(LpLoss, self).__init__()
        # Dimension and Lp-norm type are positive
assert d > 0 and p > 0
self.d = d
self.p = p
self.reduction = reduction
self.size_average = size_average
def abs(self, x, y):
num_examples = x.size()[0]
#Assume uniform mesh
h = 1.0 / (x.size()[1] - 1.0)
all_norms = (h**(self.d/self.p))*torch.norm(x.view(num_examples,-1) - y.view(num_examples,-1), self.p, 1)
if self.reduction:
if self.size_average:
return torch.mean(all_norms)
else:
return torch.sum(all_norms)
return all_norms
def rel(self, x, y):
num_examples = x.size()[0]
diff_norms = torch.norm(x.reshape(num_examples,-1) - y.reshape(num_examples,-1), self.p, 1)
y_norms = torch.norm(y.reshape(num_examples,-1), self.p, 1)
if self.reduction:
if self.size_average:
return torch.mean(diff_norms/y_norms)
else:
return torch.sum(diff_norms/y_norms)
return diff_norms/y_norms
def __call__(self, x, y):
return self.rel(x, y)
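For reference, a minimal sanity check of LpLoss on random tensors; the shapes are illustrative (85x85 matches the dataset comment later in this diff) and not part of the commit:

    import torch

    lossfn = LpLoss(d=2, p=2)            # __call__ dispatches to rel()
    pred = torch.rand(4, 85, 85)         # batch of dense 2D predictions
    target = torch.rand(4, 85, 85)
    rel_loss = lossfn(pred, target)      # mean relative Lp loss over the batch
    abs_loss = lossfn.abs(pred, target)  # absolute Lp loss scaled by mesh width h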

View File

@@ -167,6 +167,7 @@ class DarcyflowProvider(DatasetProvider):
def __init__(self, conf_dataset:Config):
super().__init__(conf_dataset)
self._dataroot = utils.full_path(conf_dataset['dataroot'])
self._sub = conf_dataset['sub']
@overrides
def get_datasets(self, load_train:bool, load_test:bool,
@@ -176,7 +177,7 @@ class DarcyflowProvider(DatasetProvider):
path_to_data = os.path.join(self._dataroot, 'darcyflow')
# load the dataset but without any validation split
-        trainset, testset = load_darcyflow(path_to_data)
+        trainset, testset = load_darcyflow(path_to_data, self._sub)
return trainset, testset

View File

@@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Callable, Tuple, Optional, Type
import torch
from torch import nn, Tensor
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from overrides import EnforceOverrides
from archai.common.metrics import Metrics
from archai.common.tester import Tester
from archai.common.config import Config
from archai.common import utils, ml_utils
from archai.common.common import logger
from archai.datasets import data
from archai.common.checkpoint import CheckPoint
from archai.common.apex_utils import ApexUtils
from archai.common.multi_optim import MultiOptim, OptimSched
from archai.nas.nas_utils import get_model_stats
from archai.nas.arch_trainer import ArchTrainer
from archai.datasets.providers.darcyflow_provider import UnitGaussianNormalizer
TDarcyflowTrainer = Optional[Type['DarcyflowTrainer']]
class DarcyflowTrainer(ArchTrainer, EnforceOverrides):
def __init__(self, conf_train: Config, model: nn.Module,
checkpoint:Optional[CheckPoint]) -> None:
super().__init__(conf_train, model, checkpoint)
# region config vars specific to Darcyflow trainer
# endregion
def _train_epoch(self, train_dl: DataLoader)->None:
steps = len(train_dl)
self.model.train()
logger.pushd('steps')
for step, (x, y) in enumerate(train_dl):
logger.pushd(step)
assert self.model.training # derived class might alter the mode
# TODO: please check that no algorithm is invalidated by swapping prestep with zero grad
self._multi_optim.zero_grad()
self.pre_step(x, y)
            # divide batch into chunks if needed so it fits in GPU RAM
if self.batch_chunks > 1:
x_chunks, y_chunks = torch.chunk(x, self.batch_chunks), torch.chunk(y, self.batch_chunks)
else:
x_chunks, y_chunks = (x,), (y,)
logits_chunks = []
loss_sum, loss_count = 0.0, 0
for xc, yc in zip(x_chunks, y_chunks):
xc, yc = xc.to(self.get_device(), non_blocking=True), yc.to(self.get_device(), non_blocking=True)
logits_c, aux_logits = self.model(xc), None
                tupled_out = isinstance(logits_c, tuple) and len(logits_c) >= 2
if tupled_out: # then we are using model created by desc
logits_c, aux_logits = logits_c[0], logits_c[1]
                # darcyflow-specific: drop the singleton channel dim from the dense 2D output
                logits_c = logits_c.squeeze()
                # WARNING, DEBUG: making the code run through for now;
                # the decoding of the normalized y targets is still missing here
loss_c = self.compute_loss(self._lossfn, yc, logits_c,
self._aux_weight, aux_logits)
self._apex.backward(loss_c, self._multi_optim)
loss_sum += loss_c.item() * len(logits_c)
loss_count += len(logits_c)
logits_chunks.append(logits_c.detach().cpu())
# TODO: original darts clips alphas as well but pt.darts doesn't
self._apex.clip_grad(self._grad_clip, self.model, self._multi_optim)
self._multi_optim.step()
            # TODO: we possibly need to sync so all replicas are up to date
self._apex.sync_devices()
self.post_step(x, y,
ml_utils.join_chunks(logits_chunks),
torch.tensor(loss_sum/loss_count),
steps)
logger.popd()
# end of step
self._multi_optim.epoch()
logger.popd()
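As the commit message says, the target decoder is still missing: UnitGaussianNormalizer is imported above but never used, so the loss is computed against normalized targets. A rough sketch of the intended decoding step inside the chunk loop, assuming a y_normalizer (a UnitGaussianNormalizer fitted on the training targets; hypothetical, not wired up in this commit):

    # hypothetical: decode outputs and targets back to physical scale before
    # the loss; y_normalizer would have to come from the darcyflow provider
    out_c = y_normalizer.decode(logits_c.squeeze())
    yc_dec = y_normalizer.decode(yc)
    loss_c = self.compute_loss(self._lossfn, yc_dec, out_c,
                               self._aux_weight, aux_logits)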

View File

@@ -0,0 +1,23 @@
__include__: 'darts.yaml' # just use darts defaults
nas:
search:
model_desc:
num_edges_to_sample: 2 # number of edges each node will take input from
eval:
dartsspace:
arch_index: 66
model_desc:
      aux_weight: False # the AuxTower class assumes a specific input size and hence breaks on many datasets
num_edges_to_sample: 2
n_cells: 8
loader:
aug: ''
cutout: -1 # cutout length, use cutout augmentation when > 0
val_ratio: 0.0
train_batch: 96
trainer:
use_val: False
plotsdir: ''
epochs: 100

View File

@@ -0,0 +1,9 @@
__include__: './dataroot.yaml' # default dataset settings are for cifar
dataset:
name: 'darcyflow'
  n_classes: 1 # not a classification task, but a convenient hook: e.g. a 3x85x85 input maps to a 1x85x85 output. (Do we need this?)
channels: 3 # number of channels in image
max_batches: -1 # if >= 0 then only these many batches are generated (useful for debugging)
  sub: 5 # sub-sampling parameter for the grid used to process the data
storage_name: 'darcyflow' # name of folder or tar file to copy from cloud storage
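The sub value acts as a sub-sampling stride over the raw Darcy flow solution grids; a hedged sketch of what load_darcyflow likely does with it (the actual implementation is outside this diff; 421x421 raw grids with sub=5 would give the 85x85 size mentioned above):

    # illustrative only: stride the raw grid, keeping s points per side
    s = (421 - 1) // sub + 1   # e.g. (421 - 1) // 5 + 1 = 85
    x = x_raw[:, ::sub, ::sub][:, :s, :s]
    y = y_raw[:, ::sub, ::sub][:, :s, :s]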

View File

@@ -15,6 +15,7 @@ from archai.algos.gumbelsoftmax.gs_exp_runner import GsExperimentRunner
from archai.algos.divnas.divnas_exp_runner import DivnasExperimentRunner
from archai.algos.didarts.didarts_exp_runner import DiDartsExperimentRunner
from archai.algos.random_sample_darts_space.darts_space_constant_random_archs_exp_runner import DartsSpaceConstantRandomArchsExperimentRunner
from archai.algos.random_sample_darts_space.darts_space_constant_random_archs_darcyflow_exp_runner import DartsSpaceConstantRandomArchsDarcyflowExpRunner
from archai.algos.proxynas.freeze_darts_space_experiment_runner import FreezeDartsSpaceExperimentRunner
from archai.algos.proxynas.freeze_natsbench_experiment_runner import FreezeNatsbenchExperimentRunner
from archai.algos.proxynas.freeze_natsbench_sss_experiment_runner import FreezeNatsbenchSSSExperimentRunner
@@ -52,6 +53,7 @@ def main():
'divnas': DivnasExperimentRunner,
'didarts': DiDartsExperimentRunner,
'darts_space_constant_random_archs': DartsSpaceConstantRandomArchsExperimentRunner,
'darts_space_constant_random_archs_darcyflow': DartsSpaceConstantRandomArchsDarcyflowExpRunner,
'proxynas_darts_space': FreezeDartsSpaceExperimentRunner,
'proxynas_natsbench_space': FreezeNatsbenchExperimentRunner,
'proxynas_natsbench_sss_space': FreezeNatsbenchSSSExperimentRunner,