diff --git a/nni/algorithms/nas/__init__.py b/nni/algorithms/nas/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/pytorch/__init__.py b/nni/algorithms/nas/pytorch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/pytorch/cdarts/__init__.py b/nni/algorithms/nas/pytorch/cdarts/__init__.py deleted file mode 100644 index ab34902e0..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator -from .trainer import CdartsTrainer diff --git a/nni/algorithms/nas/pytorch/cdarts/mutator.py b/nni/algorithms/nas/pytorch/cdarts/mutator.py deleted file mode 100644 index a0bf79040..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/mutator.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch - -from apex.parallel import DistributedDataParallel # pylint: disable=import-error -from nni.algorithms.nas.pytorch.darts import DartsMutator # pylint: disable=wrong-import-order -from nni.nas.pytorch.mutables import LayerChoice # pylint: disable=wrong-import-order -from nni.nas.pytorch.mutator import Mutator # pylint: disable=wrong-import-order - - -class RegularizedDartsMutator(DartsMutator): - """ - This is :class:`~nni.algorithms.nas.pytorch.darts.DartsMutator` basically, with two differences. - - 1. Choices can be cut (bypassed). This is done by ``cut_choices``. Cutted choices will not be used in - forward pass and thus consumes no memory. - - 2. Regularization on choices, to prevent the mutator from overfitting on some choices. - """ - - def reset(self): - """ - Warnings - -------- - Renamed :func:`~reset_with_loss` to return regularization loss on reset. - """ - raise ValueError("You should probably call `reset_with_loss`.") - - def cut_choices(self, cut_num=2): - """ - Cut the choices with the smallest weights. - ``cut_num`` should be the accumulative number of cutting, e.g., if first time cutting - is 2, the second time should be 4 to cut another two. - - Parameters - ---------- - cut_num : int - Number of choices to cut, so far. - - Warnings - -------- - Though the parameters are set to :math:`-\infty` to be bypassed, they will still receive gradient of 0, - which introduced ``nan`` problem when calling ``optimizer.step()``. To solve this issue, a simple way is to - reset nan to :math:`-\infty` each time after the parameters are updated. - """ - # `cut_choices` is implemented but not used in current implementation of CdartsTrainer - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - _, idx = torch.topk(-self.choices[mutable.key], cut_num) - with torch.no_grad(): - for i in idx: - self.choices[mutable.key][i] = -float("inf") - - def reset_with_loss(self): - """ - Resample and return loss. If loss is 0, to avoid device issue, it will return ``None``. - - Currently loss penalty are proportional to the L1-norm of parameters corresponding - to modules if their type name contains certain substrings. These substrings include: ``poolwithoutbn``, - ``identity``, ``dilconv``. 
- """ - self._cache, reg_loss = self.sample_search() - return reg_loss - - def sample_search(self): - result = super().sample_search() - loss = [] - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - def need_reg(choice): - return any(t in str(type(choice)).lower() for t in ["poolwithoutbn", "identity", "dilconv"]) - - for i, choice in enumerate(mutable.choices): - if need_reg(choice): - norm = torch.abs(self.choices[mutable.key][i]) - if norm < 1E10: - loss.append(norm) - if not loss: - return result, None - return result, sum(loss) - - def export(self, logger=None): - """ - Export an architecture with logger. Genotype will be printed with logger. - - Returns - ------- - dict - A mapping from mutable keys to decisions. - """ - result = self.sample_final() - if hasattr(self.model, "plot_genotype") and logger is not None: - genotypes = self.model.plot_genotype(result, logger) - return result, genotypes - - -class RegularizedMutatorParallel(DistributedDataParallel): - """ - Parallelize :class:`~RegularizedDartsMutator`. - - This makes :func:`~RegularizedDartsMutator.reset_with_loss` method parallelized, - also allowing :func:`~RegularizedDartsMutator.cut_choices` and :func:`~RegularizedDartsMutator.export` - to be easily accessible. - """ - def reset_with_loss(self): - """ - Parallelized :func:`~RegularizedDartsMutator.reset_with_loss`. - """ - result = self.module.reset_with_loss() - self.callback_queued = False - return result - - def cut_choices(self, *args, **kwargs): - """ - Parallelized :func:`~RegularizedDartsMutator.cut_choices`. - """ - self.module.cut_choices(*args, **kwargs) - - def export(self, logger): - """ - Parallelized :func:`~RegularizedDartsMutator.export`. - """ - return self.module.export(logger) - - -class DartsDiscreteMutator(Mutator): - """ - A mutator that applies the final sampling result of a parent mutator on another model to train. - - Parameters - ---------- - model : nn.Module - The model to apply the mutator. - parent_mutator : nni.nas.pytorch.mutator.Mutator - The mutator that provides ``sample_final`` method, that will be called to get the architecture. - """ - def __init__(self, model, parent_mutator): - super().__init__(model) - self.__dict__["parent_mutator"] = parent_mutator # avoid parameters to be included - - def sample_search(self): - return self.parent_mutator.sample_final() diff --git a/nni/algorithms/nas/pytorch/cdarts/trainer.py b/nni/algorithms/nas/pytorch/cdarts/trainer.py deleted file mode 100644 index 1a5174216..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/trainer.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import json -import logging -import os - -import torch -import torch.nn as nn -import torch.nn.functional as F -import apex # pylint: disable=import-error -from apex.parallel import DistributedDataParallel # pylint: disable=import-error -from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator # pylint: disable=wrong-import-order -from nni.nas.pytorch.utils import AverageMeterGroup # pylint: disable=wrong-import-order - -from .utils import CyclicIterator, TorchTensorEncoder, accuracy, reduce_metrics - -PHASE_SMALL = "small" -PHASE_LARGE = "large" - - -class InteractiveKLLoss(nn.Module): - def __init__(self, temperature): - super().__init__() - self.temperature = temperature - # self.kl_loss = nn.KLDivLoss(reduction = 'batchmean') - self.kl_loss = nn.KLDivLoss() - - def forward(self, student, teacher): - return self.kl_loss(F.log_softmax(student / self.temperature, dim=1), - F.softmax(teacher / self.temperature, dim=1)) - - -class CdartsTrainer(object): - """ - CDARTS trainer. - - Parameters - ---------- - model_small : nn.Module - PyTorch model to be trained. This is the search network of CDARTS. - model_large : nn.Module - PyTorch model to be trained. This is the evaluation network of CDARTS. - criterion : callable - Receives logits and ground truth label, return a loss tensor, e.g., ``nn.CrossEntropyLoss()``. - loaders : list of torch.utils.data.DataLoader - List of train data and valid data loaders, for training weights and architecture weights respectively. - samplers : list of torch.utils.data.Sampler - List of train data and valid data samplers. This can be PyTorch standard samplers if not distributed. - In distributed mode, sampler needs to have ``set_epoch`` method. Refer to data utils in CDARTS example for details. - logger : logging.Logger - The logger for logging. Will use nni logger by default (if logger is ``None``). - regular_coeff : float - The coefficient of regular loss. - regular_ratio : float - The ratio of regular loss. - warmup_epochs : int - The epochs to warmup the search network - fix_head : bool - ``True`` if fixing the paramters of auxiliary heads, else unfix the paramters of auxiliary heads. - epochs : int - Number of epochs planned for training. - steps_per_epoch : int - Steps of one epoch. - loss_alpha : float - The loss coefficient. - loss_T : float - The loss coefficient. - distributed : bool - ``True`` if using distributed training, else non-distributed training. - log_frequency : int - Step count per logging. - grad_clip : float - Gradient clipping for weights. - interactive_type : string - ``kl`` or ``smoothl1``. - output_path : string - Log storage path. - w_lr : float - Learning rate of the search network parameters. - w_momentum : float - Momentum of the search and the evaluation network. - w_weight_decay : float - The weight decay the search and the evaluation network parameters. - alpha_lr : float - Learning rate of the architecture parameters. - alpha_weight_decay : float - The weight decay the architecture parameters. - nasnet_lr : float - Learning rate of the evaluation network parameters. - local_rank : int - The number of thread. - share_module : bool - ``True`` if sharing the stem and auxiliary heads, else not sharing these modules. 
- """ - def __init__(self, model_small, model_large, criterion, loaders, samplers, logger=None, - regular_coeff=5, regular_ratio=0.2, warmup_epochs=2, fix_head=True, - epochs=32, steps_per_epoch=None, loss_alpha=2, loss_T=2, distributed=True, - log_frequency=10, grad_clip=5.0, interactive_type='kl', output_path='./outputs', - w_lr=0.2, w_momentum=0.9, w_weight_decay=3e-4, alpha_lr=0.2, alpha_weight_decay=1e-4, - nasnet_lr=0.2, local_rank=0, share_module=True): - if logger is None: - logger = logging.getLogger(__name__) - train_loader, valid_loader = loaders - train_sampler, valid_sampler = samplers - self.train_loader = CyclicIterator(train_loader, train_sampler, distributed) - self.valid_loader = CyclicIterator(valid_loader, valid_sampler, distributed) - - self.regular_coeff = regular_coeff - self.regular_ratio = regular_ratio - self.warmup_epochs = warmup_epochs - self.fix_head = fix_head - self.epochs = epochs - self.steps_per_epoch = steps_per_epoch - if self.steps_per_epoch is None: - self.steps_per_epoch = min(len(self.train_loader), len(self.valid_loader)) - self.loss_alpha = loss_alpha - self.grad_clip = grad_clip - if interactive_type == "kl": - self.interactive_loss = InteractiveKLLoss(loss_T) - elif interactive_type == "smoothl1": - self.interactive_loss = nn.SmoothL1Loss() - self.loss_T = loss_T - self.distributed = distributed - self.log_frequency = log_frequency - self.main_proc = not distributed or local_rank == 0 - - self.logger = logger - self.checkpoint_dir = output_path - if self.main_proc: - os.makedirs(self.checkpoint_dir, exist_ok=True) - if distributed: - torch.distributed.barrier() - - self.model_small = model_small - self.model_large = model_large - if self.fix_head: - for param in self.model_small.aux_head.parameters(): - param.requires_grad = False - for param in self.model_large.aux_head.parameters(): - param.requires_grad = False - - self.mutator_small = RegularizedDartsMutator(self.model_small).cuda() - self.mutator_large = DartsDiscreteMutator(self.model_large, self.mutator_small).cuda() - self.criterion = criterion - - self.optimizer_small = torch.optim.SGD(self.model_small.parameters(), w_lr, - momentum=w_momentum, weight_decay=w_weight_decay) - self.optimizer_large = torch.optim.SGD(self.model_large.parameters(), nasnet_lr, - momentum=w_momentum, weight_decay=w_weight_decay) - self.optimizer_alpha = torch.optim.Adam(self.mutator_small.parameters(), alpha_lr, - betas=(0.5, 0.999), weight_decay=alpha_weight_decay) - - if distributed: - apex.parallel.convert_syncbn_model(self.model_small) - apex.parallel.convert_syncbn_model(self.model_large) - self.model_small = DistributedDataParallel(self.model_small, delay_allreduce=True) - self.model_large = DistributedDataParallel(self.model_large, delay_allreduce=True) - self.mutator_small = RegularizedMutatorParallel(self.mutator_small, delay_allreduce=True) - if share_module: - self.model_small.callback_queued = True - self.model_large.callback_queued = True - # mutator large never gets optimized, so do not need parallelized - - def _warmup(self, phase, epoch): - assert phase in [PHASE_SMALL, PHASE_LARGE] - if phase == PHASE_SMALL: - model, optimizer = self.model_small, self.optimizer_small - elif phase == PHASE_LARGE: - model, optimizer = self.model_large, self.optimizer_large - model.train() - meters = AverageMeterGroup() - for step in range(self.steps_per_epoch): - x, y = next(self.train_loader) - x, y = x.cuda(), y.cuda() - - optimizer.zero_grad() - logits_main, _ = model(x) - loss = self.criterion(logits_main, 
y) - loss.backward() - - self._clip_grad_norm(model) - optimizer.step() - prec1, prec5 = accuracy(logits_main, y, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics, self.distributed) - meters.update(metrics) - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - self.logger.info("Epoch [%d/%d] Step [%d/%d] (%s) %s", epoch + 1, self.epochs, - step + 1, self.steps_per_epoch, phase, meters) - - def _clip_grad_norm(self, model): - if isinstance(model, DistributedDataParallel): - nn.utils.clip_grad_norm_(model.module.parameters(), self.grad_clip) - else: - nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip) - - def _reset_nan(self, parameters): - with torch.no_grad(): - for param in parameters: - for i, p in enumerate(param): - if p != p: # equivalent to `isnan(p)` - param[i] = float("-inf") - - def _joint_train(self, epoch): - self.model_large.train() - self.model_small.train() - meters = AverageMeterGroup() - for step in range(self.steps_per_epoch): - trn_x, trn_y = next(self.train_loader) - val_x, val_y = next(self.valid_loader) - trn_x, trn_y = trn_x.cuda(), trn_y.cuda() - val_x, val_y = val_x.cuda(), val_y.cuda() - - # step 1. optimize architecture - self.optimizer_alpha.zero_grad() - self.optimizer_large.zero_grad() - reg_decay = max(self.regular_coeff * (1 - float(epoch - self.warmup_epochs) / ( - (self.epochs - self.warmup_epochs) * self.regular_ratio)), 0) - loss_regular = self.mutator_small.reset_with_loss() - if loss_regular: - loss_regular *= reg_decay - logits_search, emsemble_logits_search = self.model_small(val_x) - logits_main, emsemble_logits_main = self.model_large(val_x) - loss_cls = (self.criterion(logits_search, val_y) + self.criterion(logits_main, val_y)) / self.loss_alpha - loss_interactive = self.interactive_loss(emsemble_logits_search, emsemble_logits_main) * (self.loss_T ** 2) * self.loss_alpha - loss = loss_cls + loss_interactive + loss_regular - loss.backward() - self._clip_grad_norm(self.model_large) - self.optimizer_large.step() - self.optimizer_alpha.step() - # NOTE: need to call here `self._reset_nan(self.mutator_small.parameters())` if `cut_choices` - - # step 2. 
optimize op weights - self.optimizer_small.zero_grad() - with torch.no_grad(): - # resample architecture since parameters have been changed - self.mutator_small.reset_with_loss() - logits_search_train, _ = self.model_small(trn_x) - loss_weight = self.criterion(logits_search_train, trn_y) - loss_weight.backward() - self._clip_grad_norm(self.model_small) - self.optimizer_small.step() - - metrics = {"loss_cls": loss_cls, "loss_interactive": loss_interactive, - "loss_regular": loss_regular, "loss_weight": loss_weight} - metrics = reduce_metrics(metrics, self.distributed) - meters.update(metrics) - - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - self.logger.info("Epoch [%d/%d] Step [%d/%d] (joint) %s", epoch + 1, self.epochs, - step + 1, self.steps_per_epoch, meters) - - def train(self): - for epoch in range(self.epochs): - if epoch < self.warmup_epochs: - with torch.no_grad(): # otherwise grads will be retained on the architecture params - self.mutator_small.reset_with_loss() - self._warmup(PHASE_SMALL, epoch) - else: - with torch.no_grad(): - self.mutator_large.reset() - self._warmup(PHASE_LARGE, epoch) - self._joint_train(epoch) - - self.export(os.path.join(self.checkpoint_dir, "epoch_{:02d}.json".format(epoch)), - os.path.join(self.checkpoint_dir, "epoch_{:02d}.genotypes".format(epoch))) - - def export(self, file, genotype_file): - if self.main_proc: - mutator_export, genotypes = self.mutator_small.export(self.logger) - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - with open(genotype_file, "w") as f: - f.write(str(genotypes)) diff --git a/nni/algorithms/nas/pytorch/cdarts/utils.py b/nni/algorithms/nas/pytorch/cdarts/utils.py deleted file mode 100644 index 96afa9425..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/utils.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
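The interactive term in ``_joint_train`` above is a standard temperature-scaled distillation loss between the two networks' ensemble logits. A self-contained sketch of just that piece, with random logits and illustrative hyper-parameter values:

```python
# Sketch of the temperature-scaled KL term used by CdartsTrainer._joint_train.
# Logits and hyper-parameter values below are placeholders for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F

class InteractiveKLLoss(nn.Module):
    def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        self.kl_loss = nn.KLDivLoss()

    def forward(self, student, teacher):
        # KL divergence between softened student and teacher distributions.
        return self.kl_loss(F.log_softmax(student / self.temperature, dim=1),
                            F.softmax(teacher / self.temperature, dim=1))

loss_T, loss_alpha = 2.0, 2.0
student_logits, teacher_logits = torch.randn(8, 10), torch.randn(8, 10)
# The trainer multiplies by T**2 to keep the gradient scale roughly independent
# of the temperature, then by the alpha coefficient.
loss_interactive = InteractiveKLLoss(loss_T)(student_logits, teacher_logits) * loss_T ** 2 * loss_alpha
print(loss_interactive.item())
```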
- -import json -import os - -import torch -import torch.distributed as dist - - -class CyclicIterator: - def __init__(self, loader, sampler, distributed): - self.loader = loader - self.sampler = sampler - self.epoch = 0 - self.distributed = distributed - self._next_epoch() - - def _next_epoch(self): - if self.distributed: - self.sampler.set_epoch(self.epoch) - self.iterator = iter(self.loader) - self.epoch += 1 - - def __len__(self): - return len(self.loader) - - def __iter__(self): - return self - - def __next__(self): - try: - return next(self.iterator) - except StopIteration: - self._next_epoch() - return next(self.iterator) - - -class TorchTensorEncoder(json.JSONEncoder): - def default(self, o): # pylint: disable=method-hidden - if isinstance(o, torch.Tensor): - return o.tolist() - return super().default(o) - - -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0) - res.append(correct_k.mul_(1.0 / batch_size)) - return res - - -def reduce_tensor(tensor): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= float(os.environ["WORLD_SIZE"]) - return rt - - -def reduce_metrics(metrics, distributed=False): - if distributed: - return {k: reduce_tensor(v).item() for k, v in metrics.items()} - return {k: v.item() for k, v in metrics.items()} diff --git a/nni/algorithms/nas/pytorch/classic_nas/__init__.py b/nni/algorithms/nas/pytorch/classic_nas/__init__.py deleted file mode 100644 index ec3f5a489..000000000 --- a/nni/algorithms/nas/pytorch/classic_nas/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import get_and_apply_next_architecture diff --git a/nni/algorithms/nas/pytorch/classic_nas/mutator.py b/nni/algorithms/nas/pytorch/classic_nas/mutator.py deleted file mode 100644 index 7254a8b0b..000000000 --- a/nni/algorithms/nas/pytorch/classic_nas/mutator.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import sys - -import torch - -import nni -from nni.runtime.env_vars import trial_env_vars -from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope -from nni.nas.pytorch.mutator import Mutator - -logger = logging.getLogger(__name__) - -NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE" -LAYER_CHOICE = "layer_choice" -INPUT_CHOICE = "input_choice" - - -def get_and_apply_next_architecture(model): - """ - Wrapper of :class:`~nni.nas.pytorch.classic_nas.mutator.ClassicMutator` to make it more meaningful, - similar to ``get_next_parameter`` for HPO. - - It will generate search space based on ``model``. - If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for - generating search space for the experiment. - If not, there are still two mode, one is nni experiment mode where users - use ``nnictl`` to start an experiment. The other is standalone mode - where users directly run the trial command, this mode chooses the first - one(s) for each LayerChoice and InputChoice. - - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. 
- """ - ClassicMutator(model) - - -class ClassicMutator(Mutator): - """ - This mutator is to apply the architecture chosen from tuner. - It implements the forward function of LayerChoice and InputChoice, - to only activate the chosen ones. - - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - - def __init__(self, model): - super(ClassicMutator, self).__init__(model) - self._chosen_arch = {} - self._search_space = self._generate_search_space() - if NNI_GEN_SEARCH_SPACE in os.environ: - # dry run for only generating search space - self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE]) - sys.exit(0) - - if trial_env_vars.NNI_PLATFORM is None: - logger.warning("This is in standalone mode, the chosen are the first one(s).") - self._chosen_arch = self._standalone_generate_chosen() - else: - # get chosen arch from tuner - self._chosen_arch = nni.get_next_parameter() - if self._chosen_arch is None: - if trial_env_vars.NNI_PLATFORM == "unittest": - # happens if NNI_PLATFORM is intentionally set, e.g., in UT - logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.") - self._chosen_arch = self._standalone_generate_chosen() - else: - raise RuntimeError("Chosen architecture is None. This may be a platform error.") - self.reset() - - def _sample_layer_choice(self, mutable, idx, value, search_space_item): - """ - Convert layer choice to tensor representation. - - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - # doesn't support multihot for layer choice yet - onehot_list = [False] * len(mutable) - assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) - onehot_list[idx] = True - return torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable - - def _sample_input_choice(self, mutable, idx, value, search_space_item): - """ - Convert input choice to tensor representation. - - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - candidate_repr = search_space_item["candidates"] - multihot_list = [False] * mutable.n_candidates - for i, v in zip(idx, value): - assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \ - "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v) - assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) - multihot_list[i] = True - return torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable - - def sample_search(self): - """ - See :meth:`sample_final`. - """ - return self.sample_final() - - def sample_final(self): - """ - Convert the chosen arch and apply it on model. 
- """ - assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ - "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), - self._chosen_arch.keys()) - result = dict() - for mutable in self.mutables: - if isinstance(mutable, (LayerChoice, InputChoice)): - assert mutable.key in self._chosen_arch, \ - "Expected '{}' in chosen arch, but not found.".format(mutable.key) - data = self._chosen_arch[mutable.key] - assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ - "'{}' is not a valid choice.".format(data) - if isinstance(mutable, LayerChoice): - result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, InputChoice): - result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return result - - def _standalone_generate_chosen(self): - """ - Generate the chosen architecture for standalone mode, - i.e., choose the first one(s) for LayerChoice and InputChoice. - :: - { key_name: {"_value": "conv1", - "_idx": 0} } - { key_name: {"_value": ["in1"], - "_idx": [0]} } - Returns - ------- - dict - the chosen architecture - """ - chosen_arch = {} - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - chosen_arch[key] = {"_value": choices[0], "_idx": 0} - elif val["_type"] == INPUT_CHOICE: - choices = val["_value"]["candidates"] - n_chosen = val["_value"]["n_chosen"] - if n_chosen is None: - n_chosen = len(choices) - chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} - else: - raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) - return chosen_arch - - def _generate_search_space(self): - """ - Generate search space from mutables. - Here is the search space format: - :: - { key_name: {"_type": "layer_choice", - "_value": ["conv1", "conv2"]} } - { key_name: {"_type": "input_choice", - "_value": {"candidates": ["in1", "in2"], - "n_chosen": 1}} } - Returns - ------- - dict - the generated search space - """ - search_space = {} - for mutable in self.mutables: - # for now we only generate flattened search space - if isinstance(mutable, LayerChoice): - key = mutable.key - val = mutable.names - search_space[key] = {"_type": LAYER_CHOICE, "_value": val} - elif isinstance(mutable, InputChoice): - key = mutable.key - search_space[key] = {"_type": INPUT_CHOICE, - "_value": {"candidates": mutable.choose_from, - "n_chosen": mutable.n_chosen}} - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return search_space - - def _dump_search_space(self, file_path): - with open(file_path, "w") as ss_file: - json.dump(self._search_space, ss_file, sort_keys=True, indent=2) diff --git a/nni/algorithms/nas/pytorch/cream/__init__.py b/nni/algorithms/nas/pytorch/cream/__init__.py deleted file mode 100755 index 43a038b46..000000000 --- a/nni/algorithms/nas/pytorch/cream/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from .trainer import CreamSupernetTrainer diff --git a/nni/algorithms/nas/pytorch/cream/trainer.py b/nni/algorithms/nas/pytorch/cream/trainer.py deleted file mode 100644 index b44f40466..000000000 --- a/nni/algorithms/nas/pytorch/cream/trainer.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from copy import deepcopy - -import torch -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .utils import accuracy, reduce_metrics - -logger = logging.getLogger(__name__) - - -class CreamSupernetTrainer(Trainer): - """ - This trainer trains a supernet and output prioritized architectures that can be used for other tasks. - - Parameters - ---------- - model : nn.Module - Model with mutables. - loss : callable - Called with logits and targets. Returns a loss tensor. - val_loss : callable - Called with logits and targets for validation only. Returns a loss tensor. - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - train_loader : iterablez - Data loader of training. Raise ``StopIteration`` when one epoch is exhausted. - valid_loader : iterablez - Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted. - mutator : Mutator - A mutator object that has been initialized with the model. - batch_size : int - Batch size. - log_frequency : int - Number of mini-batches to log metrics. - meta_sta_epoch : int - start epoch of using meta matching network to pick teacher architecture - update_iter : int - interval of updating meta matching networks - slices : int - batch size of mini training data in the process of training meta matching network - pool_size : int - board size - pick_method : basestring - how to pick teacher network - choice_num : int - number of operations in supernet - sta_num : int - layer number of each stage in supernet (5 stage in supernet) - acc_gap : int - maximum accuracy improvement to omit the limitation of flops - flops_dict : Dict - dictionary of each layer's operations in supernet - flops_fixed : int - flops of fixed part in supernet - local_rank : int - index of current rank - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. 
- """ - - def __init__(self, model, loss, val_loss, - optimizer, num_epochs, train_loader, valid_loader, - mutator=None, batch_size=64, log_frequency=None, - meta_sta_epoch=20, update_iter=200, slices=2, - pool_size=10, pick_method='meta', choice_num=6, - sta_num=(4, 4, 4, 4, 4), acc_gap=5, - flops_dict=None, flops_fixed=0, local_rank=0, callbacks=None): - assert torch.cuda.is_available() - super(CreamSupernetTrainer, self).__init__(model, mutator, loss, None, - optimizer, num_epochs, None, None, - batch_size, None, None, log_frequency, callbacks) - self.model = model - self.loss = loss - self.val_loss = val_loss - self.train_loader = train_loader - self.valid_loader = valid_loader - self.log_frequency = log_frequency - self.batch_size = batch_size - self.optimizer = optimizer - self.model = model - self.loss = loss - self.num_epochs = num_epochs - self.meta_sta_epoch = meta_sta_epoch - self.update_iter = update_iter - self.slices = slices - self.pick_method = pick_method - self.pool_size = pool_size - self.local_rank = local_rank - self.choice_num = choice_num - self.sta_num = sta_num - self.acc_gap = acc_gap - self.flops_dict = flops_dict - self.flops_fixed = flops_fixed - - self.current_student_arch = None - self.current_teacher_arch = None - self.main_proc = (local_rank == 0) - self.current_epoch = 0 - - self.prioritized_board = [] - - # size of prioritized board - def _board_size(self): - return len(self.prioritized_board) - - # select teacher architecture according to the logit difference - def _select_teacher(self): - self._replace_mutator_cand(self.current_student_arch) - - if self.pick_method == 'top1': - meta_value, teacher_cand = 0.5, sorted( - self.prioritized_board, reverse=True)[0][3] - elif self.pick_method == 'meta': - meta_value, cand_idx, teacher_cand = -1000000000, -1, None - for now_idx, item in enumerate(self.prioritized_board): - inputx = item[4] - output = torch.nn.functional.softmax(self.model(inputx), dim=1) - weight = self.model.module.forward_meta(output - item[5]) - if weight > meta_value: - meta_value = weight - cand_idx = now_idx - teacher_cand = self.prioritized_board[cand_idx][3] - assert teacher_cand is not None - meta_value = torch.nn.functional.sigmoid(-weight) - else: - raise ValueError('Method Not supported') - - return meta_value, teacher_cand - - # check whether to update prioritized board - def _isUpdateBoard(self, prec1, flops): - if self.current_epoch <= self.meta_sta_epoch: - return False - - if len(self.prioritized_board) < self.pool_size: - return True - - if prec1 > self.prioritized_board[-1][1] + self.acc_gap: - return True - - if prec1 > self.prioritized_board[-1][1] and flops < self.prioritized_board[-1][2]: - return True - - return False - - # update prioritized board - def _update_prioritized_board(self, inputs, teacher_output, outputs, prec1, flops): - if self._isUpdateBoard(prec1, flops): - val_prec1 = prec1 - training_data = deepcopy(inputs[:self.slices].detach()) - if len(self.prioritized_board) == 0: - features = deepcopy(outputs[:self.slices].detach()) - else: - features = deepcopy( - teacher_output[:self.slices].detach()) - self.prioritized_board.append( - (val_prec1, - prec1, - flops, - self.current_student_arch, - training_data, - torch.nn.functional.softmax( - features, - dim=1))) - self.prioritized_board = sorted( - self.prioritized_board, reverse=True) - - if len(self.prioritized_board) > self.pool_size: - del self.prioritized_board[-1] - - # only update student network weights - def _update_student_weights_only(self, grad_1): 
- for weight, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1): - weight.grad = grad_item - torch.nn.utils.clip_grad_norm_( - self.model.module.rand_parameters(self.current_student_arch), 1) - self.optimizer.step() - for weight, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1): - del weight.grad - - # only update meta networks weights - def _update_meta_weights_only(self, teacher_cand, grad_teacher): - for weight, grad_item in zip(self.model.module.rand_parameters( - teacher_cand, self.pick_method == 'meta'), grad_teacher): - weight.grad = grad_item - - # clip gradients - torch.nn.utils.clip_grad_norm_( - self.model.module.rand_parameters( - self.current_student_arch, self.pick_method == 'meta'), 1) - - self.optimizer.step() - for weight, grad_item in zip(self.model.module.rand_parameters( - teacher_cand, self.pick_method == 'meta'), grad_teacher): - del weight.grad - - # simulate sgd updating - def _simulate_sgd_update(self, w, g, optimizer): - return g * optimizer.param_groups[-1]['lr'] + w - - # split training images into several slices - def _get_minibatch_input(self, input): # pylint: disable=redefined-builtin - slice = self.slices # pylint: disable=redefined-builtin - x = deepcopy(input[:slice].clone().detach()) - return x - - # calculate 1st gradient of student architectures - def _calculate_1st_gradient(self, kd_loss): - self.optimizer.zero_grad() - grad = torch.autograd.grad( - kd_loss, - self.model.module.rand_parameters(self.current_student_arch), - create_graph=True) - return grad - - # calculate 2nd gradient of meta networks - def _calculate_2nd_gradient(self, validation_loss, teacher_cand, students_weight): - self.optimizer.zero_grad() - grad_student_val = torch.autograd.grad( - validation_loss, - self.model.module.rand_parameters(self.current_student_arch), - retain_graph=True) - - grad_teacher = torch.autograd.grad( - students_weight[0], - self.model.module.rand_parameters( - teacher_cand, - self.pick_method == 'meta'), - grad_outputs=grad_student_val) - return grad_teacher - - # forward training data - def _forward_training(self, x, meta_value): - self._replace_mutator_cand(self.current_student_arch) - output = self.model(x) - - with torch.no_grad(): - self._replace_mutator_cand(self.current_teacher_arch) - teacher_output = self.model(x) - soft_label = torch.nn.functional.softmax(teacher_output, dim=1) - - kd_loss = meta_value * \ - self._cross_entropy_loss_with_soft_target(output, soft_label) - return kd_loss - - # calculate soft target loss - def _cross_entropy_loss_with_soft_target(self, pred, soft_target): - logsoftmax = torch.nn.LogSoftmax() - return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) - - # forward validation data - def _forward_validation(self, input, target): # pylint: disable=redefined-builtin - slice = self.slices # pylint: disable=redefined-builtin - x = input[slice:slice * 2].clone() - - self._replace_mutator_cand(self.current_student_arch) - output_2 = self.model(x) - - validation_loss = self.loss(output_2, target[slice:slice * 2]) - return validation_loss - - def _isUpdateMeta(self, batch_idx): - isUpdate = True - isUpdate &= (self.current_epoch > self.meta_sta_epoch) - isUpdate &= (batch_idx > 0) - isUpdate &= (batch_idx % self.update_iter == 0) - isUpdate &= (self._board_size() > 0) - return isUpdate - - def _replace_mutator_cand(self, cand): - self.mutator._cache = cand - - # update meta matching networks - def _run_update(self, input, target, 
batch_idx): # pylint: disable=redefined-builtin - if self._isUpdateMeta(batch_idx): - x = self._get_minibatch_input(input) - - meta_value, teacher_cand = self._select_teacher() - - kd_loss = self._forward_training(x, meta_value) - - # calculate 1st gradient - grad_1st = self._calculate_1st_gradient(kd_loss) - - # simulate updated student weights - students_weight = [ - self._simulate_sgd_update( - p, grad_item, self.optimizer) for p, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1st)] - - # update student weights - self._update_student_weights_only(grad_1st) - - validation_loss = self._forward_validation(input, target) - - # calculate 2nd gradient - grad_teacher = self._calculate_2nd_gradient(validation_loss, teacher_cand, students_weight) - - # update meta matching networks - self._update_meta_weights_only(teacher_cand, grad_teacher) - - # delete internal variants - del grad_teacher, grad_1st, x, validation_loss, kd_loss, students_weight - - def _get_cand_flops(self, cand): - flops = 0 - for block_id, block in enumerate(cand): - if block == 'LayerChoice1' or block_id == 'LayerChoice23': - continue - for idx, choice in enumerate(cand[block]): - flops += self.flops_dict[block_id][idx] * (1 if choice else 0) - return flops + self.flops_fixed - - def train_one_epoch(self, epoch): - self.current_epoch = epoch - meters = AverageMeterGroup() - self.steps_per_epoch = len(self.train_loader) - for step, (input_data, target) in enumerate(self.train_loader): - self.mutator.reset() - self.current_student_arch = self.mutator._cache - - input_data, target = input_data.cuda(), target.cuda() - - # calculate flops of current architecture - cand_flops = self._get_cand_flops(self.mutator._cache) - - # update meta matching network - self._run_update(input_data, target, step) - - if self._board_size() > 0: - # select teacher architecture - meta_value, teacher_cand = self._select_teacher() - self.current_teacher_arch = teacher_cand - - # forward supernet - if self._board_size() == 0 or epoch <= self.meta_sta_epoch: - self._replace_mutator_cand(self.current_student_arch) - output = self.model(input_data) - - loss = self.loss(output, target) - kd_loss, teacher_output, teacher_cand = None, None, None - else: - self._replace_mutator_cand(self.current_student_arch) - output = self.model(input_data) - - gt_loss = self.loss(output, target) - - with torch.no_grad(): - self._replace_mutator_cand(self.current_teacher_arch) - teacher_output = self.model(input_data).detach() - - soft_label = torch.nn.functional.softmax(teacher_output, dim=1) - kd_loss = self._cross_entropy_loss_with_soft_target(output, soft_label) - - loss = (meta_value * kd_loss + (2 - meta_value) * gt_loss) / 2 - - # update network - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - - # update metrics - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics) - meters.update(metrics) - - # update prioritized board - self._update_prioritized_board(input_data, teacher_output, output, metrics['prec1'], cand_flops) - - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, self.num_epochs, - step + 1, len(self.train_loader), meters) - - if self.main_proc and self.num_epochs == epoch + 1: - for idx, i in enumerate(self.prioritized_board): - logger.info("No.%s %s", idx, i[:4]) - - def validate_one_epoch(self, epoch): 
- self.model.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - for step, (x, y) in enumerate(self.valid_loader): - self.mutator.reset() - logits = self.model(x) - loss = self.val_loss(logits, y) - prec1, prec5 = accuracy(logits, y, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics) - meters.update(metrics) - - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.valid_loader), meters) diff --git a/nni/algorithms/nas/pytorch/cream/utils.py b/nni/algorithms/nas/pytorch/cream/utils.py deleted file mode 100644 index 7d71faa71..000000000 --- a/nni/algorithms/nas/pytorch/cream/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - - -import os -import torch.distributed as dist - - -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.reshape(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(1.0 / batch_size)) - return res - - -def reduce_metrics(metrics): - return {k: reduce_tensor(v).item() for k, v in metrics.items()} - - -def reduce_tensor(tensor): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= float(os.environ["WORLD_SIZE"]) - return rt diff --git a/nni/algorithms/nas/pytorch/darts/__init__.py b/nni/algorithms/nas/pytorch/darts/__init__.py deleted file mode 100644 index 1a22790fb..000000000 --- a/nni/algorithms/nas/pytorch/darts/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import DartsMutator -from .trainer import DartsTrainer diff --git a/nni/algorithms/nas/pytorch/darts/mutator.py b/nni/algorithms/nas/pytorch/darts/mutator.py deleted file mode 100644 index a4c3898a9..000000000 --- a/nni/algorithms/nas/pytorch/darts/mutator.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice - -_logger = logging.getLogger(__name__) - - -class DartsMutator(Mutator): - """ - Connects the model in a DARTS (differentiable) way. - - An extra connection is automatically inserted for each LayerChoice, when this connection is selected, there is no - op on this LayerChoice (namely a ``ZeroOp``), in which case, every element in the exported choice list is ``false`` - (not chosen). - - All input choice will be fully connected in the search phase. On exporting, the input choice will choose inputs based - on keys in ``choose_from``. If the keys were to be keys of LayerChoices, the top logit of the corresponding LayerChoice - will join the competition of input choice to compete against other logits. Otherwise, the logit will be assumed 0. - - It's possible to cut branches by setting parameter ``choices`` in a particular position to ``-inf``. After softmax, the - value would be 0. Framework will ignore 0 values and not connect. 
Note that the gradient on the ``-inf`` location will - be 0. Since manipulations with ``-inf`` will be ``nan``, you need to handle the gradient update phase carefully. - - Attributes - ---------- - choices: ParameterDict - dict that maps keys of LayerChoices to weighted-connection float tensors. - """ - def __init__(self, model): - super().__init__(model) - self.choices = nn.ParameterDict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1)) - - def device(self): - for v in self.choices.values(): - return v.device - - def sample_search(self): - result = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1] - elif isinstance(mutable, InputChoice): - result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) - return result - - def sample_final(self): - result = dict() - edges_max = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - max_val, index = torch.max(F.softmax(self.choices[mutable.key], dim=-1)[:-1], 0) - edges_max[mutable.key] = max_val - result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool() - for mutable in self.mutables: - if isinstance(mutable, InputChoice): - if mutable.n_chosen is not None: - weights = [] - for src_key in mutable.choose_from: - if src_key not in edges_max: - _logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key) - weights.append(edges_max.get(src_key, 0.)) - weights = torch.tensor(weights) # pylint: disable=not-callable - _, topk_edge_indices = torch.topk(weights, mutable.n_chosen) - selected_multihot = [] - for i, src_key in enumerate(mutable.choose_from): - if i not in topk_edge_indices and src_key in result: - # If an edge is never selected, there is no need to calculate any op on this edge. - # This is to eliminate redundant calculation. - result[src_key] = torch.zeros_like(result[src_key]) - selected_multihot.append(i in topk_edge_indices) - result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable - else: - result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable - return result diff --git a/nni/algorithms/nas/pytorch/darts/trainer.py b/nni/algorithms/nas/pytorch/darts/trainer.py deleted file mode 100644 index e2d8e1866..000000000 --- a/nni/algorithms/nas/pytorch/darts/trainer.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import copy -import logging - -import torch -import torch.nn as nn -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .mutator import DartsMutator - -logger = logging.getLogger(__name__) - - -class DartsTrainer(Trainer): - """ - DARTS trainer. - - Parameters - ---------- - model : nn.Module - PyTorch model to be trained. - loss : callable - Receives logits and ground truth label, return a loss tensor. - metrics : callable - Receives logits and ground truth label, return a dict of metrics. - optimizer : Optimizer - The optimizer used for optimizing the model. - num_epochs : int - Number of epochs planned for training. - dataset_train : Dataset - Dataset for training. Will be split for training weights and architecture weights. 
- dataset_valid : Dataset - Dataset for testing. - mutator : DartsMutator - Use in case of customizing your own DartsMutator. By default will instantiate a DartsMutator. - batch_size : int - Batch size. - workers : int - Workers for data loading. - device : torch.device - ``torch.device("cpu")`` or ``torch.device("cuda")``. - log_frequency : int - Step count per logging. - callbacks : list of Callback - list of callbacks to trigger at events. - arc_learning_rate : float - Learning rate of architecture parameters. - unrolled : float - ``True`` if using second order optimization, else first order optimization. - """ - def __init__(self, model, loss, metrics, - optimizer, num_epochs, dataset_train, dataset_valid, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, - callbacks=None, arc_learning_rate=3.0E-4, unrolled=False): - super().__init__(model, mutator if mutator is not None else DartsMutator(model), - loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, - batch_size, workers, device, log_frequency, callbacks) - - self.ctrl_optim = torch.optim.Adam(self.mutator.parameters(), arc_learning_rate, betas=(0.5, 0.999), - weight_decay=1.0E-3) - self.unrolled = unrolled - - n_train = len(self.dataset_train) - split = n_train // 2 - indices = list(range(n_train)) - train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split]) - valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:]) - self.train_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=batch_size, - sampler=train_sampler, - num_workers=workers) - self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=batch_size, - sampler=valid_sampler, - num_workers=workers) - self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, - batch_size=batch_size, - num_workers=workers) - - def train_one_epoch(self, epoch): - self.model.train() - self.mutator.train() - meters = AverageMeterGroup() - for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)): - trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device) - val_X, val_y = val_X.to(self.device), val_y.to(self.device) - - # phase 1. architecture step - self.ctrl_optim.zero_grad() - if self.unrolled: - self._unrolled_backward(trn_X, trn_y, val_X, val_y) - else: - self._backward(val_X, val_y) - self.ctrl_optim.step() - - # phase 2: child network step - self.optimizer.zero_grad() - logits, loss = self._logits_and_loss(trn_X, trn_y) - loss.backward() - nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 
# gradient clipping - self.optimizer.step() - - metrics = self.metrics(logits, trn_y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.train_loader), meters) - - def validate_one_epoch(self, epoch): - self.model.eval() - self.mutator.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - self.mutator.reset() - for step, (X, y) in enumerate(self.test_loader): - X, y = X.to(self.device), y.to(self.device) - logits = self.model(X) - metrics = self.metrics(logits, y) - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.test_loader), meters) - - def _logits_and_loss(self, X, y): - self.mutator.reset() - logits = self.model(X) - loss = self.loss(logits, y) - self._write_graph_status() - return logits, loss - - def _backward(self, val_X, val_y): - """ - Simple backward with gradient descent - """ - _, loss = self._logits_and_loss(val_X, val_y) - loss.backward() - - def _unrolled_backward(self, trn_X, trn_y, val_X, val_y): - """ - Compute unrolled loss and backward its gradients - """ - backup_params = copy.deepcopy(tuple(self.model.parameters())) - - # do virtual step on training data - lr = self.optimizer.param_groups[0]["lr"] - momentum = self.optimizer.param_groups[0]["momentum"] - weight_decay = self.optimizer.param_groups[0]["weight_decay"] - self._compute_virtual_model(trn_X, trn_y, lr, momentum, weight_decay) - - # calculate unrolled loss on validation data - # keep gradients for model here for compute hessian - _, loss = self._logits_and_loss(val_X, val_y) - w_model, w_ctrl = tuple(self.model.parameters()), tuple(self.mutator.parameters()) - w_grads = torch.autograd.grad(loss, w_model + w_ctrl) - d_model, d_ctrl = w_grads[:len(w_model)], w_grads[len(w_model):] - - # compute hessian and final gradients - hessian = self._compute_hessian(backup_params, d_model, trn_X, trn_y) - with torch.no_grad(): - for param, d, h in zip(w_ctrl, d_ctrl, hessian): - # gradient = dalpha - lr * hessian - param.grad = d - lr * h - - # restore weights - self._restore_weights(backup_params) - - def _compute_virtual_model(self, X, y, lr, momentum, weight_decay): - """ - Compute unrolled weights w` - """ - # don't need zero_grad, using autograd to calculate gradients - _, loss = self._logits_and_loss(X, y) - gradients = torch.autograd.grad(loss, self.model.parameters()) - with torch.no_grad(): - for w, g in zip(self.model.parameters(), gradients): - m = self.optimizer.state[w].get("momentum_buffer", 0.) - w = w - lr * (momentum * m + g + weight_decay * w) - - def _restore_weights(self, backup_params): - with torch.no_grad(): - for param, backup in zip(self.model.parameters(), backup_params): - param.copy_(backup) - - def _compute_hessian(self, backup_params, dw, trn_X, trn_y): - """ - dw = dw` { L_val(w`, alpha) } - w+ = w + eps * dw - w- = w - eps * dw - hessian = (dalpha { L_trn(w+, alpha) } - dalpha { L_trn(w-, alpha) }) / (2*eps) - eps = 0.01 / ||dw|| - """ - self._restore_weights(backup_params) - norm = torch.cat([w.view(-1) for w in dw]).norm() - eps = 0.01 / norm - if norm < 1E-8: - logger.warning("In computing hessian, norm is smaller than 1E-8, cause eps to be %.6f.", norm.item()) - - dalphas = [] - for e in [eps, -2. 
* eps]: - # w+ = w + eps*dw`, w- = w - eps*dw` - with torch.no_grad(): - for p, d in zip(self.model.parameters(), dw): - p += e * d - - _, loss = self._logits_and_loss(trn_X, trn_y) - dalphas.append(torch.autograd.grad(loss, self.mutator.parameters())) - - dalpha_pos, dalpha_neg = dalphas # dalpha { L_trn(w+) }, # dalpha { L_trn(w-) } - hessian = [(p - n) / (2. * eps) for p, n in zip(dalpha_pos, dalpha_neg)] - return hessian diff --git a/nni/algorithms/nas/pytorch/enas/__init__.py b/nni/algorithms/nas/pytorch/enas/__init__.py deleted file mode 100644 index d3372836e..000000000 --- a/nni/algorithms/nas/pytorch/enas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import EnasMutator -from .trainer import EnasTrainer diff --git a/nni/algorithms/nas/pytorch/enas/mutator.py b/nni/algorithms/nas/pytorch/enas/mutator.py deleted file mode 100644 index 7fdba26b9..000000000 --- a/nni/algorithms/nas/pytorch/enas/mutator.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope - - -class StackedLSTMCell(nn.Module): - def __init__(self, layers, size, bias): - super().__init__() - self.lstm_num_layers = layers - self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) - for _ in range(self.lstm_num_layers)]) - - def forward(self, inputs, hidden): - prev_h, prev_c = hidden - next_h, next_c = [], [] - for i, m in enumerate(self.lstm_modules): - curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) - next_c.append(curr_c) - next_h.append(curr_h) - # current implementation only supports batch size equals 1, - # but the algorithm does not necessarily have this limitation - inputs = curr_h[-1].view(1, -1) - return next_h, next_c - - -class EnasMutator(Mutator): - """ - A mutator that mutates the graph with RL. - - Parameters - ---------- - model : nn.Module - PyTorch model. - lstm_size : int - Controller LSTM hidden units. - lstm_num_layers : int - Number of layers for stacked LSTM. - tanh_constant : float - Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. - cell_exit_extra_step : bool - If true, RL controller will perform an extra step at the exit of each MutableScope, dump the hidden state - and mark it as the hidden state of this MutableScope. This is to align with the original implementation of paper. - skip_target : float - Target probability that skipconnect will appear. - temperature : float - Temperature constant that divides the logits. - branch_bias : float - Manual bias applied to make some operations more likely to be chosen. - Currently this is implemented with a hardcoded match rule that aligns with original repo. - If a mutable has a ``reduce`` in its key, all its op choices - that contains `conv` in their typename will receive a bias of ``+self.branch_bias`` initially; while others - receive a bias of ``-self.branch_bias``. - entropy_reduction : str - Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. 
- """ - - def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, cell_exit_extra_step=False, - skip_target=0.4, temperature=None, branch_bias=0.25, entropy_reduction="sum"): - super().__init__(model) - self.lstm_size = lstm_size - self.lstm_num_layers = lstm_num_layers - self.tanh_constant = tanh_constant - self.temperature = temperature - self.cell_exit_extra_step = cell_exit_extra_step - self.skip_target = skip_target - self.branch_bias = branch_bias - - self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) - self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) - self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) - self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), requires_grad=False) # pylint: disable=not-callable - assert entropy_reduction in ["sum", "mean"], "Entropy reduction must be one of sum and mean." - self.entropy_reduction = torch.sum if entropy_reduction == "sum" else torch.mean - self.cross_entropy_loss = nn.CrossEntropyLoss(reduction="none") - self.bias_dict = nn.ParameterDict() - - self.max_layer_choice = 0 - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - if self.max_layer_choice == 0: - self.max_layer_choice = len(mutable) - assert self.max_layer_choice == len(mutable), \ - "ENAS mutator requires all layer choice have the same number of candidates." - # We are judging by keys and module types to add biases to layer choices. Needs refactor. - if "reduce" in mutable.key: - def is_conv(choice): - return "conv" in str(type(choice)).lower() - bias = torch.tensor([self.branch_bias if is_conv(choice) else -self.branch_bias # pylint: disable=not-callable - for choice in mutable]) - self.bias_dict[mutable.key] = nn.Parameter(bias, requires_grad=False) - - self.embedding = nn.Embedding(self.max_layer_choice + 1, self.lstm_size) - self.soft = nn.Linear(self.lstm_size, self.max_layer_choice, bias=False) - - def sample_search(self): - self._initialize() - self._sample(self.mutables) - return self._choices - - def sample_final(self): - return self.sample_search() - - def _sample(self, tree): - mutable = tree.mutable - if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_layer_choice(mutable) - elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_input_choice(mutable) - for child in tree.children: - self._sample(child) - if isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: - if self.cell_exit_extra_step: - self._lstm_next_step() - self._mark_anchor(mutable.key) - - def _initialize(self): - self._choices = dict() - self._anchors_hid = dict() - self._inputs = self.g_emb.data - self._c = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self._h = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - def _lstm_next_step(self): - self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) - - def _mark_anchor(self, key): - self._anchors_hid[key] = self._h[-1] - - def _sample_layer_choice(self, mutable): - 
self._lstm_next_step() - logit = self.soft(self._h[-1]) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * torch.tanh(logit) - if mutable.key in self.bias_dict: - logit += self.bias_dict[mutable.key] - branch_id = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - log_prob = self.cross_entropy_loss(logit, branch_id) - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - self._inputs = self.embedding(branch_id) - return F.one_hot(branch_id, num_classes=self.max_layer_choice).bool().view(-1) - - def _sample_input_choice(self, mutable): - query, anchors = [], [] - for label in mutable.choose_from: - if label not in self._anchors_hid: - self._lstm_next_step() - self._mark_anchor(label) # empty loop, fill not found - query.append(self.attn_anchor(self._anchors_hid[label])) - anchors.append(self._anchors_hid[label]) - query = torch.cat(query, 0) - query = torch.tanh(query + self.attn_query(self._h[-1])) - query = self.v_attn(query) - if self.temperature is not None: - query /= self.temperature - if self.tanh_constant is not None: - query = self.tanh_constant * torch.tanh(query) - - if mutable.n_chosen is None: - logit = torch.cat([-query, query], 1) # pylint: disable=invalid-unary-operand-type - - skip = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip_prob = torch.sigmoid(logit) - kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(logit, skip) - self._inputs = (torch.matmul(skip.float(), torch.cat(anchors, 0)) / (1. + torch.sum(skip))).unsqueeze(0) - else: - assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." - logit = query.view(1, -1) - index = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip = F.one_hot(index, num_classes=mutable.n_candidates).view(-1) - log_prob = self.cross_entropy_loss(logit, index) - self._inputs = anchors[index.item()] - - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - return skip.bool() diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py deleted file mode 100644 index 5e7a96658..000000000 --- a/nni/algorithms/nas/pytorch/enas/trainer.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from itertools import cycle - -import torch -import torch.nn as nn -import torch.optim as optim - -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup, to_device -from .mutator import EnasMutator - -logger = logging.getLogger(__name__) - - -class EnasTrainer(Trainer): - """ - ENAS trainer. - - Parameters - ---------- - model : nn.Module - PyTorch model to be trained. - loss : callable - Receives logits and ground truth label, return a loss tensor. - metrics : callable - Receives logits and ground truth label, return a dict of metrics. - reward_function : callable - Receives logits and ground truth label, return a tensor, which will be feeded to RL controller as reward. - optimizer : Optimizer - The optimizer used for optimizing the model. 
- num_epochs : int - Number of epochs planned for training. - dataset_train : Dataset - Dataset for training. Will be split for training weights and architecture weights. - dataset_valid : Dataset - Dataset for testing. - mutator : EnasMutator - Use when customizing your own mutator or a mutator with customized parameters. - batch_size : int - Batch size. - workers : int - Workers for data loading. - device : torch.device - ``torch.device("cpu")`` or ``torch.device("cuda")``. - log_frequency : int - Step count per logging. - callbacks : list of Callback - list of callbacks to trigger at events. - entropy_weight : float - Weight of sample entropy loss. - skip_weight : float - Weight of skip penalty loss. - baseline_decay : float - Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``. - child_steps : int - How many mini-batches for model training per epoch. - mutator_lr : float - Learning rate for RL controller. - mutator_steps_aggregate : int - Number of steps that will be aggregated into one mini-batch for RL controller. - mutator_steps : int - Number of mini-batches for each epoch of RL controller learning. - aux_weight : float - Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss. - test_arc_per_epoch : int - How many architectures are chosen for direct test after each epoch. - """ - def __init__(self, model, loss, metrics, reward_function, - optimizer, num_epochs, dataset_train, dataset_valid, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None, - entropy_weight=0.0001, skip_weight=0.8, baseline_decay=0.999, child_steps=500, - mutator_lr=0.00035, mutator_steps_aggregate=20, mutator_steps=50, aux_weight=0.4, - test_arc_per_epoch=1): - super().__init__(model, mutator if mutator is not None else EnasMutator(model), - loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, - batch_size, workers, device, log_frequency, callbacks) - self.reward_function = reward_function - self.mutator_optim = optim.Adam(self.mutator.parameters(), lr=mutator_lr) - self.batch_size = batch_size - self.workers = workers - - self.entropy_weight = entropy_weight - self.skip_weight = skip_weight - self.baseline_decay = baseline_decay - self.baseline = 0. 
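        # The baseline initialized above is maintained in train_one_epoch as an
        # exponential moving average of the controller reward:
        #     baseline = baseline * baseline_decay + reward * (1 - baseline_decay)
        # and the controller minimizes the REINFORCE loss
        #     sample_log_prob * (reward - baseline) + skip_weight * sample_skip_penalty,
        # with the entropy bonus entropy_weight * sample_entropy added to the reward.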
- self.mutator_steps_aggregate = mutator_steps_aggregate - self.mutator_steps = mutator_steps - self.child_steps = child_steps - self.aux_weight = aux_weight - self.test_arc_per_epoch = test_arc_per_epoch - - self.init_dataloader() - - def init_dataloader(self): - n_train = len(self.dataset_train) - split = n_train // 10 - indices = list(range(n_train)) - train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split]) - valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:]) - self.train_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=self.batch_size, - sampler=train_sampler, - num_workers=self.workers) - self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=self.batch_size, - sampler=valid_sampler, - num_workers=self.workers) - self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, - batch_size=self.batch_size, - num_workers=self.workers) - self.train_loader = cycle(self.train_loader) - self.valid_loader = cycle(self.valid_loader) - - def train_one_epoch(self, epoch): - # Sample model and train - self.model.train() - self.mutator.eval() - meters = AverageMeterGroup() - for step in range(1, self.child_steps + 1): - x, y = next(self.train_loader) - x, y = to_device(x, self.device), to_device(y, self.device) - self.optimizer.zero_grad() - - with torch.no_grad(): - self.mutator.reset() - self._write_graph_status() - logits = self.model(x) - - if isinstance(logits, tuple): - logits, aux_logits = logits - aux_loss = self.loss(aux_logits, y) - else: - aux_loss = 0. - metrics = self.metrics(logits, y) - loss = self.loss(logits, y) - loss = loss + self.aux_weight * aux_loss - loss.backward() - nn.utils.clip_grad_norm_(self.model.parameters(), 5.) - self.optimizer.step() - metrics["loss"] = loss.item() - meters.update(metrics) - - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, - self.num_epochs, step, self.child_steps, meters) - - # Train sampler (mutator) - self.model.eval() - self.mutator.train() - meters = AverageMeterGroup() - for mutator_step in range(1, self.mutator_steps + 1): - self.mutator_optim.zero_grad() - for step in range(1, self.mutator_steps_aggregate + 1): - x, y = next(self.valid_loader) - x, y = to_device(x, self.device), to_device(y, self.device) - - self.mutator.reset() - with torch.no_grad(): - logits = self.model(x) - self._write_graph_status() - metrics = self.metrics(logits, y) - reward = self.reward_function(logits, y) - if self.entropy_weight: - reward += self.entropy_weight * self.mutator.sample_entropy.item() - self.baseline = self.baseline * self.baseline_decay + reward * (1 - self.baseline_decay) - loss = self.mutator.sample_log_prob * (reward - self.baseline) - if self.skip_weight: - loss += self.skip_weight * self.mutator.sample_skip_penalty - metrics["reward"] = reward - metrics["loss"] = loss.item() - metrics["ent"] = self.mutator.sample_entropy.item() - metrics["log_prob"] = self.mutator.sample_log_prob.item() - metrics["baseline"] = self.baseline - metrics["skip"] = self.mutator.sample_skip_penalty - - loss /= self.mutator_steps_aggregate - loss.backward() - meters.update(metrics) - - cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate - if self.log_frequency is not None and cur_step % self.log_frequency == 0: - logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs, - mutator_step, self.mutator_steps, step, 
self.mutator_steps_aggregate, - meters) - - nn.utils.clip_grad_norm_(self.mutator.parameters(), 5.) - self.mutator_optim.step() - - def validate_one_epoch(self, epoch): - with torch.no_grad(): - for arc_id in range(self.test_arc_per_epoch): - meters = AverageMeterGroup() - for x, y in self.test_loader: - x, y = to_device(x, self.device), to_device(y, self.device) - self.mutator.reset() - logits = self.model(x) - if isinstance(logits, tuple): - logits, _ = logits - metrics = self.metrics(logits, y) - loss = self.loss(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - - logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", - epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch, - meters.summary()) diff --git a/nni/algorithms/nas/pytorch/fbnet/__init__.py b/nni/algorithms/nas/pytorch/fbnet/__init__.py deleted file mode 100644 index 38d96327e..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from __future__ import absolute_import - -from .mutator import FBNetMutator # noqa: F401 -from .trainer import FBNetTrainer # noqa: F401 -from .utils import ( # noqa: F401 - LookUpTable, - NASConfig, - RegularizerLoss, - model_init, - supernet_sample, -) diff --git a/nni/algorithms/nas/pytorch/fbnet/mutator.py b/nni/algorithms/nas/pytorch/fbnet/mutator.py deleted file mode 100644 index 914063b82..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/mutator.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from __future__ import absolute_import, division, print_function - -import torch -from torch import nn as nn -from torch.nn import functional as F -import numpy as np - -from nni.nas.pytorch.base_mutator import BaseMutator -from nni.nas.pytorch.mutables import LayerChoice - - -class MixedOp(nn.Module): - """ - This class is to instantiate and manage info of one LayerChoice. - It includes architecture weights and member functions for the weights. - """ - - def __init__(self, mutable, latency): - """ - Parameters - ---------- - mutable : LayerChoice - A LayerChoice in user model - latency : List - performance cost for each op in mutable - """ - super(MixedOp, self).__init__() - self.latency = latency - n_choices = len(mutable) - self.path_alpha = nn.Parameter( - torch.FloatTensor([1.0 / n_choices for i in range(n_choices)]) - ) - self.path_alpha.requires_grad = False - self.temperature = 1.0 - - def get_path_alpha(self): - """Return the architecture parameter.""" - return self.path_alpha - - def get_weighted_latency(self): - """Return the weighted perf_cost of current mutable.""" - soft_masks = self.probs_over_ops() - weighted_latency = sum(m * l for m, l in zip(soft_masks, self.latency)) - return weighted_latency - - def set_temperature(self, temperature): - """ - Set the annealed temperature for gumbel softmax. - - Parameters - ---------- - temperature : float - The annealed temperature for gumbel softmax - """ - self.temperature = temperature - - def to_requires_grad(self): - """Enable gradient calculation.""" - self.path_alpha.requires_grad = True - - def to_disable_grad(self): - """Disable gradient calculation.""" - self.path_alpha.requires_grad = False - - def probs_over_ops(self): - """Apply gumbel softmax to generate probability distribution.""" - return F.gumbel_softmax(self.path_alpha, self.temperature) - - def forward(self, mutable, x): - """ - Define forward of LayerChoice. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - x : tensor - inputs of this layer, only support one input - - Returns - ------- - output: tensor - output of this layer - """ - candidate_ops = list(mutable) - soft_masks = self.probs_over_ops() - output = sum(m * op(x) for m, op in zip(soft_masks, candidate_ops)) - - return output - - @property - def chosen_index(self): - """ - choose the op with max prob - - Returns - ------- - int - index of the chosen one - """ - alphas = self.path_alpha.data.detach().cpu().numpy() - index = int(np.argmax(alphas)) - return index - - -class FBNetMutator(BaseMutator): - """ - This mutator initializes and operates all the LayerChoices of the supernet. - It is for the related trainer to control the training flow of LayerChoices, - coordinating with whole training process. - """ - - def __init__(self, model, lookup_table): - """ - Init a MixedOp instance for each mutable i.e., LayerChoice. - And register the instantiated MixedOp in corresponding LayerChoice. - If does not register it in LayerChoice, DataParallel does'nt work then, - for architecture weights are not included in the DataParallel model. - When MixedOPs are registered, we use ```requires_grad``` to control - whether calculate gradients of architecture weights. - - Parameters - ---------- - model : pytorch model - The model that users want to tune, - it includes search space defined with nni nas apis - lookup_table : class - lookup table object to manage model space information, - including candidate ops for each stage as the model space, - input channels/output channels/stride/fm_size as the layer config, - and the performance information for perf_cost accumulation. - - """ - super(FBNetMutator, self).__init__(model) - self.mutable_list = [] - - # Collect the op names of the candidate ops within each mutable - ops_names_mutable = dict() - left = 0 - right = 1 - for stage_name in lookup_table.layer_num: - right = lookup_table.layer_num[stage_name] - stage_ops = lookup_table.lut_ops[stage_name] - ops_names = [op_name for op_name in stage_ops] - - for i in range(left, left + right): - ops_names_mutable[i] = ops_names - left += right - - # Create the mixed op - for i, mutable in enumerate(self.undedup_mutables): - ops_names = ops_names_mutable[i] - latency_mutable = lookup_table.lut_perf[i] - latency = [latency_mutable[op_name] for op_name in ops_names] - self.mutable_list.append(mutable) - mutable.registered_module = MixedOp(mutable, latency) - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callback of layer choice forward. This function defines the forward - logic of the input mutable. So mutable is only interface, its real - implementation is defined in mutator. - - Parameters - ---------- - mutable: LayerChoice - forward logic of this input mutable - args: list of torch.Tensor - inputs of this mutable - kwargs: dict - inputs of this mutable - - Returns - ------- - torch.Tensor - output of this mutable, i.e., LayerChoice - int - index of the chosen op - """ - # FIXME: return mask, to be consistent with other algorithms - idx = mutable.registered_module.chosen_index - return mutable.registered_module(mutable, *args, **kwargs), idx - - def num_arch_params(self): - """ - The number of mutables, i.e., LayerChoice - - Returns - ------- - int - the number of LayerChoice in user model - """ - return len(self.mutable_list) - - def get_architecture_parameters(self): - """ - Get all the architecture parameters. 
- - yield - ----- - PyTorch Parameter - Return path_alpha of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_path_alpha() - - def get_weighted_latency(self): - """ - Get the latency weighted by gumbel softmax coefficients. - - yield - ----- - Tuple - Return the weighted_latency of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_weighted_latency() - - def set_temperature(self, temperature): - """ - Set the annealed temperature of the op for gumbel softmax. - - Parameters - ---------- - temperature : float - The annealed temperature for gumbel softmax - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_temperature(temperature) - - def arch_requires_grad(self): - """ - Make architecture weights require gradient - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_requires_grad() - - def arch_disable_grad(self): - """ - Disable gradient of architecture weights, i.e., does not - calculate gradient for them. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_disable_grad() - - def sample_final(self): - """ - Generate the final chosen architecture. - - Returns - ------- - dict - the choice of each mutable, i.e., LayerChoice - """ - result = dict() - for mutable in self.undedup_mutables: - assert isinstance(mutable, LayerChoice) - index = mutable.registered_module.chosen_index - # pylint: disable=not-callable - result[mutable.key] = ( - F.one_hot(torch.tensor(index), num_classes=len(mutable)) - .view(-1) - .bool(), - ) - return result diff --git a/nni/algorithms/nas/pytorch/fbnet/trainer.py b/nni/algorithms/nas/pytorch/fbnet/trainer.py deleted file mode 100644 index 1eaababef..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/trainer.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
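The MixedOp removed above relaxes each LayerChoice with Gumbel-Softmax weights over its candidate ops and reuses the same weights to form a differentiable latency estimate. A minimal standalone sketch of that idea in plain PyTorch (the names `candidate_ops`, `op_latency` and `alpha` are illustrative placeholders, not NNI objects):

import torch
import torch.nn.functional as F

# Illustrative FBNet-style relaxation: three candidate ops with hypothetical
# per-op latency costs taken from a lookup table.
candidate_ops = [torch.nn.Conv2d(8, 8, 3, padding=1),
                 torch.nn.Conv2d(8, 8, 5, padding=2),
                 torch.nn.Identity()]
op_latency = torch.tensor([1.7, 3.2, 0.1])                   # perf_cost per op
alpha = torch.zeros(len(candidate_ops), requires_grad=True)  # architecture weights
temperature = 5.0                                            # annealed during search

x = torch.randn(2, 8, 16, 16)
gates = F.gumbel_softmax(alpha, tau=temperature)             # soft one-hot sample
output = sum(g * op(x) for g, op in zip(gates, candidate_ops))
expected_latency = torch.dot(gates, op_latency)              # differentiable latency term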
- -from __future__ import absolute_import, division, print_function - -import json -import os -import time -import torch - -import numpy as np - -from torch.autograd import Variable -from nni.nas.pytorch.base_trainer import BaseTrainer -from nni.nas.pytorch.trainer import TorchTensorEncoder -from nni.nas.pytorch.utils import AverageMeter -from .mutator import FBNetMutator -from .utils import RegularizerLoss, accuracy - - -class FBNetTrainer(BaseTrainer): - def __init__( - self, - model, - model_optim, - criterion, - device, - device_ids, - lookup_table, - train_loader, - valid_loader, - n_epochs=120, - load_ckpt=False, - arch_path=None, - logger=None, - ): - """ - Parameters - ---------- - model : pytorch model - the user model, which has mutables - model_optim : pytorch optimizer - the user defined optimizer - criterion : pytorch loss - the main task loss, nn.CrossEntropyLoss() is for classification - device : pytorch device - the devices to train/search the model - device_ids : list of int - the indexes of devices used for training - lookup_table : class - lookup table object for fbnet training - train_loader : pytorch data loader - data loader for the training set - valid_loader : pytorch data loader - data loader for the validation set - n_epochs : int - number of epochs to train/search - load_ckpt : bool - whether load checkpoint - arch_path : str - the path to store chosen architecture - logger : logger - the logger - """ - self.model = model - self.model_optim = model_optim - self.train_loader = train_loader - self.valid_loader = valid_loader - self.device = device - self.dev_num = len(device_ids) - self.n_epochs = n_epochs - self.lookup_table = lookup_table - self.config = lookup_table.config - self.start_epoch = self.config.start_epoch - self.temp = self.config.init_temperature - self.exp_anneal_rate = self.config.exp_anneal_rate - self.mode = self.config.mode - - self.load_ckpt = load_ckpt - self.arch_path = arch_path - self.logger = logger - - # scheduler of learning rate - self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - model_optim, T_max=n_epochs, last_epoch=-1 - ) - - # init mutator - self.mutator = FBNetMutator(model, lookup_table) - self.mutator.set_temperature(self.temp) - - # DataParallel should be put behind the init of mutator - self.model = torch.nn.DataParallel(self.model, device_ids=device_ids) - self.model.to(device) - - # build architecture optimizer - self.arch_optimizer = torch.optim.AdamW( - self.mutator.get_architecture_parameters(), - self.config.nas_lr, - weight_decay=self.config.nas_weight_decay, - ) - self.reg_loss = RegularizerLoss(config=self.config) - - self.criterion = criterion - self.epoch = 0 - - def _layer_choice_sample(self): - """ - Sample the index of network within layer choice - """ - stages = [stage_name for stage_name in self.lookup_table.layer_num] - stage_lnum = [self.lookup_table.layer_num[stage] for stage in stages] - - # get the choice idx in each layer - choice_ids = list() - layer_id = 0 - for param in self.mutator.get_architecture_parameters(): - param_np = param.cpu().detach().numpy() - op_idx = np.argmax(param_np) - choice_ids.append(op_idx) - self.logger.info( - "layer {}: {}, index: {}".format(layer_id, param_np, op_idx) - ) - layer_id += 1 - - # get the arch_sample - choice_names = list() - layer_id = 0 - for i, stage_name in enumerate(stages): - ops_names = [op for op in self.lookup_table.lut_ops[stage_name]] - for _ in range(stage_lnum[i]): - searched_op = ops_names[choice_ids[layer_id]] - 
choice_names.append(searched_op) - layer_id += 1 - - self.logger.info(choice_names) - return choice_names - - def _get_perf_cost(self, requires_grad=True): - """ - Get the accumulated performance cost. - """ - perf_cost = Variable( - torch.zeros(1), requires_grad=requires_grad - ).to(self.device, non_blocking=True) - - for latency in self.mutator.get_weighted_latency(): - perf_cost = perf_cost + latency - - return perf_cost - - def _validate(self): - """ - Do validation. During validation, LayerChoices use the mixed-op. - - Returns - ------- - float, float, float - average loss, average top1 accuracy, average top5 accuracy - """ - self.valid_loader.batch_sampler.drop_last = False - batch_time = AverageMeter("batch_time") - losses = AverageMeter("losses") - top1 = AverageMeter("top1") - top5 = AverageMeter("top5") - - # test on validation set under eval mode - self.model.eval() - - end = time.time() - with torch.no_grad(): - for i, (images, labels) in enumerate(self.valid_loader): - images = images.to(self.device, non_blocking=True) - labels = labels.to(self.device, non_blocking=True) - - output = self.model(images) - - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == len(self.valid_loader): - test_log = ( - "Valid" + ": [{0}/{1}]\t" - "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" - "Loss {loss.val:.4f} ({loss.avg:.4f})\t" - "Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t" - "Top-5 acc {top5.val:.3f} ({top5.avg:.3f})".format( - i, - len(self.valid_loader) - 1, - batch_time=batch_time, - loss=losses, - top1=top1, - top5=top5, - ) - ) - self.logger.info(test_log) - - return losses.avg, top1.avg, top5.avg - - def _train_epoch(self, epoch, optimizer, arch_train=False): - """ - Train one epoch. 
- """ - batch_time = AverageMeter("batch_time") - data_time = AverageMeter("data_time") - losses = AverageMeter("losses") - top1 = AverageMeter("top1") - top5 = AverageMeter("top5") - - # switch to train mode - self.model.train() - - data_loader = self.valid_loader if arch_train else self.train_loader - end = time.time() - for i, (images, labels) in enumerate(data_loader): - data_time.update(time.time() - end) - images = images.to(self.device, non_blocking=True) - labels = labels.to(self.device, non_blocking=True) - - output = self.model(images) - loss = self.criterion(output, labels) - - # hardware-aware loss - perf_cost = self._get_perf_cost(requires_grad=True) - regu_loss = self.reg_loss(perf_cost) - if self.mode.startswith("mul"): - loss = loss * regu_loss - elif self.mode.startswith("add"): - loss = loss + regu_loss - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0].item(), images.size(0)) - top5.update(acc5[0].item(), images.size(0)) - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0: - batch_log = ( - "Warmup Train [{0}][{1}]\t" - "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" - "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" - "Loss {losses.val:.4f} ({losses.avg:.4f})\t" - "Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t" - "Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\t".format( - epoch + 1, - i, - batch_time=batch_time, - data_time=data_time, - losses=losses, - top1=top1, - top5=top5, - ) - ) - self.logger.info(batch_log) - - def _warm_up(self): - """ - Warm up the model, while the architecture weights are not trained. - """ - for epoch in range(self.epoch, self.start_epoch): - self.logger.info("\n--------Warmup epoch: %d--------\n", epoch + 1) - self._train_epoch(epoch, self.model_optim) - # adjust learning rate - self.scheduler.step() - - # validation - val_loss, val_top1, val_top5 = self._validate() - val_log = ( - "Warmup Valid [{0}/{1}]\t" - "loss {2:.3f}\ttop-1 acc {3:.3f}\ttop-5 acc {4:.3f}".format( - epoch + 1, self.warmup_epochs, val_loss, val_top1, val_top5 - ) - ) - self.logger.info(val_log) - - if epoch % 10 == 0: - filename = os.path.join( - self.config.model_dir, "checkpoint_%s.pth" % epoch - ) - self.save_checkpoint(epoch, filename) - - def _train(self): - """ - Train the model, it trains model weights and architecute weights. - Architecture weights are trained according to the schedule. - Before updating architecture weights, ```requires_grad``` is enabled. - Then, it is disabled after the updating, in order not to update - architecture weights when training model weights. 
- """ - arch_param_num = self.mutator.num_arch_params() - self.logger.info("#arch_params: {}".format(arch_param_num)) - self.epoch = max(self.start_epoch, self.epoch) - - ckpt_path = self.config.model_dir - choice_names = None - top1_best = 0.0 - - for epoch in range(self.epoch, self.n_epochs): - self.logger.info("\n--------Train epoch: %d--------\n", epoch + 1) - # update the weight parameters - self._train_epoch(epoch, self.model_optim) - # adjust learning rate - self.scheduler.step() - - self.logger.info("Update architecture parameters") - # update the architecture parameters - self.mutator.arch_requires_grad() - self._train_epoch(epoch, self.arch_optimizer, True) - self.mutator.arch_disable_grad() - # temperature annealing - self.temp = self.temp * self.exp_anneal_rate - self.mutator.set_temperature(self.temp) - # sample the architecture of sub-network - choice_names = self._layer_choice_sample() - - # validate - val_loss, val_top1, val_top5 = self._validate() - val_log = ( - "Valid [{0}]\t" - "loss {1:.3f}\ttop-1 acc {2:.3f} \ttop-5 acc {3:.3f}".format( - epoch + 1, val_loss, val_top1, val_top5 - ) - ) - self.logger.info(val_log) - - if epoch % 10 == 0: - filename = os.path.join(ckpt_path, "checkpoint_%s.pth" % epoch) - self.save_checkpoint(epoch, filename, choice_names) - - val_top1 = val_top1.cpu().as_numpy() - if val_top1 > top1_best: - filename = os.path.join(ckpt_path, "checkpoint_best.pth") - self.save_checkpoint(epoch, filename, choice_names) - top1_best = val_top1 - - def save_checkpoint(self, epoch, filename, choice_names=None): - """ - Save checkpoint of the whole model. - Saving model weights and architecture weights as ```filename```, - and saving currently chosen architecture in ```arch_path```. - """ - state = { - "model": self.model.state_dict(), - "optim": self.model_optim.state_dict(), - "epoch": epoch, - "arch_sample": choice_names, - } - torch.save(state, filename) - self.logger.info("Save checkpoint to {0:}".format(filename)) - - if self.arch_path: - self.export(self.arch_path) - - def load_checkpoint(self, filename): - """ - Load the checkpoint from ```ckpt_path```. - """ - ckpt = torch.load(filename) - self.epoch = ckpt["epoch"] - self.model.load_state_dict(ckpt["model"]) - self.model_optim.load_state_dict(ckpt["optim"]) - - def train(self): - """ - Train the whole model. - """ - if self.load_ckpt: - ckpt_path = self.config.model_dir - filename = os.path.join(ckpt_path, "checkpoint_best.pth") - if os.path.exists(filename): - self.load_checkpoint(filename) - - if self.epoch < self.start_epoch: - self._warm_up() - self._train() - - def export(self, file_name): - """ - Export the chosen architecture into a file - - Parameters - ---------- - file_name : str - the file that stores exported chosen architecture - """ - exported_arch = self.mutator.sample_final() - with open(file_name, "w") as f: - json.dump( - exported_arch, - f, - indent=2, - sort_keys=True, - cls=TorchTensorEncoder, - ) - - def validate(self): - raise NotImplementedError - - def checkpoint(self): - raise NotImplementedError diff --git a/nni/algorithms/nas/pytorch/fbnet/utils.py b/nni/algorithms/nas/pytorch/fbnet/utils.py deleted file mode 100644 index 77e71746b..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/utils.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
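The deleted `_train_epoch`/`_train` methods combine the task loss with a latency regularizer, multiplicatively for mode ``mul`` and additively for ``add``, and anneal the Gumbel-Softmax temperature by `exp_anneal_rate` after every search epoch. A rough sketch of just that arithmetic, assuming the default `alpha=0.25`, `beta=0.6` and `init_temperature=5.0` from `NASConfig` below and made-up loss/latency values (the batch-size normalization defaults to 1 and is omitted):

import math
import torch

alpha, beta = 0.25, 0.6
task_loss = torch.tensor(1.3)        # e.g. cross-entropy on the current batch
perf_cost = torch.tensor(12.5)       # accumulated expected latency or FLOPs

regu_mul = alpha * torch.log(perf_cost) ** beta   # mode "mul": total = task_loss * regu_mul
regu_add = alpha * perf_cost ** beta              # mode "add": total = task_loss + regu_add
total_mul = task_loss * regu_mul
total_add = task_loss + regu_add

# Temperature annealing applied once per search epoch in _train:
temperature = 5.0                    # init_temperature
exp_anneal_rate = math.exp(-0.045)   # NASConfig default
temperature *= exp_anneal_rate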
- -from __future__ import absolute_import, division, print_function - -import ast -import os -import timeit -import torch - -import numpy as np -import torch.nn as nn - -from nni.compression.pytorch.utils import count_flops_params - -LUT_FILE = "lut.npy" -LUT_JSON_FILE = "lut.txt" -LUT_PATH = "lut" - -DATA_TYPE = "float" - -class NASConfig: - def __init__( - self, - perf_metric="flops", - lut_load=False, - lut_load_format="json", - model_dir=None, - nas_lr=0.01, - nas_weight_decay=5e-4, - mode="mul", - alpha=0.25, - beta=0.6, - start_epoch=50, - init_temperature=5.0, - exp_anneal_rate=np.exp(-0.045), - search_space=None, - ): - # LUT of performance metric - # flops means the multiplies, latency means the time cost on platform - self.perf_metric = perf_metric - assert perf_metric in [ - "flops", - "latency", - ], "perf_metric should be ['flops', 'latency']" - # wether load or create lut file - self.lut_load = lut_load - - assert lut_load_format in [ - "json", - "numpy", - ], "lut_load_format should be ['json', 'numpy']" - self.lut_load_format = lut_load_format - - # necessary dirs - self.lut_en = model_dir is not None - if self.lut_en: - self.model_dir = model_dir - os.makedirs(model_dir, exist_ok=True) - self.lut_path = os.path.join(model_dir, LUT_PATH) - os.makedirs(self.lut_path, exist_ok=True) - # NAS learning setting - self.nas_lr = nas_lr - self.nas_weight_decay = nas_weight_decay - # hardware-aware loss setting - self.mode = mode - assert mode in ["mul", "add"], "mode should be ['mul', 'add']" - self.alpha = alpha - self.beta = beta - # NAS training setting - self.start_epoch = start_epoch - self.init_temperature = init_temperature - self.exp_anneal_rate = exp_anneal_rate - # definition of search blocks and space - self.search_space = search_space - - -class RegularizerLoss(nn.Module): - """Auxilliary loss for hardware-aware NAS.""" - - def __init__(self, config): - """ - Parameters - ---------- - config : class - to manage the configuration for NAS training, and search space etc. - """ - super(RegularizerLoss, self).__init__() - self.mode = config.mode - self.alpha = config.alpha - self.beta = config.beta - - def forward(self, perf_cost, batch_size=1): - """ - Parameters - ---------- - perf_cost : tensor - the accumulated performance cost - batch_size : int - batch size for normalization - - Returns - ------- - output: tensor - the hardware-aware constraint loss - """ - if self.mode == "mul": - log_loss = torch.log(perf_cost / batch_size) ** self.beta - return self.alpha * log_loss - elif self.mode == "add": - linear_loss = (perf_cost / batch_size) ** self.beta - return self.alpha * linear_loss - else: - raise NotImplementedError - - -def accuracy(output, target, topk=(1,)): - """ - Computes the precision@k for the specified values of k - - Parameters - ---------- - output : pytorch tensor - output, e.g., predicted value - target : pytorch tensor - label - topk : tuple - specify top1 and top5 - - Returns - ------- - list - accuracy of top1 and top5 - """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def supernet_sample(model, state_dict, sampled_arch=[], lookup_table=None): - """ - Initialize the searched sub-model from supernet. 
- - Parameters - ---------- - model : pytorch model - the created subnet - state_dict : checkpoint - the checkpoint of supernet, including the pre-trained params - sampled_arch : list of str - the searched layer names of the subnet - lookup_table : class - to manage the candidate ops, layer information and layer performance - """ - replace = list() - stages = [stage for stage in lookup_table.layer_num] - stage_lnum = [lookup_table.layer_num[stage] for stage in stages] - - if sampled_arch: - layer_id = 0 - for i, stage in enumerate(stages): - ops_names = [op_name for op_name in lookup_table.lut_ops[stage]] - for _ in range(stage_lnum[i]): - searched_op = sampled_arch[layer_id] - op_i = ops_names.index(searched_op) - replace.append( - [ - "blocks.{}.".format(layer_id), - "blocks.{}.op.".format(layer_id), - "blocks.{}.{}.".format(layer_id, op_i), - ] - ) - layer_id += 1 - model_init(model, state_dict, replace=replace) - - -def model_init(model, state_dict, replace=[]): - """Initialize the model from state_dict.""" - prefix = "module." - param_dict = dict() - for k, v in state_dict.items(): - if k.startswith(prefix): - k = k[7:] - param_dict[k] = v - - for k, (name, m) in enumerate(model.named_modules()): - if replace: - for layer_replace in replace: - assert len(layer_replace) == 3, "The elements should be three." - pre_scope, key, replace_key = layer_replace - if pre_scope in name: - name = name.replace(key, replace_key) - - # Copy the state_dict to current model - if (name + ".weight" in param_dict) or ( - name + ".running_mean" in param_dict - ): - if isinstance(m, nn.BatchNorm2d): - shape = m.running_mean.shape - if shape == param_dict[name + ".running_mean"].shape: - if m.weight is not None: - m.weight.data = param_dict[name + ".weight"] - m.bias.data = param_dict[name + ".bias"] - m.running_mean = param_dict[name + ".running_mean"] - m.running_var = param_dict[name + ".running_var"] - - elif isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - shape = m.weight.data.shape - if shape == param_dict[name + ".weight"].shape: - m.weight.data = param_dict[name + ".weight"] - if m.bias is not None: - m.bias.data = param_dict[name + ".bias"] - - elif isinstance(m, nn.ConvTranspose2d): - m.weight.data = param_dict[name + ".weight"] - if m.bias is not None: - m.bias.data = param_dict[name + ".bias"] - - -class LookUpTable: - """Build look-up table for NAS.""" - - def __init__(self, config, primitives): - """ - Parameters - ---------- - config : class - to manage the configuration for NAS training, and search space etc. 
- """ - self.config = config - # definition of search blocks and space - self.search_space = config.search_space - # layers for NAS - self.cnt_layers = len(self.search_space["input_shape"]) - # constructors for each operation - self.lut_ops = { - stage_name: { - op_name: primitives[op_name] - for op_name in self.search_space["stages"][stage_name]["ops"] - } - for stage_name in self.search_space["stages"] - } - self.layer_num = { - stage_name: self.search_space["stages"][stage_name]["layer_num"] - for stage_name in self.search_space["stages"] - } - - # arguments for the ops constructors, input_shapes just for convinience - self.layer_configs, self.layer_in_shapes = self._layer_configs() - - # lookup_table - self.perf_metric = config.perf_metric - - if config.lut_en: - self.lut_perf = None - self.lut_file = os.path.join(config.lut_path, LUT_FILE) - self.lut_json_file = LUT_JSON_FILE - if config.lut_load: - if config.lut_load_format == "numpy": - # Load data from numpy file - self._load_from_file() - else: - # Load data from json file - self._load_from_json_file() - else: - self._create_perfs() - - def _layer_configs(self): - """Generate basic params for different layers.""" - # layer_configs are : c_in, c_out, stride, fm_size - layer_configs = [ - [ - self.search_space["input_shape"][layer_id][0], - self.search_space["channel_size"][layer_id], - self.search_space["strides"][layer_id], - self.search_space["fm_size"][layer_id], - ] - for layer_id in range(self.cnt_layers) - ] - - # layer_in_shapes are (C_in, input_w, input_h) - layer_in_shapes = self.search_space["input_shape"] - - return layer_configs, layer_in_shapes - - def _create_perfs(self, cnt_of_runs=200): - """Create performance cost for each op.""" - if self.perf_metric == "latency": - self.lut_perf = self._calculate_latency(cnt_of_runs) - elif self.perf_metric == "flops": - self.lut_perf = self._calculate_flops() - - self._write_lut_to_file() - - def _calculate_flops(self, eps=0.001): - """FLOPs cost.""" - flops_lut = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - key_params = {"fm_size": layer_config[3]} - op = stage_ops[op_name](*layer_config[0:3], **key_params) - - # measured in Flops - in_shape = self.layer_in_shapes[layer_id] - x = (1, in_shape[0], in_shape[1], in_shape[2]) - flops, _, _ = count_flops_params(op, x, verbose=False) - flops = eps if flops == 0.0 else flops - flops_lut[layer_id][op_name] = float(flops) - layer_id += 1 - - return flops_lut - - def _calculate_latency(self, cnt_of_runs): - """Latency cost.""" - LATENCY_BATCH_SIZE = 1 - latency_lut = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - key_params = {"fm_size": layer_config[3]} - op = stage_ops[op_name](*layer_config[0:3], **key_params) - input_data = torch.randn( - (LATENCY_BATCH_SIZE, *self.layer_in_shapes[layer_id]) - ) - globals()["op"], globals()["input_data"] = op, input_data - total_time = timeit.timeit( - "output = op(input_data)", - setup="gc.enable()", - globals=globals(), - number=cnt_of_runs, - ) - # measured in micro-second - latency_lut[layer_id][op_name] = ( - total_time / cnt_of_runs / LATENCY_BATCH_SIZE 
* 1e6 - ) - layer_id += 1 - - return latency_lut - - def _write_lut_to_file(self): - """Save lut as numpy file.""" - np.save(self.lut_file, self.lut_perf) - - def _load_from_file(self): - """Load numpy file.""" - self.lut_perf = np.load(self.lut_file, allow_pickle=True) - - def _load_from_json_file(self): - """Load json file.""" - - """ - lut_json_file ('lut.txt') format: - {'op_name': operator_name, - 'op_data_shape': (input_w, input_h, C_in, C_out, stride), - 'op_dtype': data_type, - 'op_latency': latency} - {...} - {...} - """ - latency_file = open(self.lut_json_file, "r") - ops_latency = latency_file.readlines() - - """ops_lut: {'op_name': {'op_data_shape': {'op_dtype': latency}}}""" - ops_lut = {} - - for op_latency in ops_latency: - assert isinstance(op_latency, str) or isinstance(op_latency, dict) - - if isinstance(op_latency, str): - record = ast.literal_eval(op_latency) - elif isinstance(op_latency, dict): - record = op_latency - - op_name = record["op_name"] - """op_data_shape: (input_w, input_h, C_in, C_out, stride)""" - op_data_shape = record["op_data_shape"] - op_dtype = record["op_dtype"] - op_latency = record["op_latency"] - - if op_name not in ops_lut: - ops_lut[op_name] = {} - - if op_data_shape not in ops_lut[op_name]: - ops_lut[op_name][op_data_shape] = {} - - ops_lut[op_name][op_data_shape][op_dtype] = op_latency - - self.lut_perf = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - layer_in_shape = self.layer_in_shapes[layer_id] - - input_w = layer_in_shape[1] - input_h = layer_in_shape[2] - c_in = layer_config[0] - c_out = layer_config[1] - stride = layer_config[2] - op_data_shape = (input_w, input_h, c_in, c_out, stride) - - if op_name in ops_lut and op_data_shape in ops_lut[op_name]: - self.lut_perf[layer_id][op_name] = \ - ops_lut[op_name][op_data_shape][DATA_TYPE] - - layer_id += 1 diff --git a/nni/algorithms/nas/pytorch/pdarts/__init__.py b/nni/algorithms/nas/pytorch/pdarts/__init__.py deleted file mode 100644 index d1d17764b..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .trainer import PdartsTrainer diff --git a/nni/algorithms/nas/pytorch/pdarts/mutator.py b/nni/algorithms/nas/pytorch/pdarts/mutator.py deleted file mode 100644 index 09ad51c5e..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/mutator.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import copy - -import numpy as np -import torch -from torch import nn - -from nni.algorithms.nas.pytorch.darts import DartsMutator -from nni.nas.pytorch.mutables import LayerChoice - - -class PdartsMutator(DartsMutator): - """ - It works with PdartsTrainer to calculate ops weights, - and drop weights in different PDARTS epochs. - """ - - def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): - self.pdarts_epoch_index = pdarts_epoch_index - self.pdarts_num_to_drop = pdarts_num_to_drop - if switches is None: - self.switches = {} - else: - self.switches = switches - - super(PdartsMutator, self).__init__(model) - - # this loop go through mutables with different keys, - # it's mainly to update length of choices. 
- for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - - switches = self.switches.get(mutable.key, [True for j in range(len(mutable))]) - choices = self.choices[mutable.key] - - operations_count = np.sum(switches) - # +1 and -1 are caused by zero operation in darts network - # the zero operation is not in choices list in network, but its weight are in, - # so it needs one more weights and switch for zero. - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(operations_count + 1)) - self.switches[mutable.key] = switches - - # update LayerChoice instances in model, - # it's physically remove dropped choices operations. - for module in self.model.modules(): - if isinstance(module, LayerChoice): - switches = self.switches.get(module.key) - choices = self.choices[module.key] - if len(module) > len(choices): - # from last to first, so that it won't effect previous indexes after removed one. - for index in range(len(switches)-1, -1, -1): - if switches[index] == False: - del module[index] - assert len(module) <= len(choices), "Failed to remove dropped choices." - - def export(self): - # Cannot rely on super().export() because P-DARTS has deleted some of the choices and has misaligned length. - results = super().sample_final() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - # As some operations are dropped physically, - # so it needs to fill back false to track dropped operations. - trained_result = results[mutable.key] - trained_index = 0 - switches = self.switches[mutable.key] - result = torch.Tensor(switches).bool() - for index in range(len(result)): - if result[index]: - result[index] = trained_result[trained_index] - trained_index += 1 - results[mutable.key] = result - return results - - def drop_paths(self): - """ - This method is called when a PDARTS epoch is finished. - It prepares switches for next epoch. - candidate operations with False switch will be doppped in next epoch. - """ - all_switches = copy.deepcopy(self.switches) - for key in all_switches: - switches = all_switches[key] - idxs = [] - for j in range(len(switches)): - if switches[j]: - idxs.append(j) - sorted_weights = self.choices[key].data.cpu().numpy()[:-1] - drop = np.argsort(sorted_weights)[:self.pdarts_num_to_drop[self.pdarts_epoch_index]] - for idx in drop: - switches[idxs[idx]] = False - return all_switches diff --git a/nni/algorithms/nas/pytorch/pdarts/trainer.py b/nni/algorithms/nas/pytorch/pdarts/trainer.py deleted file mode 100644 index 7f23a6e22..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/trainer.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging - -from nni.nas.pytorch.callbacks import LRSchedulerCallback -from nni.algorithms.nas.pytorch.darts import DartsTrainer -from nni.nas.pytorch.trainer import BaseTrainer, TorchTensorEncoder - -from .mutator import PdartsMutator - -logger = logging.getLogger(__name__) - - -class PdartsTrainer(BaseTrainer): - """ - This trainer implements the PDARTS algorithm. - PDARTS bases on DARTS algorithm, and provides a network growth approach to find deeper and better network. - This class relies on pdarts_num_layers and pdarts_num_to_drop parameters to control how network grows. - pdarts_num_layers means how many layers more than first epoch. - pdarts_num_to_drop means how many candidate operations should be dropped in each epoch. - So that the grew network can in similar size. 
- """ - - def __init__(self, model_creator, init_layers, metrics, - num_epochs, dataset_train, dataset_valid, - pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 1], - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None, unrolled=False): - super(PdartsTrainer, self).__init__() - self.model_creator = model_creator - self.init_layers = init_layers - self.pdarts_num_layers = pdarts_num_layers - self.pdarts_num_to_drop = pdarts_num_to_drop - self.pdarts_epoch = len(pdarts_num_to_drop) - self.darts_parameters = { - "metrics": metrics, - "num_epochs": num_epochs, - "dataset_train": dataset_train, - "dataset_valid": dataset_valid, - "batch_size": batch_size, - "workers": workers, - "device": device, - "log_frequency": log_frequency, - "unrolled": unrolled - } - self.callbacks = callbacks if callbacks is not None else [] - - def train(self): - - switches = None - for epoch in range(self.pdarts_epoch): - - layers = self.init_layers+self.pdarts_num_layers[epoch] - model, criterion, optim, lr_scheduler = self.model_creator(layers) - self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) - - for callback in self.callbacks: - callback.build(model, self.mutator, self) - callback.on_epoch_begin(epoch) - - darts_callbacks = [] - if lr_scheduler is not None: - darts_callbacks.append(LRSchedulerCallback(lr_scheduler)) - - self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, - callbacks=darts_callbacks, **self.darts_parameters) - logger.info("start pdarts training epoch %s...", epoch) - - self.trainer.train() - - switches = self.mutator.drop_paths() - - for callback in self.callbacks: - callback.on_epoch_end(epoch) - - def validate(self): - self.trainer.validate() - - def export(self, file): - mutator_export = self.mutator.export() - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def checkpoint(self): - raise NotImplementedError("Not implemented yet") diff --git a/nni/algorithms/nas/pytorch/proxylessnas/__init__.py b/nni/algorithms/nas/pytorch/proxylessnas/__init__.py deleted file mode 100644 index 3188fbf45..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import ProxylessNasMutator -from .trainer import ProxylessNasTrainer diff --git a/nni/algorithms/nas/pytorch/proxylessnas/mutator.py b/nni/algorithms/nas/pytorch/proxylessnas/mutator.py deleted file mode 100644 index 881a6b440..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/mutator.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
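The `drop_paths` method of the PdartsMutator above discards, per LayerChoice, the candidate ops with the smallest architecture weights before the next, deeper P-DARTS stage; the trailing weight belongs to the zero op and is never dropped. A small NumPy sketch of that selection step, with made-up weights:

import numpy as np

switches = [True] * 8                 # 8 candidate ops still alive for one LayerChoice
arch_weights = np.random.randn(9)     # 8 op weights plus a trailing weight for the zero op
num_to_drop = 3                       # first P-DARTS stage drops 3 ops per choice

alive = [i for i, s in enumerate(switches) if s]
weights_alive = arch_weights[:-1]     # exclude the zero-op weight from dropping
drop = np.argsort(weights_alive)[:num_to_drop]
for idx in drop:
    switches[alive[idx]] = False      # dropped ops are removed before the next stage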
- -import math -import torch -from torch import nn as nn -from torch.nn import functional as F -import numpy as np - -from nni.nas.pytorch.base_mutator import BaseMutator -from nni.nas.pytorch.mutables import LayerChoice -from .utils import detach_variable - -class ArchGradientFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, x, binary_gates, run_func, backward_func): - ctx.run_func = run_func - ctx.backward_func = backward_func - - detached_x = detach_variable(x) - with torch.enable_grad(): - output = run_func(detached_x) - ctx.save_for_backward(detached_x, output) - return output.data - - @staticmethod - def backward(ctx, grad_output): - detached_x, output = ctx.saved_tensors - - grad_x = torch.autograd.grad(output, detached_x, grad_output, only_inputs=True) - # compute gradients w.r.t. binary_gates - binary_grads = ctx.backward_func(detached_x.data, output.data, grad_output.data) - - return grad_x[0], binary_grads, None, None - -class MixedOp(nn.Module): - """ - This class is to instantiate and manage info of one LayerChoice. - It includes architecture weights, binary weights, and member functions - operating the weights. - - forward_mode: - forward/backward mode for LayerChoice: None, two, full, and full_v2. - For training architecture weights, we use full_v2 by default, and for training - model weights, we use None. - """ - forward_mode = None - def __init__(self, mutable): - """ - Parameters - ---------- - mutable : LayerChoice - A LayerChoice in user model - """ - super(MixedOp, self).__init__() - self.ap_path_alpha = nn.Parameter(torch.Tensor(len(mutable))) - self.ap_path_wb = nn.Parameter(torch.Tensor(len(mutable))) - self.ap_path_alpha.requires_grad = False - self.ap_path_wb.requires_grad = False - self.active_index = [0] - self.inactive_index = None - self.log_prob = None - self.current_prob_over_ops = None - self.n_choices = len(mutable) - - def get_ap_path_alpha(self): - return self.ap_path_alpha - - def to_requires_grad(self): - self.ap_path_alpha.requires_grad = True - self.ap_path_wb.requires_grad = True - - def to_disable_grad(self): - self.ap_path_alpha.requires_grad = False - self.ap_path_wb.requires_grad = False - - def forward(self, mutable, x): - """ - Define forward of LayerChoice. For 'full_v2', backward is also defined. - The 'two' mode is explained in section 3.2.1 in the paper. - The 'full_v2' mode is explained in Appendix D in the paper. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - x : tensor - inputs of this layer, only support one input - - Returns - ------- - output: tensor - output of this layer - """ - if MixedOp.forward_mode == 'full' or MixedOp.forward_mode == 'two': - output = 0 - for _i in self.active_index: - oi = self.candidate_ops[_i](x) - output = output + self.ap_path_wb[_i] * oi - for _i in self.inactive_index: - oi = self.candidate_ops[_i](x) - output = output + self.ap_path_wb[_i] * oi.detach() - elif MixedOp.forward_mode == 'full_v2': - def run_function(key, candidate_ops, active_id): - def forward(_x): - return candidate_ops[active_id](_x) - return forward - - def backward_function(key, candidate_ops, active_id, binary_gates): - def backward(_x, _output, grad_output): - binary_grads = torch.zeros_like(binary_gates.data) - with torch.no_grad(): - for k in range(len(candidate_ops)): - if k != active_id: - out_k = candidate_ops[k](_x.data) - else: - out_k = _output.data - grad_k = torch.sum(out_k * grad_output) - binary_grads[k] = grad_k - return binary_grads - return backward - output = ArchGradientFunction.apply( - x, self.ap_path_wb, run_function(mutable.key, list(mutable), self.active_index[0]), - backward_function(mutable.key, list(mutable), self.active_index[0], self.ap_path_wb)) - else: - output = self.active_op(mutable)(x) - return output - - @property - def probs_over_ops(self): - """ - Apply softmax on alpha to generate probability distribution - - Returns - ------- - pytorch tensor - probability distribution - """ - probs = F.softmax(self.ap_path_alpha, dim=0) # softmax to probability - return probs - - @property - def chosen_index(self): - """ - choose the op with max prob - - Returns - ------- - int - index of the chosen one - numpy.float32 - prob of the chosen one - """ - probs = self.probs_over_ops.data.cpu().numpy() - index = int(np.argmax(probs)) - return index, probs[index] - - def active_op(self, mutable): - """ - assume only one path is active - - Returns - ------- - PyTorch module - the chosen operation - """ - return mutable[self.active_index[0]] - - @property - def active_op_index(self): - """ - return active op's index, the active op is sampled - - Returns - ------- - int - index of the active op - """ - return self.active_index[0] - - def set_chosen_op_active(self): - """ - set chosen index, active and inactive indexes - """ - chosen_idx, _ = self.chosen_index - self.active_index = [chosen_idx] - self.inactive_index = [_i for _i in range(0, chosen_idx)] + \ - [_i for _i in range(chosen_idx + 1, self.n_choices)] - - def binarize(self, mutable): - """ - Sample based on alpha, and set binary weights accordingly. - ap_path_wb is set in this function, which is called binarize. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - """ - self.log_prob = None - # reset binary gates - self.ap_path_wb.data.zero_() - probs = self.probs_over_ops - if MixedOp.forward_mode == 'two': - # sample two ops according to probs - sample_op = torch.multinomial(probs.data, 2, replacement=False) - probs_slice = F.softmax(torch.stack([ - self.ap_path_alpha[idx] for idx in sample_op - ]), dim=0) - self.current_prob_over_ops = torch.zeros_like(probs) - for i, idx in enumerate(sample_op): - self.current_prob_over_ops[idx] = probs_slice[i] - # choose one to be active and the other to be inactive according to probs_slice - c = torch.multinomial(probs_slice.data, 1)[0] # 0 or 1 - active_op = sample_op[c].item() - inactive_op = sample_op[1-c].item() - self.active_index = [active_op] - self.inactive_index = [inactive_op] - # set binary gate - self.ap_path_wb.data[active_op] = 1.0 - else: - sample = torch.multinomial(probs, 1)[0].item() - self.active_index = [sample] - self.inactive_index = [_i for _i in range(0, sample)] + \ - [_i for _i in range(sample + 1, len(mutable))] - self.log_prob = torch.log(probs[sample]) - self.current_prob_over_ops = probs - self.ap_path_wb.data[sample] = 1.0 - # avoid over-regularization - for choice in mutable: - for _, param in choice.named_parameters(): - param.grad = None - - @staticmethod - def delta_ij(i, j): - if i == j: - return 1 - else: - return 0 - - def set_arch_param_grad(self, mutable): - """ - Calculate alpha gradient for this LayerChoice. - It is calculated using gradient of binary gate, probs of ops. - """ - binary_grads = self.ap_path_wb.grad.data - if self.active_op(mutable).is_zero_layer(): - self.ap_path_alpha.grad = None - return - if self.ap_path_alpha.grad is None: - self.ap_path_alpha.grad = torch.zeros_like(self.ap_path_alpha.data) - if MixedOp.forward_mode == 'two': - involved_idx = self.active_index + self.inactive_index - probs_slice = F.softmax(torch.stack([ - self.ap_path_alpha[idx] for idx in involved_idx - ]), dim=0).data - for i in range(2): - for j in range(2): - origin_i = involved_idx[i] - origin_j = involved_idx[j] - self.ap_path_alpha.grad.data[origin_i] += \ - binary_grads[origin_j] * probs_slice[j] * (MixedOp.delta_ij(i, j) - probs_slice[i]) - for _i, idx in enumerate(self.active_index): - self.active_index[_i] = (idx, self.ap_path_alpha.data[idx].item()) - for _i, idx in enumerate(self.inactive_index): - self.inactive_index[_i] = (idx, self.ap_path_alpha.data[idx].item()) - else: - probs = self.probs_over_ops.data - for i in range(self.n_choices): - for j in range(self.n_choices): - self.ap_path_alpha.grad.data[i] += binary_grads[j] * probs[j] * (MixedOp.delta_ij(i, j) - probs[i]) - return - - def rescale_updated_arch_param(self): - """ - rescale architecture weights for the 'two' mode. - """ - if not isinstance(self.active_index[0], tuple): - assert self.active_op.is_zero_layer() - return - involved_idx = [idx for idx, _ in (self.active_index + self.inactive_index)] - old_alphas = [alpha for _, alpha in (self.active_index + self.inactive_index)] - new_alphas = [self.ap_path_alpha.data[idx] for idx in involved_idx] - - offset = math.log( - sum([math.exp(alpha) for alpha in new_alphas]) / sum([math.exp(alpha) for alpha in old_alphas]) - ) - - for idx in involved_idx: - self.ap_path_alpha.data[idx] -= offset - - -class ProxylessNasMutator(BaseMutator): - """ - This mutator initializes and operates all the LayerChoices of the input model. 
- It is for the corresponding trainer to control the training process of LayerChoices, - coordinating with whole training process. - """ - def __init__(self, model): - """ - Init a MixedOp instance for each mutable i.e., LayerChoice. - And register the instantiated MixedOp in corresponding LayerChoice. - If does not register it in LayerChoice, DataParallel does not work then, - because architecture weights are not included in the DataParallel model. - When MixedOPs are registered, we use ```requires_grad``` to control - whether calculate gradients of architecture weights. - - Parameters - ---------- - model : pytorch model - The model that users want to tune, it includes search space defined with nni nas apis - """ - super(ProxylessNasMutator, self).__init__(model) - self._unused_modules = None - self.mutable_list = [] - for mutable in self.undedup_mutables: - self.mutable_list.append(mutable) - mutable.registered_module = MixedOp(mutable) - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callback of layer choice forward. This function defines the forward - logic of the input mutable. So mutable is only interface, its real - implementation is defined in mutator. - - Parameters - ---------- - mutable: LayerChoice - forward logic of this input mutable - args: list of torch.Tensor - inputs of this mutable - kwargs: dict - inputs of this mutable - - Returns - ------- - torch.Tensor - output of this mutable, i.e., LayerChoice - int - index of the chosen op - """ - # FIXME: return mask, to be consistent with other algorithms - idx = mutable.registered_module.active_op_index - return mutable.registered_module(mutable, *args, **kwargs), idx - - def reset_binary_gates(self): - """ - For each LayerChoice, binarize binary weights - based on alpha to only activate one op. - It traverses all the mutables in the model to do this. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.binarize(mutable) - - def set_chosen_op_active(self): - """ - For each LayerChoice, set the op with highest alpha as the chosen op. - Usually used for validation. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_chosen_op_active() - - def num_arch_params(self): - """ - The number of mutables, i.e., LayerChoice - - Returns - ------- - int - the number of LayerChoice in user model - """ - return len(self.mutable_list) - - def set_arch_param_grad(self): - """ - For each LayerChoice, calculate gradients for architecture weights, i.e., alpha - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_arch_param_grad(mutable) - - def get_architecture_parameters(self): - """ - Get all the architecture parameters. - - yield - ----- - PyTorch Parameter - Return ap_path_alpha of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_ap_path_alpha() - - def change_forward_mode(self, mode): - """ - Update forward mode of MixedOps, as training architecture weights and - model weights use different forward modes. - """ - MixedOp.forward_mode = mode - - def get_forward_mode(self): - """ - Get forward mode of MixedOp - - Returns - ------- - string - the current forward mode of MixedOp - """ - return MixedOp.forward_mode - - def rescale_updated_arch_param(self): - """ - Rescale architecture weights in 'two' mode. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.rescale_updated_arch_param() - - def unused_modules_off(self): - """ - Remove unused modules for each mutables. 
- The removed modules are kept in ```self._unused_modules``` for resume later. - """ - self._unused_modules = [] - for mutable in self.undedup_mutables: - mixed_op = mutable.registered_module - unused = {} - if self.get_forward_mode() in ['full', 'two', 'full_v2']: - involved_index = mixed_op.active_index + mixed_op.inactive_index - else: - involved_index = mixed_op.active_index - for i in range(mixed_op.n_choices): - if i not in involved_index: - unused[i] = mutable[i] - mutable[i] = None - self._unused_modules.append(unused) - - def unused_modules_back(self): - """ - Resume the removed modules back. - """ - if self._unused_modules is None: - return - for m, unused in zip(self.mutable_list, self._unused_modules): - for i in unused: - m[i] = unused[i] - self._unused_modules = None - - def arch_requires_grad(self): - """ - Make architecture weights require gradient - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_requires_grad() - - def arch_disable_grad(self): - """ - Disable gradient of architecture weights, i.e., does not - calcuate gradient for them. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_disable_grad() - - def sample_final(self): - """ - Generate the final chosen architecture. - - Returns - ------- - dict - the choice of each mutable, i.e., LayerChoice - """ - result = dict() - for mutable in self.undedup_mutables: - assert isinstance(mutable, LayerChoice) - index, _ = mutable.registered_module.chosen_index - # pylint: disable=not-callable - result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=len(mutable)).view(-1).bool() - return result diff --git a/nni/algorithms/nas/pytorch/proxylessnas/trainer.py b/nni/algorithms/nas/pytorch/proxylessnas/trainer.py deleted file mode 100644 index d9c86a6a9..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/trainer.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
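[Editor's note] Before the trainer file below, here is a minimal sketch of how the ProxylessNasMutator hooks defined above are typically sequenced for one architecture-weight update. It mirrors the `_gradient_step` / `_train` flow of the trainer that follows; `model`, `mutator`, `arch_optimizer`, `criterion`, and the validation batch are stand-in names for illustration, not part of the original file.

def arch_update_step(model, mutator, arch_optimizer, criterion,
                     images, labels, binary_mode='full_v2'):
    """Illustrative only: one update of architecture weights with binarized gates."""
    model.train()
    mutator.change_forward_mode(binary_mode)   # select forward mode for the binary gates
    mutator.arch_requires_grad()               # enable gradients on the alpha parameters
    mutator.reset_binary_gates()               # sample active op(s) per LayerChoice
    mutator.unused_modules_off()               # drop un-sampled ops to save memory
    loss = criterion(model(images), labels)
    model.zero_grad()
    loss.backward()                            # gradients land on the binary gates (ap_path_wb)
    mutator.set_arch_param_grad()              # translate gate gradients into alpha gradients
    arch_optimizer.step()
    if mutator.get_forward_mode() == 'two':
        mutator.rescale_updated_arch_param()   # keep the two sampled alphas on a consistent scale
    mutator.unused_modules_back()              # restore the removed ops
    mutator.arch_disable_grad()                # freeze alpha again before training model weights
    mutator.change_forward_mode(None)
    return loss.item()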
- -import math -import time -import json -import logging - -import torch -from torch import nn as nn - -from nni.nas.pytorch.base_trainer import BaseTrainer -from nni.nas.pytorch.trainer import TorchTensorEncoder -from nni.nas.pytorch.utils import AverageMeter -from .mutator import ProxylessNasMutator -from .utils import cross_entropy_with_label_smoothing, accuracy - -logger = logging.getLogger(__name__) - -class ProxylessNasTrainer(BaseTrainer): - def __init__(self, model, model_optim, device, - train_loader, valid_loader, label_smoothing=0.1, - n_epochs=120, init_lr=0.025, binary_mode='full_v2', - arch_init_type='normal', arch_init_ratio=1e-3, - arch_optim_lr=1e-3, arch_weight_decay=0, - grad_update_arch_param_every=5, grad_update_steps=1, - warmup=True, warmup_epochs=25, - arch_valid_frequency=1, - load_ckpt=False, ckpt_path=None, arch_path=None): - """ - Parameters - ---------- - model : pytorch model - the user model, which has mutables - model_optim : pytorch optimizer - the user defined optimizer - device : pytorch device - the devices to train/search the model - train_loader : pytorch data loader - data loader for the training set - valid_loader : pytorch data loader - data loader for the validation set - label_smoothing : float - for label smoothing - n_epochs : int - number of epochs to train/search - init_lr : float - init learning rate for training the model - binary_mode : str - the forward/backward mode for the binary weights in mutator - arch_init_type : str - the way to init architecture parameters - arch_init_ratio : float - the ratio to init architecture parameters - arch_optim_lr : float - learning rate of the architecture parameters optimizer - arch_weight_decay : float - weight decay of the architecture parameters optimizer - grad_update_arch_param_every : int - update architecture weights every this number of minibatches - grad_update_steps : int - during each update of architecture weights, the number of steps to train - warmup : bool - whether to do warmup - warmup_epochs : int - the number of epochs to do during warmup - arch_valid_frequency : int - frequency of printing validation result - load_ckpt : bool - whether load checkpoint - ckpt_path : str - checkpoint path, if load_ckpt is True, ckpt_path cannot be None - arch_path : str - the path to store chosen architecture - """ - self.model = model - self.model_optim = model_optim - self.train_loader = train_loader - self.valid_loader = valid_loader - self.device = device - self.n_epochs = n_epochs - self.init_lr = init_lr - self.warmup = warmup - self.warmup_epochs = warmup_epochs - self.arch_valid_frequency = arch_valid_frequency - self.label_smoothing = label_smoothing - - self.train_batch_size = train_loader.batch_sampler.batch_size - self.valid_batch_size = valid_loader.batch_sampler.batch_size - # update architecture parameters every this number of minibatches - self.grad_update_arch_param_every = grad_update_arch_param_every - # the number of steps per architecture parameter update - self.grad_update_steps = grad_update_steps - self.binary_mode = binary_mode - - self.load_ckpt = load_ckpt - self.ckpt_path = ckpt_path - self.arch_path = arch_path - - # init mutator - self.mutator = ProxylessNasMutator(model) - - # DataParallel should be put behind the init of mutator - self.model = torch.nn.DataParallel(self.model) - self.model.to(self.device) - - # iter of valid dataset for training architecture weights - self._valid_iter = None - # init architecture weights - self._init_arch_params(arch_init_type, 
arch_init_ratio) - # build architecture optimizer - self.arch_optimizer = torch.optim.Adam(self.mutator.get_architecture_parameters(), - arch_optim_lr, - weight_decay=arch_weight_decay, - betas=(0, 0.999), - eps=1e-8) - - self.criterion = nn.CrossEntropyLoss() - self.warmup_curr_epoch = 0 - self.train_curr_epoch = 0 - - def _init_arch_params(self, init_type='normal', init_ratio=1e-3): - """ - Initialize architecture weights - """ - for param in self.mutator.get_architecture_parameters(): - if init_type == 'normal': - param.data.normal_(0, init_ratio) - elif init_type == 'uniform': - param.data.uniform_(-init_ratio, init_ratio) - else: - raise NotImplementedError - - def _validate(self): - """ - Do validation. During validation, LayerChoices use the chosen active op. - - Returns - ------- - float, float, float - average loss, average top1 accuracy, average top5 accuracy - """ - self.valid_loader.batch_sampler.batch_size = self.valid_batch_size - self.valid_loader.batch_sampler.drop_last = False - - self.mutator.set_chosen_op_active() - # remove unused modules to save memory - self.mutator.unused_modules_off() - # test on validation set under train mode - self.model.train() - batch_time = AverageMeter('batch_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - end = time.time() - with torch.no_grad(): - for i, (images, labels) in enumerate(self.valid_loader): - images, labels = images.to(self.device), labels.to(self.device) - output = self.model(images) - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == len(self.valid_loader): - test_log = 'Valid' + ': [{0}/{1}]\t'\ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\ - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'\ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})'.\ - format(i, len(self.valid_loader) - 1, batch_time=batch_time, loss=losses, top1=top1) - # return top5: - test_log += '\tTop-5 acc {top5.val:.3f} ({top5.avg:.3f})'.format(top5=top5) - logger.info(test_log) - self.mutator.unused_modules_back() - return losses.avg, top1.avg, top5.avg - - def _warm_up(self): - """ - Warm up the model, during warm up, architecture weights are not trained. 
- """ - lr_max = 0.05 - data_loader = self.train_loader - nBatch = len(data_loader) - T_total = self.warmup_epochs * nBatch # total num of batches - - for epoch in range(self.warmup_curr_epoch, self.warmup_epochs): - logger.info('\n--------Warmup epoch: %d--------\n', epoch + 1) - batch_time = AverageMeter('batch_time') - data_time = AverageMeter('data_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - # switch to train mode - self.model.train() - - end = time.time() - logger.info('warm_up epoch: %d', epoch) - for i, (images, labels) in enumerate(data_loader): - data_time.update(time.time() - end) - # lr - T_cur = epoch * nBatch + i - warmup_lr = 0.5 * lr_max * (1 + math.cos(math.pi * T_cur / T_total)) - for param_group in self.model_optim.param_groups: - param_group['lr'] = warmup_lr - images, labels = images.to(self.device), labels.to(self.device) - # compute output - self.mutator.reset_binary_gates() # random sample binary gates - self.mutator.unused_modules_off() # remove unused module for speedup - output = self.model(images) - if self.label_smoothing > 0: - loss = cross_entropy_with_label_smoothing(output, labels, self.label_smoothing) - else: - loss = self.criterion(output, labels) - # measure accuracy and record loss - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # compute gradient and do SGD step - self.model.zero_grad() - loss.backward() - self.model_optim.step() - # unused modules back - self.mutator.unused_modules_back() - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == nBatch: - batch_log = 'Warmup Train [{0}][{1}/{2}]\t' \ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \ - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' \ - 'Loss {losses.val:.4f} ({losses.avg:.4f})\t' \ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t' \ - 'Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\tlr {lr:.5f}'. \ - format(epoch + 1, i, nBatch - 1, batch_time=batch_time, data_time=data_time, - losses=losses, top1=top1, top5=top5, lr=warmup_lr) - logger.info(batch_log) - val_loss, val_top1, val_top5 = self._validate() - val_log = 'Warmup Valid [{0}/{1}]\tloss {2:.3f}\ttop-1 acc {3:.3f}\ttop-5 acc {4:.3f}\t' \ - 'Train top-1 {top1.avg:.3f}\ttop-5 {top5.avg:.3f}M'. \ - format(epoch + 1, self.warmup_epochs, val_loss, val_top1, val_top5, top1=top1, top5=top5) - logger.info(val_log) - self.save_checkpoint() - self.warmup_curr_epoch += 1 - - def _get_update_schedule(self, nBatch): - """ - Generate schedule for training architecture weights. Key means after which minibatch - to update architecture weights, value means how many steps for the update. - - Parameters - ---------- - nBatch : int - the total number of minibatches in one epoch - - Returns - ------- - dict - the schedule for updating architecture weights - """ - schedule = {} - for i in range(nBatch): - if (i + 1) % self.grad_update_arch_param_every == 0: - schedule[i] = self.grad_update_steps - return schedule - - def _calc_learning_rate(self, epoch, batch=0, nBatch=None): - """ - Update learning rate. 
- """ - T_total = self.n_epochs * nBatch - T_cur = epoch * nBatch + batch - lr = 0.5 * self.init_lr * (1 + math.cos(math.pi * T_cur / T_total)) - return lr - - def _adjust_learning_rate(self, optimizer, epoch, batch=0, nBatch=None): - """ - Adjust learning of a given optimizer and return the new learning rate - - Parameters - ---------- - optimizer : pytorch optimizer - the used optimizer - epoch : int - the current epoch number - batch : int - the current minibatch - nBatch : int - the total number of minibatches in one epoch - - Returns - ------- - float - the adjusted learning rate - """ - new_lr = self._calc_learning_rate(epoch, batch, nBatch) - for param_group in optimizer.param_groups: - param_group['lr'] = new_lr - return new_lr - - def _train(self): - """ - Train the model, it trains model weights and architecute weights. - Architecture weights are trained according to the schedule. - Before updating architecture weights, ```requires_grad``` is enabled. - Then, it is disabled after the updating, in order not to update - architecture weights when training model weights. - """ - nBatch = len(self.train_loader) - arch_param_num = self.mutator.num_arch_params() - binary_gates_num = self.mutator.num_arch_params() - logger.info('#arch_params: %d\t#binary_gates: %d', arch_param_num, binary_gates_num) - - update_schedule = self._get_update_schedule(nBatch) - - for epoch in range(self.train_curr_epoch, self.n_epochs): - logger.info('\n--------Train epoch: %d--------\n', epoch + 1) - batch_time = AverageMeter('batch_time') - data_time = AverageMeter('data_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - # switch to train mode - self.model.train() - - end = time.time() - for i, (images, labels) in enumerate(self.train_loader): - data_time.update(time.time() - end) - lr = self._adjust_learning_rate(self.model_optim, epoch, batch=i, nBatch=nBatch) - # train weight parameters - images, labels = images.to(self.device), labels.to(self.device) - self.mutator.reset_binary_gates() - self.mutator.unused_modules_off() - output = self.model(images) - if self.label_smoothing > 0: - loss = cross_entropy_with_label_smoothing(output, labels, self.label_smoothing) - else: - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - self.model.zero_grad() - loss.backward() - self.model_optim.step() - self.mutator.unused_modules_back() - if epoch > 0: - for _ in range(update_schedule.get(i, 0)): - start_time = time.time() - # GradientArchSearchConfig - self.mutator.arch_requires_grad() - arch_loss, exp_value = self._gradient_step() - self.mutator.arch_disable_grad() - used_time = time.time() - start_time - log_str = 'Architecture [%d-%d]\t Time %.4f\t Loss %.4f\t null %s' % \ - (epoch + 1, i, used_time, arch_loss, exp_value) - logger.info(log_str) - batch_time.update(time.time() - end) - end = time.time() - # training log - if i % 10 == 0 or i + 1 == nBatch: - batch_log = 'Train [{0}][{1}/{2}]\t' \ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \ - 'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t' \ - 'Loss {losses.val:.4f} ({losses.avg:.4f})\t' \ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t' \ - 'Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\tlr {lr:.5f}'. 
\ - format(epoch + 1, i, nBatch - 1, batch_time=batch_time, data_time=data_time, - losses=losses, top1=top1, top5=top5, lr=lr) - logger.info(batch_log) - # validate - if (epoch + 1) % self.arch_valid_frequency == 0: - val_loss, val_top1, val_top5 = self._validate() - val_log = 'Valid [{0}]\tloss {1:.3f}\ttop-1 acc {2:.3f} \ttop-5 acc {3:.3f}\t' \ - 'Train top-1 {top1.avg:.3f}\ttop-5 {top5.avg:.3f}'. \ - format(epoch + 1, val_loss, val_top1, val_top5, top1=top1, top5=top5) - logger.info(val_log) - self.save_checkpoint() - self.train_curr_epoch += 1 - - def _valid_next_batch(self): - """ - Get next one minibatch from validation set - - Returns - ------- - (tensor, tensor) - the tuple of images and labels - """ - if self._valid_iter is None: - self._valid_iter = iter(self.valid_loader) - try: - data = next(self._valid_iter) - except StopIteration: - self._valid_iter = iter(self.valid_loader) - data = next(self._valid_iter) - return data - - def _gradient_step(self): - """ - This gradient step is for updating architecture weights. - Mutator is intensively used in this function to operate on - architecture weights. - - Returns - ------- - float, None - loss of the model, None - """ - # use the same batch size as train batch size for architecture weights - self.valid_loader.batch_sampler.batch_size = self.train_batch_size - self.valid_loader.batch_sampler.drop_last = True - self.model.train() - self.mutator.change_forward_mode(self.binary_mode) - time1 = time.time() # time - # sample a batch of data from validation set - images, labels = self._valid_next_batch() - images, labels = images.to(self.device), labels.to(self.device) - time2 = time.time() # time - self.mutator.reset_binary_gates() - self.mutator.unused_modules_off() - output = self.model(images) - time3 = time.time() - ce_loss = self.criterion(output, labels) - expected_value = None - loss = ce_loss - self.model.zero_grad() - loss.backward() - self.mutator.set_arch_param_grad() - self.arch_optimizer.step() - if self.mutator.get_forward_mode() == 'two': - self.mutator.rescale_updated_arch_param() - self.mutator.unused_modules_back() - self.mutator.change_forward_mode(None) - time4 = time.time() - logger.info('(%.4f, %.4f, %.4f)', time2 - time1, time3 - time2, time4 - time3) - return loss.data.item(), expected_value.item() if expected_value is not None else None - - def save_checkpoint(self): - """ - Save checkpoint of the whole model. Saving model weights and architecture weights in - ```ckpt_path```, and saving currently chosen architecture in ```arch_path```. - """ - if self.ckpt_path: - state = { - 'warmup_curr_epoch': self.warmup_curr_epoch, - 'train_curr_epoch': self.train_curr_epoch, - 'model': self.model.state_dict(), - 'optim': self.model_optim.state_dict(), - 'arch_optim': self.arch_optimizer.state_dict() - } - torch.save(state, self.ckpt_path) - if self.arch_path: - self.export(self.arch_path) - - def load_checkpoint(self): - """ - Load the checkpoint from ```ckpt_path```. - """ - assert self.ckpt_path is not None, "If load_ckpt is not None, ckpt_path should not be None" - ckpt = torch.load(self.ckpt_path) - self.warmup_curr_epoch = ckpt['warmup_curr_epoch'] - self.train_curr_epoch = ckpt['train_curr_epoch'] - self.model.load_state_dict(ckpt['model']) - self.model_optim.load_state_dict(ckpt['optim']) - self.arch_optimizer.load_state_dict(ckpt['arch_optim']) - - def train(self): - """ - Train the whole model. 
- """ - if self.load_ckpt: - self.load_checkpoint() - if self.warmup: - self._warm_up() - self._train() - - def export(self, file_name): - """ - Export the chosen architecture into a file - - Parameters - ---------- - file_name : str - the file that stores exported chosen architecture - """ - exported_arch = self.mutator.sample_final() - with open(file_name, 'w') as f: - json.dump(exported_arch, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def validate(self): - raise NotImplementedError - - def checkpoint(self): - raise NotImplementedError diff --git a/nni/algorithms/nas/pytorch/proxylessnas/utils.py b/nni/algorithms/nas/pytorch/proxylessnas/utils.py deleted file mode 100644 index c532efc04..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - -def detach_variable(inputs): - """ - Detach variables - - Parameters - ---------- - inputs : pytorch tensors - pytorch tensors - """ - if isinstance(inputs, tuple): - return tuple([detach_variable(x) for x in inputs]) - else: - x = inputs.detach() - x.requires_grad = inputs.requires_grad - return x - -def cross_entropy_with_label_smoothing(pred, target, label_smoothing=0.1): - """ - Parameters - ---------- - pred : pytorch tensor - predicted value - target : pytorch tensor - label - label_smoothing : float - the degree of label smoothing - - Returns - ------- - pytorch tensor - cross entropy - """ - logsoftmax = nn.LogSoftmax() - n_classes = pred.size(1) - # convert to one-hot - target = torch.unsqueeze(target, 1) - soft_target = torch.zeros_like(pred) - soft_target.scatter_(1, target, 1) - # label smoothing - soft_target = soft_target * (1 - label_smoothing) + label_smoothing / n_classes - return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) - -def accuracy(output, target, topk=(1,)): - """ - Computes the precision@k for the specified values of k - - Parameters - ---------- - output : pytorch tensor - output, e.g., predicted value - target : pytorch tensor - label - topk : tuple - specify top1 and top5 - - Returns - ------- - list - accuracy of top1 and top5 - """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res diff --git a/nni/algorithms/nas/pytorch/random/__init__.py b/nni/algorithms/nas/pytorch/random/__init__.py deleted file mode 100644 index 0ff4a7795..000000000 --- a/nni/algorithms/nas/pytorch/random/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import RandomMutator diff --git a/nni/algorithms/nas/pytorch/random/mutator.py b/nni/algorithms/nas/pytorch/random/mutator.py deleted file mode 100644 index 22ecc0831..000000000 --- a/nni/algorithms/nas/pytorch/random/mutator.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice - - -class RandomMutator(Mutator): - """ - Random mutator that samples a random candidate in the search space each time ``reset()``. 
- It uses random function in PyTorch, so users can set seed in PyTorch to ensure deterministic behavior. - """ - - def sample_search(self): - """ - Sample a random candidate. - """ - result = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - gen_index = torch.randint(high=len(mutable), size=(1, )) - result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool() - elif isinstance(mutable, InputChoice): - if mutable.n_chosen is None: - result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool() - else: - perm = torch.randperm(mutable.n_candidates) - mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)] - result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable - return result - - def sample_final(self): - """ - Same as :meth:`sample_search`. - """ - return self.sample_search() diff --git a/nni/algorithms/nas/pytorch/spos/__init__.py b/nni/algorithms/nas/pytorch/spos/__init__.py deleted file mode 100644 index ed432b084..000000000 --- a/nni/algorithms/nas/pytorch/spos/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .evolution import SPOSEvolution -from .mutator import SPOSSupernetTrainingMutator -from .trainer import SPOSSupernetTrainer diff --git a/nni/algorithms/nas/pytorch/spos/evolution.py b/nni/algorithms/nas/pytorch/spos/evolution.py deleted file mode 100644 index bd099e276..000000000 --- a/nni/algorithms/nas/pytorch/spos/evolution.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import re -from collections import deque - -import numpy as np -from nni.tuner import Tuner -from nni.algorithms.nas.pytorch.classic_nas.mutator import LAYER_CHOICE, INPUT_CHOICE - - -_logger = logging.getLogger(__name__) - - -class SPOSEvolution(Tuner): - """ - SPOS evolution tuner. - - Parameters - ---------- - max_epochs : int - Maximum number of epochs to run. - num_select : int - Number of survival candidates of each epoch. - num_population : int - Number of candidates at the start of each epoch. If candidates generated by - crossover and mutation are not enough, the rest will be filled with random - candidates. - m_prob : float - The probability of mutation. - num_crossover : int - Number of candidates generated by crossover in each epoch. - num_mutation : int - Number of candidates generated by mutation in each epoch. - """ - - def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, - num_crossover=25, num_mutation=25): - assert num_population >= num_select - self.max_epochs = max_epochs - self.num_select = num_select - self.num_population = num_population - self.m_prob = m_prob - self.num_crossover = num_crossover - self.num_mutation = num_mutation - self.epoch = 0 - self.candidates = [] - self.search_space = None - self.random_state = np.random.RandomState(0) - - # async status - self._to_evaluate_queue = deque() - self._sending_parameter_queue = deque() - self._pending_result_ids = set() - self._reward_dict = dict() - self._id2candidate = dict() - self._st_callback = None - - def update_search_space(self, search_space): - """ - Handle the initialization/update event of search space. 
- """ - self._search_space = search_space - self._next_round() - - def _next_round(self): - _logger.info("Epoch %d, generating...", self.epoch) - if self.epoch == 0: - self._get_random_population() - self.export_results(self.candidates) - else: - best_candidates = self._select_top_candidates() - self.export_results(best_candidates) - if self.epoch >= self.max_epochs: - return - self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) - self._get_random_population() - self.epoch += 1 - - def _random_candidate(self): - chosen_arch = dict() - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - index = self.random_state.randint(len(choices)) - chosen_arch[key] = {"_value": choices[index], "_idx": index} - elif val["_type"] == INPUT_CHOICE: - raise NotImplementedError("Input choice is not implemented yet.") - return chosen_arch - - def _add_to_evaluate_queue(self, cand): - _logger.info("Generate candidate %s, adding to eval queue.", self._get_architecture_repr(cand)) - self._reward_dict[self._hashcode(cand)] = 0. - self._to_evaluate_queue.append(cand) - - def _get_random_population(self): - while len(self.candidates) < self.num_population: - cand = self._random_candidate() - if self._is_legal(cand): - _logger.info("Random candidate generated.") - self._add_to_evaluate_queue(cand) - self.candidates.append(cand) - - def _get_crossover(self, best): - result = [] - for _ in range(10 * self.num_crossover): - cand_p1 = best[self.random_state.randint(len(best))] - cand_p2 = best[self.random_state.randint(len(best))] - assert cand_p1.keys() == cand_p2.keys() - cand = {k: cand_p1[k] if self.random_state.randint(2) == 0 else cand_p2[k] - for k in cand_p1.keys()} - if self._is_legal(cand): - result.append(cand) - self._add_to_evaluate_queue(cand) - if len(result) >= self.num_crossover: - break - _logger.info("Found %d architectures with crossover.", len(result)) - return result - - def _get_mutation(self, best): - result = [] - for _ in range(10 * self.num_mutation): - cand = best[self.random_state.randint(len(best))].copy() - mutation_sample = np.random.random_sample(len(cand)) - for s, k in zip(mutation_sample, cand): - if s < self.m_prob: - choices = self._search_space[k]["_value"] - index = self.random_state.randint(len(choices)) - cand[k] = {"_value": choices[index], "_idx": index} - if self._is_legal(cand): - result.append(cand) - self._add_to_evaluate_queue(cand) - if len(result) >= self.num_mutation: - break - _logger.info("Found %d architectures with mutation.", len(result)) - return result - - def _get_architecture_repr(self, cand): - return re.sub(r"\".*?\": \{\"_idx\": (\d+), \"_value\": \".*?\"\}", r"\1", - self._hashcode(cand)) - - def _is_legal(self, cand): - if self._hashcode(cand) in self._reward_dict: - return False - return True - - def _select_top_candidates(self): - reward_query = lambda cand: self._reward_dict[self._hashcode(cand)] - _logger.info("All candidate rewards: %s", list(map(reward_query, self.candidates))) - result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select] - _logger.info("Best candidate rewards: %s", list(map(reward_query, result))) - return result - - @staticmethod - def _hashcode(d): - return json.dumps(d, sort_keys=True) - - def _bind_and_send_parameters(self): - """ - There are two types of resources: parameter ids and candidates. This function is called at - necessary times to bind these resources to send new trials with st_callback. 
- """ - result = [] - while self._sending_parameter_queue and self._to_evaluate_queue: - parameter_id = self._sending_parameter_queue.popleft() - parameters = self._to_evaluate_queue.popleft() - self._id2candidate[parameter_id] = parameters - result.append(parameters) - self._pending_result_ids.add(parameter_id) - self._st_callback(parameter_id, parameters) - _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters)) - return result - - def generate_multiple_parameters(self, parameter_id_list, **kwargs): - """ - Callback function necessary to implement a tuner. This will put more parameter ids into the - parameter id queue. - """ - if "st_callback" in kwargs and self._st_callback is None: - self._st_callback = kwargs["st_callback"] - for parameter_id in parameter_id_list: - self._sending_parameter_queue.append(parameter_id) - self._bind_and_send_parameters() - return [] # always not use this. might induce problem of over-sending - - def receive_trial_result(self, parameter_id, parameters, value, **kwargs): - """ - Callback function. Receive a trial result. - """ - _logger.info("Candidate %d, reported reward %f", parameter_id, value) - self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value - - def trial_end(self, parameter_id, success, **kwargs): - """ - Callback function when a trial is ended and resource is released. - """ - self._pending_result_ids.remove(parameter_id) - if not self._pending_result_ids and not self._to_evaluate_queue: - # a new epoch now - self._next_round() - assert self._st_callback is not None - self._bind_and_send_parameters() - - def export_results(self, result): - """ - Export a number of candidates to `checkpoints` dir. - - Parameters - ---------- - result : dict - Chosen architectures to be exported. - """ - os.makedirs("checkpoints", exist_ok=True) - for i, cand in enumerate(result): - converted = dict() - for cand_key, cand_val in cand.items(): - onehot = [k == cand_val["_idx"] for k in range(len(self._search_space[cand_key]["_value"]))] - converted[cand_key] = onehot - with open(os.path.join("checkpoints", "%03d_%03d.json" % (self.epoch, i)), "w") as fp: - json.dump(converted, fp) diff --git a/nni/algorithms/nas/pytorch/spos/mutator.py b/nni/algorithms/nas/pytorch/spos/mutator.py deleted file mode 100644 index 1a803cb2e..000000000 --- a/nni/algorithms/nas/pytorch/spos/mutator.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import numpy as np -from nni.algorithms.nas.pytorch.random import RandomMutator - -_logger = logging.getLogger(__name__) - - -class SPOSSupernetTrainingMutator(RandomMutator): - """ - A random mutator with flops limit. - - Parameters - ---------- - model : nn.Module - PyTorch model. - flops_func : callable - Callable that takes a candidate from `sample_search` and returns its candidate. When `flops_func` - is None, functions related to flops will be deactivated. - flops_lb : number - Lower bound of flops. - flops_ub : number - Upper bound of flops. - flops_bin_num : number - Number of bins divided for the interval of flops to ensure the uniformity. Bigger number will be more - uniform, but the sampling will be slower. - flops_sample_timeout : int - Maximum number of attempts to sample before giving up and use a random candidate. 
- """ - def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, - flops_bin_num=7, flops_sample_timeout=500): - - super().__init__(model) - self._flops_func = flops_func - if self._flops_func is not None: - self._flops_bin_num = flops_bin_num - self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num + 1)] - self._flops_sample_timeout = flops_sample_timeout - - def sample_search(self): - """ - Sample a candidate for training. When `flops_func` is not None, candidates will be sampled uniformly - relative to flops. - - Returns - ------- - dict - """ - if self._flops_func is not None: - for times in range(self._flops_sample_timeout): - idx = np.random.randint(self._flops_bin_num) - cand = super().sample_search() - if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]: - _logger.debug("Sampled candidate flops %f in %d times.", cand, times) - return cand - _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout) - return super().sample_search() - - def sample_final(self): - """ - Implement only to suffice the interface of Mutator. - """ - return self.sample_search() diff --git a/nni/algorithms/nas/pytorch/spos/trainer.py b/nni/algorithms/nas/pytorch/spos/trainer.py deleted file mode 100644 index 7c954e2ad..000000000 --- a/nni/algorithms/nas/pytorch/spos/trainer.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .mutator import SPOSSupernetTrainingMutator - -logger = logging.getLogger(__name__) - - -class SPOSSupernetTrainer(Trainer): - """ - This trainer trains a supernet that can be used for evolution search. - - Parameters - ---------- - model : nn.Module - Model with mutables. - mutator : nni.nas.pytorch.mutator.Mutator - A mutator object that has been initialized with the model. - loss : callable - Called with logits and targets. Returns a loss tensor. - metrics : callable - Returns a dict that maps metrics keys to metrics data. - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - train_loader : iterable - Data loader of training. Raise ``StopIteration`` when one epoch is exhausted. - dataset_valid : iterable - Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted. - batch_size : int - Batch size. - workers: int - Number of threads for data preprocessing. Not used for this trainer. Maybe removed in future. - device : torch.device - Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will - automatic detects GPU and selects GPU first. - log_frequency : int - Number of mini-batches to log metrics. - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. 
- """ - - def __init__(self, model, loss, metrics, - optimizer, num_epochs, train_loader, valid_loader, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, - callbacks=None): - assert torch.cuda.is_available() - super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model), - loss, metrics, optimizer, num_epochs, None, None, - batch_size, workers, device, log_frequency, callbacks) - - self.train_loader = train_loader - self.valid_loader = valid_loader - - def train_one_epoch(self, epoch): - self.model.train() - meters = AverageMeterGroup() - for step, (x, y) in enumerate(self.train_loader): - x, y = x.to(self.device), y.to(self.device) - self.optimizer.zero_grad() - self.mutator.reset() - logits = self.model(x) - loss = self.loss(logits, y) - loss.backward() - self.optimizer.step() - - metrics = self.metrics(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.train_loader), meters) - - def validate_one_epoch(self, epoch): - self.model.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - for step, (x, y) in enumerate(self.valid_loader): - x, y = x.to(self.device), y.to(self.device) - self.mutator.reset() - logits = self.model(x) - loss = self.loss(logits, y) - metrics = self.metrics(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.valid_loader), meters) diff --git a/nni/algorithms/nas/tensorflow/__init__.py b/nni/algorithms/nas/tensorflow/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/tensorflow/classic_nas/__init__.py b/nni/algorithms/nas/tensorflow/classic_nas/__init__.py deleted file mode 100644 index ec3f5a489..000000000 --- a/nni/algorithms/nas/tensorflow/classic_nas/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import get_and_apply_next_architecture diff --git a/nni/algorithms/nas/tensorflow/classic_nas/mutator.py b/nni/algorithms/nas/tensorflow/classic_nas/mutator.py deleted file mode 100644 index cb089c49b..000000000 --- a/nni/algorithms/nas/tensorflow/classic_nas/mutator.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import json -import logging -import os -import sys - -import tensorflow as tf - -import nni -from nni.runtime.env_vars import trial_env_vars -from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope -from nni.nas.tensorflow.mutator import Mutator - -logger = logging.getLogger(__name__) - -NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE" -LAYER_CHOICE = "layer_choice" -INPUT_CHOICE = "input_choice" - - -def get_and_apply_next_architecture(model): - """ - Wrapper of :class:`~nni.nas.tensorflow.classic_nas.mutator.ClassicMutator` to make it more meaningful, - similar to ``get_next_parameter`` for HPO. - Tt will generate search space based on ``model``. - If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for - generating search space for the experiment. - If not, there are still two mode, one is nni experiment mode where users - use ``nnictl`` to start an experiment. 
The other is standalone mode - where users directly run the trial command, this mode chooses the first - one(s) for each LayerChoice and InputChoice. - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - ClassicMutator(model) - - -class ClassicMutator(Mutator): - """ - This mutator is to apply the architecture chosen from tuner. - It implements the forward function of LayerChoice and InputChoice, - to only activate the chosen ones. - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - - def __init__(self, model): - super(ClassicMutator, self).__init__(model) - self._chosen_arch = {} - self._search_space = self._generate_search_space() - if NNI_GEN_SEARCH_SPACE in os.environ: - # dry run for only generating search space - self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE]) - sys.exit(0) - - if trial_env_vars.NNI_PLATFORM is None: - logger.warning("This is in standalone mode, the chosen are the first one(s).") - self._chosen_arch = self._standalone_generate_chosen() - else: - # get chosen arch from tuner - self._chosen_arch = nni.get_next_parameter() - if self._chosen_arch is None: - if trial_env_vars.NNI_PLATFORM == "unittest": - # happens if NNI_PLATFORM is intentionally set, e.g., in UT - logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.") - self._chosen_arch = self._standalone_generate_chosen() - else: - raise RuntimeError("Chosen architecture is None. This may be a platform error.") - self.reset() - - def _sample_layer_choice(self, mutable, idx, value, search_space_item): - """ - Convert layer choice to tensor representation. - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - # doesn't support multihot for layer choice yet - assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) - mask = tf.one_hot(idx, len(mutable)) - return tf.cast(tf.reshape(mask, [-1]), tf.bool) - - def _sample_input_choice(self, mutable, idx, value, search_space_item): - """ - Convert input choice to tensor representation. - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - candidate_repr = search_space_item["candidates"] - multihot_list = [False] * mutable.n_candidates - for i, v in zip(idx, value): - assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \ - "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v) - assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) - multihot_list[i] = True - return tf.cast(multihot_list, tf.bool) # pylint: disable=not-callable - - def sample_search(self): - """ - See :meth:`sample_final`. - """ - return self.sample_final() - - def sample_final(self): - """ - Convert the chosen arch and apply it on model. 
- """ - assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ - "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), - self._chosen_arch.keys()) - result = dict() - for mutable in self.mutables: - if isinstance(mutable, (LayerChoice, InputChoice)): - assert mutable.key in self._chosen_arch, \ - "Expected '{}' in chosen arch, but not found.".format(mutable.key) - data = self._chosen_arch[mutable.key] - assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ - "'{}' is not a valid choice.".format(data) - if isinstance(mutable, LayerChoice): - result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, InputChoice): - result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return result - - def _standalone_generate_chosen(self): - """ - Generate the chosen architecture for standalone mode, - i.e., choose the first one(s) for LayerChoice and InputChoice. - :: - { key_name: {"_value": "conv1", - "_idx": 0} } - { key_name: {"_value": ["in1"], - "_idx": [0]} } - Returns - ------- - dict - the chosen architecture - """ - chosen_arch = {} - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - chosen_arch[key] = {"_value": choices[0], "_idx": 0} - elif val["_type"] == INPUT_CHOICE: - choices = val["_value"]["candidates"] - n_chosen = val["_value"]["n_chosen"] - if n_chosen is None: - n_chosen = len(choices) - chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} - else: - raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) - return chosen_arch - - def _generate_search_space(self): - """ - Generate search space from mutables. - Here is the search space format: - :: - { key_name: {"_type": "layer_choice", - "_value": ["conv1", "conv2"]} } - { key_name: {"_type": "input_choice", - "_value": {"candidates": ["in1", "in2"], - "n_chosen": 1}} } - Returns - ------- - dict - the generated search space - """ - search_space = {} - for mutable in self.mutables: - # for now we only generate flattened search space - if isinstance(mutable, LayerChoice): - key = mutable.key - val = mutable.names - search_space[key] = {"_type": LAYER_CHOICE, "_value": val} - elif isinstance(mutable, InputChoice): - key = mutable.key - search_space[key] = {"_type": INPUT_CHOICE, - "_value": {"candidates": mutable.choose_from, - "n_chosen": mutable.n_chosen}} - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return search_space - - def _dump_search_space(self, file_path): - with open(file_path, "w") as ss_file: - json.dump(self._search_space, ss_file, sort_keys=True, indent=2) diff --git a/nni/algorithms/nas/tensorflow/enas/__init__.py b/nni/algorithms/nas/tensorflow/enas/__init__.py deleted file mode 100644 index d3372836e..000000000 --- a/nni/algorithms/nas/tensorflow/enas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
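[Editor's note] The search-space and chosen-architecture formats documented in the TensorFlow ClassicMutator above can be made concrete with a small, self-contained example. The key names "conv_block" and "skip_input" are hypothetical placeholders; the standalone rule of picking the first candidate(s) follows the `_standalone_generate_chosen` logic shown above.

import json

# Search space in the shape _generate_search_space emits (flattened, one entry per mutable).
search_space = {
    "conv_block": {"_type": "layer_choice",
                   "_value": ["conv3x3", "conv5x5", "maxpool"]},
    "skip_input": {"_type": "input_choice",
                   "_value": {"candidates": ["in1", "in2", "in3"], "n_chosen": 1}},
}

# Standalone-mode choice: take the first candidate(s) of every mutable.
chosen_arch = {}
for key, val in search_space.items():
    if val["_type"] == "layer_choice":
        chosen_arch[key] = {"_value": val["_value"][0], "_idx": 0}
    else:  # input_choice
        candidates = val["_value"]["candidates"]
        n_chosen = val["_value"]["n_chosen"] or len(candidates)
        chosen_arch[key] = {"_value": candidates[:n_chosen], "_idx": list(range(n_chosen))}

print(json.dumps(chosen_arch, indent=2, sort_keys=True))
# conv_block picks "conv3x3" (_idx 0); skip_input picks ["in1"] (_idx [0]),
# which is exactly what ClassicMutator would then convert to boolean masks in sample_final.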
- -from .mutator import EnasMutator -from .trainer import EnasTrainer diff --git a/nni/algorithms/nas/tensorflow/enas/mutator.py b/nni/algorithms/nas/tensorflow/enas/mutator.py deleted file mode 100644 index 313c81cc9..000000000 --- a/nni/algorithms/nas/tensorflow/enas/mutator.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import tensorflow as tf -from tensorflow.keras.layers import Dense, Embedding, LSTMCell, RNN -from tensorflow.keras.losses import SparseCategoricalCrossentropy, Reduction - -from nni.nas.tensorflow.mutator import Mutator -from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope - - -class EnasMutator(Mutator): - def __init__(self, model, - lstm_size=64, - lstm_num_layers=1, - tanh_constant=1.5, - cell_exit_extra_step=False, - skip_target=0.4, - temperature=None, - branch_bias=0.25, - entropy_reduction='sum'): - super().__init__(model) - self.tanh_constant = tanh_constant - self.temperature = temperature - self.cell_exit_extra_step = cell_exit_extra_step - - cells = [LSTMCell(units=lstm_size, use_bias=False) for _ in range(lstm_num_layers)] - self.lstm = RNN(cells, stateful=True) - self.g_emb = tf.random.normal((1, 1, lstm_size)) * 0.1 - self.skip_targets = tf.constant([1.0 - skip_target, skip_target]) - - self.max_layer_choice = 0 - self.bias_dict = {} - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - if self.max_layer_choice == 0: - self.max_layer_choice = len(mutable) - assert self.max_layer_choice == len(mutable), \ - "ENAS mutator requires all layer choice have the same number of candidates." - if 'reduce' in mutable.key: - bias = [] - for choice in mutable.choices: - if 'conv' in str(type(choice)).lower(): - bias.append(branch_bias) - else: - bias.append(-branch_bias) - self.bias_dict[mutable.key] = tf.constant(bias) - - # exposed for trainer - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - # internal nn layers - self.embedding = Embedding(self.max_layer_choice + 1, lstm_size) - self.soft = Dense(self.max_layer_choice, use_bias=False) - self.attn_anchor = Dense(lstm_size, use_bias=False) - self.attn_query = Dense(lstm_size, use_bias=False) - self.v_attn = Dense(1, use_bias=False) - assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' 
- self.entropy_reduction = tf.reduce_sum if entropy_reduction == 'sum' else tf.reduce_mean - self.cross_entropy_loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE) - - self._first_sample = True - - def sample_search(self): - self._initialize() - self._sample(self.mutables) - self._first_sample = False - return self._choices - - def sample_final(self): - return self.sample_search() - - def _sample(self, tree): - mutable = tree.mutable - if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_layer_choice(mutable) - elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_input_choice(mutable) - for child in tree.children: - self._sample(child) - if self.cell_exit_extra_step and isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: - self._anchors_hid[mutable.key] = self.lstm(self._inputs, 1) - - def _initialize(self): - self._choices = {} - self._anchors_hid = {} - self._inputs = self.g_emb - # seems the `input_shape` parameter of RNN does not work - # workaround it by omitting `reset_states` for first run - if not self._first_sample: - self.lstm.reset_states() - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - def _sample_layer_choice(self, mutable): - logit = self.soft(self.lstm(self._inputs)) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * tf.tanh(logit) - if mutable.key in self.bias_dict: - logit += self.bias_dict[mutable.key] - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - branch_id = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [1]) - log_prob = self.cross_entropy_loss(branch_id, logit) - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = log_prob * tf.math.exp(-log_prob) - self.sample_entropy += self.entropy_reduction(entropy) - self._inputs = tf.reshape(self.embedding(branch_id), [1, 1, -1]) - mask = tf.one_hot(branch_id, self.max_layer_choice) - return tf.cast(tf.reshape(mask, [-1]), tf.bool) - - def _sample_input_choice(self, mutable): - query, anchors = [], [] - for label in mutable.choose_from: - if label not in self._anchors_hid: - self._anchors_hid[label] = self.lstm(self._inputs) - query.append(self.attn_anchor(self._anchors_hid[label])) - anchors.append(self._anchors_hid[label]) - query = tf.concat(query, axis=0) - query = tf.tanh(query + self.attn_query(anchors[-1])) - query = self.v_attn(query) - - if self.temperature is not None: - query /= self.temperature - if self.tanh_constant is not None: - query = self.tanh_constant * tf.tanh(query) - - if mutable.n_chosen is None: - logit = tf.concat([-query, query], axis=1) - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - skip = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) - skip_prob = tf.math.sigmoid(logit) - kl = tf.reduce_sum(skip_prob * tf.math.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(skip, logit) - - skip = tf.cast(skip, tf.float32) - inputs = tf.tensordot(skip, tf.concat(anchors, 0), 1) / (1. + tf.reduce_sum(skip)) - self._inputs = tf.reshape(inputs, [1, 1, -1]) - - else: - assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." 
- logit = tf.reshape(query, [1, -1]) - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - index = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) - skip = tf.reshape(tf.one_hot(index, mutable.n_candidates), [-1]) - # when the size is 1, tf does not accept tensor here, complaining the shape is wrong - # but using a numpy array seems fine - log_prob = self.cross_entropy_loss(logit, query.numpy()) - self._inputs = tf.reshape(anchors[index.numpy()[0]], [1, 1, -1]) - - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = log_prob * tf.exp(-log_prob) - self.sample_entropy += self.entropy_reduction(entropy) - assert len(skip) == mutable.n_candidates, (skip, mutable.n_candidates, mutable.n_chosen) - return tf.cast(skip, tf.bool) diff --git a/nni/algorithms/nas/tensorflow/enas/trainer.py b/nni/algorithms/nas/tensorflow/enas/trainer.py deleted file mode 100644 index 67df9c7f9..000000000 --- a/nni/algorithms/nas/tensorflow/enas/trainer.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import logging - -import tensorflow as tf -from tensorflow.keras.optimizers import Adam - -from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads - -from .mutator import EnasMutator - -logger = logging.getLogger(__name__) - - -class EnasTrainer: - def __init__( - self, - model, - loss, - metrics, - reward_function, - optimizer, - batch_size, - num_epochs, - dataset_train, - dataset_valid, - log_frequency=100, - entropy_weight=0.0001, - skip_weight=0.8, - baseline_decay=0.999, - child_steps=500, - mutator_lr=0.00035, - mutator_steps=50, - mutator_steps_aggregate=20, - aux_weight=0.4, - test_arc_per_epoch=1, - ): - self.model = model - self.loss = loss - self.metrics = metrics - self.reward_function = reward_function - self.optimizer = optimizer - self.batch_size = batch_size - self.num_epochs = num_epochs - - x, y = dataset_train - split = int(len(x) * 0.9) - self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split])) - self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:])) - self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid) - - self.log_frequency = log_frequency - self.entropy_weight = entropy_weight - self.skip_weight = skip_weight - self.baseline_decay = baseline_decay - self.child_steps = child_steps - self.mutator_lr = mutator_lr - self.mutator_steps = mutator_steps - self.mutator_steps_aggregate = mutator_steps_aggregate - self.aux_weight = aux_weight - self.test_arc_per_epoch = test_arc_per_epoch - - self.mutator = EnasMutator(model) - self.mutator_optim = Adam(learning_rate=self.mutator_lr) - - self.baseline = 0.0 - - def train(self, validate=True): - for epoch in range(self.num_epochs): - logger.info("Epoch %d Training", epoch + 1) - self.train_one_epoch(epoch) - logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) - - def validate(self): - self.validate_one_epoch(-1) - - def train_one_epoch(self, epoch): - train_loader, valid_loader = self._create_train_loader() - - # Sample model and train - meters = AverageMeterGroup() - - for step in range(1, self.child_steps + 1): - x, y = next(train_loader) - self.mutator.reset() - - with tf.GradientTape() as tape: - logits = self.model(x, training=True) - if isinstance(logits, tuple): - logits, aux_logits = logits - aux_loss = self.loss(aux_logits, y) - else: - aux_loss = 0.0 - metrics = self.metrics(y, logits) - loss = self.loss(y, logits) + 
self.aux_weight * aux_loss - - grads = tape.gradient(loss, self.model.trainable_weights) - grads = fill_zero_grads(grads, self.model.trainable_weights) - grads, _ = tf.clip_by_global_norm(grads, 5.0) - self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) - - metrics["loss"] = tf.reduce_mean(loss).numpy() - meters.update(metrics) - - if self.log_frequency and step % self.log_frequency == 0: - logger.info( - "Model Epoch [%d/%d] Step [%d/%d] %s", - epoch + 1, - self.num_epochs, - step, - self.child_steps, - meters, - ) - - # Train sampler (mutator) - meters = AverageMeterGroup() - for mutator_step in range(1, self.mutator_steps + 1): - grads_list = [] - for step in range(1, self.mutator_steps_aggregate + 1): - with tf.GradientTape() as tape: - x, y = next(valid_loader) - self.mutator.reset() - - logits = self.model(x, training=False) - metrics = self.metrics(y, logits) - reward = ( - self.reward_function(y, logits) - + self.entropy_weight * self.mutator.sample_entropy - ) - self.baseline = self.baseline * self.baseline_decay + reward * ( - 1 - self.baseline_decay - ) - loss = self.mutator.sample_log_prob * (reward - self.baseline) - loss += self.skip_weight * self.mutator.sample_skip_penalty - - meters.update( - { - "reward": reward, - "loss": tf.reduce_mean(loss).numpy(), - "ent": self.mutator.sample_entropy.numpy(), - "log_prob": self.mutator.sample_log_prob.numpy(), - "baseline": self.baseline, - "skip": self.mutator.sample_skip_penalty, - } - ) - - cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate - if self.log_frequency and cur_step % self.log_frequency == 0: - logger.info( - "RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", - epoch + 1, - self.num_epochs, - mutator_step, - self.mutator_steps, - step, - self.mutator_steps_aggregate, - meters, - ) - - grads = tape.gradient(loss, self.mutator.trainable_weights) - grads = fill_zero_grads(grads, self.mutator.trainable_weights) - grads_list.append(grads) - total_grads = [ - tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list) - ] - total_grads, _ = tf.clip_by_global_norm(total_grads, 5.0) - self.mutator_optim.apply_gradients( - zip(total_grads, self.mutator.trainable_weights) - ) - - def validate_one_epoch(self, epoch): - test_loader = self._create_validate_loader() - - for arc_id in range(self.test_arc_per_epoch): - meters = AverageMeterGroup() - for x, y in test_loader: - self.mutator.reset() - logits = self.model(x, training=False) - if isinstance(logits, tuple): - logits, _ = logits - metrics = self.metrics(y, logits) - loss = self.loss(y, logits) - metrics["loss"] = tf.reduce_mean(loss).numpy() - meters.update(metrics) - - logger.info( - "Test Epoch [%d/%d] Arc [%d/%d] Summary %s", - epoch + 1, - self.num_epochs, - arc_id + 1, - self.test_arc_per_epoch, - meters.summary(), - ) - - def _create_train_loader(self): - train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size) - test_set = self.valid_set.shuffle(1000000).repeat().batch(self.batch_size) - return iter(train_set), iter(test_set) - - def _create_validate_loader(self): - return iter(self.test_set.shuffle(1000000).batch(self.batch_size)) diff --git a/nni/retiarii/evaluator/functional.py b/nni/nas/evaluator/functional.py similarity index 100% rename from nni/retiarii/evaluator/functional.py rename to nni/nas/evaluator/functional.py diff --git a/nni/retiarii/evaluator/pytorch/cgo/evaluator.py b/nni/nas/evaluator/pytorch/cgo/evaluator.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/cgo/evaluator.py 
rename to nni/nas/evaluator/pytorch/cgo/evaluator.py diff --git a/nni/retiarii/evaluator/pytorch/cgo/trainer.py b/nni/nas/evaluator/pytorch/cgo/trainer.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/cgo/trainer.py rename to nni/nas/evaluator/pytorch/cgo/trainer.py diff --git a/nni/retiarii/evaluator/pytorch/lightning.py b/nni/nas/evaluator/pytorch/lightning.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/lightning.py rename to nni/nas/evaluator/pytorch/lightning.py diff --git a/nni/retiarii/execution/api.py b/nni/nas/execution/api.py similarity index 100% rename from nni/retiarii/execution/api.py rename to nni/nas/execution/api.py diff --git a/nni/retiarii/execution/interface.py b/nni/nas/execution/common/engine.py similarity index 100% rename from nni/retiarii/execution/interface.py rename to nni/nas/execution/common/engine.py diff --git a/nni/retiarii/graph.py b/nni/nas/execution/common/graph.py similarity index 100% rename from nni/retiarii/graph.py rename to nni/nas/execution/common/graph.py diff --git a/nni/retiarii/operation.py b/nni/nas/execution/common/graph_op.py similarity index 100% rename from nni/retiarii/operation.py rename to nni/nas/execution/common/graph_op.py diff --git a/nni/retiarii/integration.py b/nni/nas/execution/common/integration.py similarity index 100% rename from nni/retiarii/integration.py rename to nni/nas/execution/common/integration.py diff --git a/nni/retiarii/integration_api.py b/nni/nas/execution/common/integration_api.py similarity index 100% rename from nni/retiarii/integration_api.py rename to nni/nas/execution/common/integration_api.py diff --git a/nni/retiarii/execution/listener.py b/nni/nas/execution/common/listener.py similarity index 100% rename from nni/retiarii/execution/listener.py rename to nni/nas/execution/common/listener.py diff --git a/nni/retiarii/execution/utils.py b/nni/nas/execution/common/utils.py similarity index 100% rename from nni/retiarii/execution/utils.py rename to nni/nas/execution/common/utils.py diff --git a/nni/retiarii/execution/benchmark.py b/nni/nas/execution/pytorch/benchmark.py similarity index 100% rename from nni/retiarii/execution/benchmark.py rename to nni/nas/execution/pytorch/benchmark.py diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/nas/execution/pytorch/cgo.py similarity index 100% rename from nni/retiarii/execution/cgo_engine.py rename to nni/nas/execution/pytorch/cgo.py diff --git a/nni/retiarii/execution/logical_optimizer/interface.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/interface.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/interface.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/interface.py diff --git a/nni/retiarii/execution/logical_optimizer/logical_plan.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/logical_plan.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/logical_plan.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/logical_plan.py diff --git a/nni/retiarii/execution/logical_optimizer/opt_dedup_input.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/opt_dedup_input.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/opt_dedup_input.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/opt_dedup_input.py diff --git a/nni/retiarii/codegen/pytorch.py b/nni/nas/execution/pytorch/codegen.py similarity index 100% rename from nni/retiarii/codegen/pytorch.py rename to nni/nas/execution/pytorch/codegen.py diff 
--git a/nni/retiarii/converter/graph_gen.py b/nni/nas/execution/pytorch/converter/graph_gen.py similarity index 100% rename from nni/retiarii/converter/graph_gen.py rename to nni/nas/execution/pytorch/converter/graph_gen.py diff --git a/nni/retiarii/converter/op_types.py b/nni/nas/execution/pytorch/converter/op_types.py similarity index 100% rename from nni/retiarii/converter/op_types.py rename to nni/nas/execution/pytorch/converter/op_types.py diff --git a/nni/retiarii/converter/utils.py b/nni/nas/execution/pytorch/converter/utils.py similarity index 100% rename from nni/retiarii/converter/utils.py rename to nni/nas/execution/pytorch/converter/utils.py diff --git a/nni/retiarii/converter/visualize.py b/nni/nas/execution/pytorch/converter/visualize.py similarity index 100% rename from nni/retiarii/converter/visualize.py rename to nni/nas/execution/pytorch/converter/visualize.py diff --git a/nni/retiarii/execution/base.py b/nni/nas/execution/pytorch/graph.py similarity index 100% rename from nni/retiarii/execution/base.py rename to nni/nas/execution/pytorch/graph.py diff --git a/nni/retiarii/operation_def/torch_op_def.py b/nni/nas/execution/pytorch/op_def.py similarity index 100% rename from nni/retiarii/operation_def/torch_op_def.py rename to nni/nas/execution/pytorch/op_def.py diff --git a/nni/retiarii/execution/python.py b/nni/nas/execution/pytorch/simplified.py similarity index 100% rename from nni/retiarii/execution/python.py rename to nni/nas/execution/pytorch/simplified.py diff --git a/nni/retiarii/operation_def/tf_op_def.py b/nni/nas/execution/tensorflow/op_def.py similarity index 100% rename from nni/retiarii/operation_def/tf_op_def.py rename to nni/nas/execution/tensorflow/op_def.py diff --git a/nni/retiarii/trial_entry.py b/nni/nas/execution/trial_entry.py similarity index 100% rename from nni/retiarii/trial_entry.py rename to nni/nas/execution/trial_entry.py diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/nas/experiment/config/engine_config.py similarity index 100% rename from nni/retiarii/experiment/config/engine_config.py rename to nni/nas/experiment/config/engine_config.py diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py similarity index 100% rename from nni/retiarii/experiment/config/experiment_config.py rename to nni/nas/experiment/config/experiment_config.py diff --git a/nni/retiarii/experiment/pytorch.py b/nni/nas/experiment/pytorch.py similarity index 100% rename from nni/retiarii/experiment/pytorch.py rename to nni/nas/experiment/pytorch.py diff --git a/nni/retiarii/fixed.py b/nni/nas/fixed.py similarity index 100% rename from nni/retiarii/fixed.py rename to nni/nas/fixed.py diff --git a/nni/retiarii/hub/pytorch/autoformer.py b/nni/nas/hub/pytorch/autoformer.py similarity index 100% rename from nni/retiarii/hub/pytorch/autoformer.py rename to nni/nas/hub/pytorch/autoformer.py diff --git a/nni/retiarii/hub/pytorch/mobilenetv3.py b/nni/nas/hub/pytorch/mobilenetv3.py similarity index 100% rename from nni/retiarii/hub/pytorch/mobilenetv3.py rename to nni/nas/hub/pytorch/mobilenetv3.py diff --git a/nni/retiarii/nn/pytorch/hypermodule.py b/nni/nas/hub/pytorch/modules/autoactivation.py similarity index 100% rename from nni/retiarii/nn/pytorch/hypermodule.py rename to nni/nas/hub/pytorch/modules/autoactivation.py diff --git a/nni/retiarii/nn/pytorch/nasbench101.py b/nni/nas/hub/pytorch/modules/nasbench101.py similarity index 100% rename from nni/retiarii/nn/pytorch/nasbench101.py rename to 
nni/nas/hub/pytorch/modules/nasbench101.py diff --git a/nni/nas/hub/pytorch/modules/nasbench201.py b/nni/nas/hub/pytorch/modules/nasbench201.py new file mode 100644 index 000000000..dc9411fdb --- /dev/null +++ b/nni/nas/hub/pytorch/modules/nasbench201.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +__all__ = ['NasBench201Cell'] + +from collections import OrderedDict +from typing import Callable, List, Dict, Union, Optional + +import torch +import torch.nn as nn + +from nni.nas.nn.pytorch import LayerChoice +from nni.nas.nn.pytorch.mutation_utils import generate_new_label + + +class NasBench201Cell(nn.Module): + """ + Cell structure that is proposed in NAS-Bench-201. + + Proposed by `NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search `__. + + This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is tensor. + For every i < j, there is an edge from i-th node to j-th node. + Each edge in this DAG is associated with an operation transforming the hidden state from the source node + to the target node. All possible operations are selected from a predefined operation set, defined in ``op_candidates``. + Each of the ``op_candidates`` should be a callable that accepts input dimension and output dimension, + and returns a ``Module``. + + Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. For example, + + The space size of this cell would be :math:`|op|^{N(N-1)/2}`, where :math:`|op|` is the number of operation candidates, + and :math:`N` is defined by ``num_tensors``. + + Parameters + ---------- + op_candidates : list of callable + Operation candidates. Each should be a function accepts input feature and output feature, returning nn.Module. + in_features : int + Input dimension of cell. + out_features : int + Output dimension of cell. + num_tensors : int + Number of tensors in the cell (input included). Default: 4 + label : str + Identifier of the cell. Cell sharing the same label will semantically share the same choice. + """ + + @staticmethod + def _make_dict(x): + if isinstance(x, list): + return OrderedDict([(str(i), t) for i, t in enumerate(x)]) + return OrderedDict(x) + + def __init__(self, op_candidates: Union[Dict[str, Callable[[int, int], nn.Module]], List[Callable[[int, int], nn.Module]]], + in_features: int, out_features: int, num_tensors: int = 4, + label: Optional[str] = None): + super().__init__() + self._label = generate_new_label(label) + + self.layers = nn.ModuleList() + self.in_features = in_features + self.out_features = out_features + self.num_tensors = num_tensors + + op_candidates = self._make_dict(op_candidates) + + for tid in range(1, num_tensors): + node_ops = nn.ModuleList() + for j in range(tid): + inp = in_features if j == 0 else out_features + op_choices = OrderedDict([(key, cls(inp, out_features)) + for key, cls in op_candidates.items()]) + node_ops.append(LayerChoice(op_choices, label=f'{self._label}__{j}_{tid}')) # put __ here to be compatible with base engine + self.layers.append(node_ops) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + The forward of input choice is simply selecting first on all choices. + It shouldn't be called directly by users in most cases. 
+ """ + tensors: List[torch.Tensor] = [inputs] + for layer in self.layers: + current_tensor: List[torch.Tensor] = [] + for i, op in enumerate(layer): # type: ignore + current_tensor.append(op(tensors[i])) # type: ignore + tensors.append(torch.sum(torch.stack(current_tensor), 0)) + return tensors[-1] diff --git a/nni/retiarii/hub/pytorch/nasbench101.py b/nni/nas/hub/pytorch/nasbench101.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasbench101.py rename to nni/nas/hub/pytorch/nasbench101.py diff --git a/nni/retiarii/hub/pytorch/nasbench201.py b/nni/nas/hub/pytorch/nasbench201.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasbench201.py rename to nni/nas/hub/pytorch/nasbench201.py diff --git a/nni/retiarii/hub/pytorch/nasnet.py b/nni/nas/hub/pytorch/nasnet.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasnet.py rename to nni/nas/hub/pytorch/nasnet.py diff --git a/nni/retiarii/hub/pytorch/proxylessnas.py b/nni/nas/hub/pytorch/proxylessnas.py similarity index 100% rename from nni/retiarii/hub/pytorch/proxylessnas.py rename to nni/nas/hub/pytorch/proxylessnas.py diff --git a/nni/retiarii/hub/pytorch/shufflenet.py b/nni/nas/hub/pytorch/shufflenet.py similarity index 100% rename from nni/retiarii/hub/pytorch/shufflenet.py rename to nni/nas/hub/pytorch/shufflenet.py diff --git a/nni/retiarii/hub/pytorch/utils/fixed.py b/nni/nas/hub/pytorch/utils/fixed.py similarity index 100% rename from nni/retiarii/hub/pytorch/utils/fixed.py rename to nni/nas/hub/pytorch/utils/fixed.py diff --git a/nni/retiarii/hub/pytorch/utils/pretrained.py b/nni/nas/hub/pytorch/utils/pretrained.py similarity index 100% rename from nni/retiarii/hub/pytorch/utils/pretrained.py rename to nni/nas/hub/pytorch/utils/pretrained.py diff --git a/nni/retiarii/mutator.py b/nni/nas/mutable/mutator.py similarity index 100% rename from nni/retiarii/mutator.py rename to nni/nas/mutable/mutator.py diff --git a/nni/retiarii/nn/pytorch/cell.py b/nni/nas/nn/pytorch/cell.py similarity index 100% rename from nni/retiarii/nn/pytorch/cell.py rename to nni/nas/nn/pytorch/cell.py diff --git a/nni/retiarii/nn/pytorch/api.py b/nni/nas/nn/pytorch/choice.py similarity index 100% rename from nni/retiarii/nn/pytorch/api.py rename to nni/nas/nn/pytorch/choice.py diff --git a/nni/retiarii/nn/pytorch/nn.py b/nni/nas/nn/pytorch/layers.py similarity index 100% rename from nni/retiarii/nn/pytorch/nn.py rename to nni/nas/nn/pytorch/layers.py diff --git a/nni/retiarii/nn/pytorch/mutation_utils.py b/nni/nas/nn/pytorch/mutation_utils.py similarity index 100% rename from nni/retiarii/nn/pytorch/mutation_utils.py rename to nni/nas/nn/pytorch/mutation_utils.py diff --git a/nni/retiarii/nn/pytorch/mutator.py b/nni/nas/nn/pytorch/mutator.py similarity index 100% rename from nni/retiarii/nn/pytorch/mutator.py rename to nni/nas/nn/pytorch/mutator.py diff --git a/nni/retiarii/nn/pytorch/component.py b/nni/nas/nn/pytorch/repeat.py similarity index 61% rename from nni/retiarii/nn/pytorch/component.py rename to nni/nas/nn/pytorch/repeat.py index 3340116f4..6168d947a 100644 --- a/nni/retiarii/nn/pytorch/component.py +++ b/nni/nas/nn/pytorch/repeat.py @@ -3,21 +3,17 @@ import copy import warnings -from collections import OrderedDict -from typing import Callable, List, Dict, Union, Tuple, Optional +from typing import Callable, List, Union, Tuple, Optional -import torch import torch.nn as nn -from nni.retiarii.utils import NoContextError, STATE_DICT_PY_MAPPING_PARTIAL +from nni.nas.utils import NoContextError, 
STATE_DICT_PY_MAPPING_PARTIAL -from .api import LayerChoice, ValueChoice, ValueChoiceX, ChoiceOf -from .cell import Cell -from .nasbench101 import NasBench101Cell, NasBench101Mutator -from .mutation_utils import Mutable, generate_new_label, get_fixed_value +from .choice import ValueChoice, ValueChoiceX, ChoiceOf +from .mutation_utils import Mutable, get_fixed_value -__all__ = ['Repeat', 'Cell', 'NasBench101Cell', 'NasBench101Mutator', 'NasBench201Cell'] +__all__ = ['Repeat'] class Repeat(Mutable): @@ -159,77 +155,3 @@ class Repeat(Mutable): def __len__(self): return self.max_depth - - -class NasBench201Cell(nn.Module): - """ - Cell structure that is proposed in NAS-Bench-201. - - Proposed by `NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search `__. - - This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is tensor. - For every i < j, there is an edge from i-th node to j-th node. - Each edge in this DAG is associated with an operation transforming the hidden state from the source node - to the target node. All possible operations are selected from a predefined operation set, defined in ``op_candidates``. - Each of the ``op_candidates`` should be a callable that accepts input dimension and output dimension, - and returns a ``Module``. - - Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. For example, - - The space size of this cell would be :math:`|op|^{N(N-1)/2}`, where :math:`|op|` is the number of operation candidates, - and :math:`N` is defined by ``num_tensors``. - - Parameters - ---------- - op_candidates : list of callable - Operation candidates. Each should be a function accepts input feature and output feature, returning nn.Module. - in_features : int - Input dimension of cell. - out_features : int - Output dimension of cell. - num_tensors : int - Number of tensors in the cell (input included). Default: 4 - label : str - Identifier of the cell. Cell sharing the same label will semantically share the same choice. - """ - - @staticmethod - def _make_dict(x): - if isinstance(x, list): - return OrderedDict([(str(i), t) for i, t in enumerate(x)]) - return OrderedDict(x) - - def __init__(self, op_candidates: Union[Dict[str, Callable[[int, int], nn.Module]], List[Callable[[int, int], nn.Module]]], - in_features: int, out_features: int, num_tensors: int = 4, - label: Optional[str] = None): - super().__init__() - self._label = generate_new_label(label) - - self.layers = nn.ModuleList() - self.in_features = in_features - self.out_features = out_features - self.num_tensors = num_tensors - - op_candidates = self._make_dict(op_candidates) - - for tid in range(1, num_tensors): - node_ops = nn.ModuleList() - for j in range(tid): - inp = in_features if j == 0 else out_features - op_choices = OrderedDict([(key, cls(inp, out_features)) - for key, cls in op_candidates.items()]) - node_ops.append(LayerChoice(op_choices, label=f'{self._label}__{j}_{tid}')) # put __ here to be compatible with base engine - self.layers.append(node_ops) - - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - """ - The forward of input choice is simply selecting first on all choices. - It shouldn't be called directly by users in most cases. 
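# Minimal sketch of instantiating the NasBench201Cell that this diff moves into
# nni/nas/hub/pytorch/modules/nasbench201.py. The candidate names and channel sizes
# are assumptions; each candidate is a callable (in_features, out_features) -> nn.Module.
import torch.nn as nn
from nni.nas.hub.pytorch.modules.nasbench201 import NasBench201Cell

op_candidates = {
    'conv1x1': lambda cin, cout: nn.Conv2d(cin, cout, 1),
    'conv3x3': lambda cin, cout: nn.Conv2d(cin, cout, 3, padding=1),
    'avgpool3x3': lambda cin, cout: nn.Sequential(nn.Conv2d(cin, cout, 1),
                                                  nn.AvgPool2d(3, stride=1, padding=1)),
}
# num_tensors=4 gives 6 edges, so the space size is |op| ** 6 as stated in the docstring.
cell = NasBench201Cell(op_candidates, in_features=16, out_features=16, num_tensors=4)
# The cell is used as a regular submodule inside a larger model space; the operator
# chosen for each edge is decided by whichever NAS strategy explores the space.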
- """ - tensors: List[torch.Tensor] = [inputs] - for layer in self.layers: - current_tensor: List[torch.Tensor] = [] - for i, op in enumerate(layer): # type: ignore - current_tensor.append(op(tensors[i])) # type: ignore - tensors.append(torch.sum(torch.stack(current_tensor), 0)) - return tensors[-1] diff --git a/nni/retiarii/nn/tensorflow/api.py b/nni/nas/nn/tensorflow/api.py similarity index 100% rename from nni/retiarii/nn/tensorflow/api.py rename to nni/nas/nn/tensorflow/api.py diff --git a/nni/retiarii/oneshot/pytorch/base_lightning.py b/nni/nas/oneshot/pytorch/base_lightning.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/base_lightning.py rename to nni/nas/oneshot/pytorch/base_lightning.py diff --git a/nni/retiarii/oneshot/pytorch/dataloader.py b/nni/nas/oneshot/pytorch/dataloader.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/dataloader.py rename to nni/nas/oneshot/pytorch/dataloader.py diff --git a/nni/retiarii/oneshot/pytorch/differentiable.py b/nni/nas/oneshot/pytorch/differentiable.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/differentiable.py rename to nni/nas/oneshot/pytorch/differentiable.py diff --git a/nni/nas/oneshot/pytorch/enas.py b/nni/nas/oneshot/pytorch/enas.py new file mode 100644 index 000000000..7398e4072 --- /dev/null +++ b/nni/nas/oneshot/pytorch/enas.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from typing import cast + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class StackedLSTMCell(nn.Module): + def __init__(self, layers, size, bias): + super().__init__() + self.lstm_num_layers = layers + self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) + for _ in range(self.lstm_num_layers)]) + + def forward(self, inputs, hidden): + prev_h, prev_c = hidden + next_h, next_c = [], [] + for i, m in enumerate(self.lstm_modules): + curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) + next_c.append(curr_c) + next_h.append(curr_h) + # current implementation only supports batch size equals 1, + # but the algorithm does not necessarily have this limitation + inputs = curr_h[-1].view(1, -1) + return next_h, next_c + + +class ReinforceField: + """ + A field with ``name``, with ``total`` choices. ``choose_one`` is true if one and only one is meant to be + selected. Otherwise, any number of choices can be chosen. + """ + + def __init__(self, name, total, choose_one): + self.name = name + self.total = total + self.choose_one = choose_one + + def __repr__(self): + return f'ReinforceField(name={self.name}, total={self.total}, choose_one={self.choose_one})' + + +class ReinforceController(nn.Module): + """ + A controller that mutates the graph with RL. + + Parameters + ---------- + fields : list of ReinforceField + List of fields to choose. + lstm_size : int + Controller LSTM hidden units. + lstm_num_layers : int + Number of layers for stacked LSTM. + tanh_constant : float + Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. + skip_target : float + Target probability that skipconnect (chosen by InputChoice) will appear. + If the chosen number of inputs is away from the ``skip_connect``, there will be + a sample skip penalty which is a KL divergence added. + temperature : float + Temperature constant that divides the logits. + entropy_reduction : str + Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. 
+ """ + + def __init__(self, fields, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, + skip_target=0.4, temperature=None, entropy_reduction='sum'): + super(ReinforceController, self).__init__() + self.fields = fields + self.lstm_size = lstm_size + self.lstm_num_layers = lstm_num_layers + self.tanh_constant = tanh_constant + self.temperature = temperature + self.skip_target = skip_target + + self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) + self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) + self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) + self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) + self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) + self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), # pylint: disable=not-callable + requires_grad=False) + assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' + self.entropy_reduction = torch.sum if entropy_reduction == 'sum' else torch.mean + self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') + self.soft = nn.ModuleDict({ + field.name: nn.Linear(self.lstm_size, field.total, bias=False) for field in fields + }) + self.embedding = nn.ModuleDict({ + field.name: nn.Embedding(field.total, self.lstm_size) for field in fields + }) + + def resample(self): + self._initialize() + result = dict() + for field in self.fields: + result[field.name] = self._sample_single(field) + return result + + def _initialize(self): + self._inputs = self.g_emb.data + self._c = [torch.zeros((1, self.lstm_size), + dtype=self._inputs.dtype, + device=self._inputs.device) for _ in range(self.lstm_num_layers)] + self._h = [torch.zeros((1, self.lstm_size), + dtype=self._inputs.dtype, + device=self._inputs.device) for _ in range(self.lstm_num_layers)] + self.sample_log_prob: torch.Tensor = cast(torch.Tensor, 0) + self.sample_entropy: torch.Tensor = cast(torch.Tensor, 0) + self.sample_skip_penalty: torch.Tensor = cast(torch.Tensor, 0) + + def _lstm_next_step(self): + self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) + + def _sample_single(self, field): + self._lstm_next_step() + logit = self.soft[field.name](self._h[-1]) + if self.temperature is not None: + logit /= self.temperature + if self.tanh_constant is not None: + logit = self.tanh_constant * torch.tanh(logit) + if field.choose_one: + sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) + log_prob = self.cross_entropy_loss(logit, sampled) + self._inputs = self.embedding[field.name](sampled) + else: + logit = logit.view(-1, 1) + logit = torch.cat([-logit, logit], 1) # pylint: disable=invalid-unary-operand-type + sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) + skip_prob = torch.sigmoid(logit) + kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) + self.sample_skip_penalty += kl + log_prob = self.cross_entropy_loss(logit, sampled) + sampled = sampled.nonzero().view(-1) + if sampled.sum().item(): + self._inputs = (torch.sum(self.embedding[field.name](sampled.view(-1)), 0) / (1. 
+ torch.sum(sampled))).unsqueeze(0) + else: + self._inputs = torch.zeros(1, self.lstm_size, device=self.embedding[field.name].weight.device) # type: ignore + + sampled = sampled.detach().cpu().numpy().tolist() + self.sample_log_prob += self.entropy_reduction(log_prob) + entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type + self.sample_entropy += self.entropy_reduction(entropy) + if len(sampled) == 1: + sampled = sampled[0] + return sampled diff --git a/nni/retiarii/oneshot/pytorch/sampling.py b/nni/nas/oneshot/pytorch/sampling.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/sampling.py rename to nni/nas/oneshot/pytorch/sampling.py diff --git a/nni/retiarii/oneshot/pytorch/strategy.py b/nni/nas/oneshot/pytorch/strategy.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/strategy.py rename to nni/nas/oneshot/pytorch/strategy.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py b/nni/nas/oneshot/pytorch/supermodule/_operation_utils.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py rename to nni/nas/oneshot/pytorch/supermodule/_operation_utils.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py b/nni/nas/oneshot/pytorch/supermodule/_singlepathnas.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py rename to nni/nas/oneshot/pytorch/supermodule/_singlepathnas.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py b/nni/nas/oneshot/pytorch/supermodule/_valuechoice_utils.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py rename to nni/nas/oneshot/pytorch/supermodule/_valuechoice_utils.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/base.py b/nni/nas/oneshot/pytorch/supermodule/base.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/base.py rename to nni/nas/oneshot/pytorch/supermodule/base.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/differentiable.py b/nni/nas/oneshot/pytorch/supermodule/differentiable.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/differentiable.py rename to nni/nas/oneshot/pytorch/supermodule/differentiable.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/operation.py b/nni/nas/oneshot/pytorch/supermodule/operation.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/operation.py rename to nni/nas/oneshot/pytorch/supermodule/operation.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/proxyless.py b/nni/nas/oneshot/pytorch/supermodule/proxyless.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/proxyless.py rename to nni/nas/oneshot/pytorch/supermodule/proxyless.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/sampling.py b/nni/nas/oneshot/pytorch/supermodule/sampling.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/sampling.py rename to nni/nas/oneshot/pytorch/supermodule/sampling.py diff --git a/nni/nas/pytorch/__init__.py b/nni/nas/pytorch/__init__.py deleted file mode 100644 index 927935baf..000000000 --- a/nni/nas/pytorch/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
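# Sketch of driving the ReinforceController added above. The field names, sizes,
# reward and skip weight are assumptions; in practice the fields mirror the
# LayerChoice/InputChoice decisions discovered in the model space.
from nni.nas.oneshot.pytorch.enas import ReinforceController, ReinforceField

fields = [
    ReinforceField('op_1', total=4, choose_one=True),     # pick exactly one of 4 operators
    ReinforceField('skip_2', total=2, choose_one=False),  # pick any subset of 2 inputs
]
ctrl = ReinforceController(fields, lstm_size=64, entropy_reduction='sum')
sample = ctrl.resample()                                  # e.g. {'op_1': 2, 'skip_2': [0, 1]}
reward = 0.5                                              # placeholder architecture reward
baseline = 0.0                                            # a real trainer keeps a moving baseline
# sample_log_prob accumulates cross-entropy terms, so this mirrors the trainer convention above.
loss = ctrl.sample_log_prob * (reward - baseline) + 0.8 * ctrl.sample_skip_penalty
loss.backward()                                           # REINFORCE gradient for the controller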
- -from .base_mutator import BaseMutator -from .base_trainer import BaseTrainer -from .fixed import apply_fixed_architecture -from .mutables import Mutable, LayerChoice, InputChoice -from .mutator import Mutator -from .trainer import Trainer diff --git a/nni/nas/pytorch/base_mutator.py b/nni/nas/pytorch/base_mutator.py deleted file mode 100644 index df1a5f9ba..000000000 --- a/nni/nas/pytorch/base_mutator.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch.nn as nn -from nni.nas.pytorch.mutables import Mutable, MutableScope, InputChoice -from nni.nas.pytorch.utils import StructuredMutableTreeNode - -logger = logging.getLogger(__name__) - - -class BaseMutator(nn.Module): - """ - A mutator is responsible for mutating a graph by obtaining the search space from the network and implementing - callbacks that are called in ``forward`` in mutables. - - Parameters - ---------- - model : nn.Module - PyTorch model to apply mutator on. - """ - - def __init__(self, model): - super().__init__() - self.__dict__["model"] = model - self._structured_mutables = self._parse_search_space(self.model) - - def _parse_search_space(self, module, root=None, prefix="", memo=None, nested_detection=None): - if memo is None: - memo = set() - if root is None: - root = StructuredMutableTreeNode(None) - if module not in memo: - memo.add(module) - if isinstance(module, Mutable): - if nested_detection is not None: - raise RuntimeError("Cannot have nested search space. Error at {} in {}" - .format(module, nested_detection)) - module.name = prefix - module.set_mutator(self) - root = root.add_child(module) - if not isinstance(module, MutableScope): - nested_detection = module - if isinstance(module, InputChoice): - for k in module.choose_from: - if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: - raise RuntimeError("'{}' required by '{}' not found in keys that appeared before, and is not NO_KEY." - .format(k, module.key)) - for name, submodule in module._modules.items(): - if submodule is None: - continue - submodule_prefix = prefix + ("." if prefix else "") + name - self._parse_search_space(submodule, root, submodule_prefix, memo=memo, - nested_detection=nested_detection) - return root - - @property - def mutables(self): - """ - A generator of all modules inheriting :class:`~nni.nas.pytorch.mutables.Mutable`. - Modules are yielded in the order that they are defined in ``__init__``. - For mutables with their keys appearing multiple times, only the first one will appear. - """ - return self._structured_mutables - - @property - def undedup_mutables(self): - return self._structured_mutables.traverse(deduplicate=False) - - def forward(self, *inputs): - """ - Warnings - -------- - Don't call forward of a mutator. - """ - raise RuntimeError("Forward is undefined for mutators.") - - def __setattr__(self, name, value): - if name == "model": - raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " - "include you network, as it will include all parameters in model into the mutator.") - return super().__setattr__(name, value) - - def enter_mutable_scope(self, mutable_scope): - """ - Callback when forward of a MutableScope is entered. - - Parameters - ---------- - mutable_scope : MutableScope - The mutable scope that is entered. - """ - pass - - def exit_mutable_scope(self, mutable_scope): - """ - Callback when forward of a MutableScope is exited. 
- - Parameters - ---------- - mutable_scope : MutableScope - The mutable scope that is exited. - """ - pass - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callbacks of forward in LayerChoice. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.LayerChoice - Module whose forward is called. - args : list of torch.Tensor - The arguments of its forward function. - kwargs : dict - The keyword arguments of its forward function. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output tensor and mask. - """ - raise NotImplementedError - - def on_forward_input_choice(self, mutable, tensor_list): - """ - Callbacks of forward in InputChoice. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.InputChoice - Mutable that is called. - tensor_list : list of torch.Tensor - The arguments mutable is called with. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output tensor and mask. - """ - raise NotImplementedError - - def export(self): - """ - Export the data of all decisions. This should output the decisions of all the mutables, so that the whole - network can be fully determined with these decisions for further training from scratch. - - Returns - ------- - dict - Mappings from mutable keys to decisions. - """ - raise NotImplementedError diff --git a/nni/nas/pytorch/base_trainer.py b/nni/nas/pytorch/base_trainer.py deleted file mode 100644 index 2e7a4a2a2..000000000 --- a/nni/nas/pytorch/base_trainer.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from abc import ABC, abstractmethod - - -class BaseTrainer(ABC): - - @abstractmethod - def train(self): - """ - Override the method to train. - """ - raise NotImplementedError - - @abstractmethod - def validate(self): - """ - Override the method to validate. - """ - raise NotImplementedError - - @abstractmethod - def export(self, file): - """ - Override the method to export to file. - - Parameters - ---------- - file : str - File path to export to. - """ - raise NotImplementedError - - @abstractmethod - def checkpoint(self): - """ - Override to dump a checkpoint. - """ - raise NotImplementedError diff --git a/nni/nas/pytorch/callbacks.py b/nni/nas/pytorch/callbacks.py deleted file mode 100644 index 86a0dc380..000000000 --- a/nni/nas/pytorch/callbacks.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import os - -import torch -import torch.nn as nn - -_logger = logging.getLogger(__name__) - - -class Callback: - """ - Callback provides an easy way to react to events like begin/end of epochs. - """ - - def __init__(self): - self.model = None - self.mutator = None - self.trainer = None - - def build(self, model, mutator, trainer): - """ - Callback needs to be built with model, mutator, trainer, to get updates from them. - - Parameters - ---------- - model : nn.Module - Model to be trained. - mutator : nn.Module - Mutator that mutates the model. - trainer : BaseTrainer - Trainer that is to call the callback. - """ - self.model = model - self.mutator = mutator - self.trainer = trainer - - def on_epoch_begin(self, epoch): - """ - Implement this to do something at the begin of epoch. - - Parameters - ---------- - epoch : int - Epoch number, starting from 0. - """ - pass - - def on_epoch_end(self, epoch): - """ - Implement this to do something at the end of epoch. - - Parameters - ---------- - epoch : int - Epoch number, starting from 0. 
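# Sketch of the Callback hooks described above, written against the legacy
# nni.nas.pytorch API that this diff removes. The callback name and logging
# behaviour are illustrative assumptions.
import logging
from nni.nas.pytorch.callbacks import Callback

class ExportOnEpochEnd(Callback):
    def on_epoch_end(self, epoch):
        # self.model / self.mutator / self.trainer are injected via build()
        logging.getLogger(__name__).info(
            "epoch %d architecture: %s", epoch, self.mutator.export())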
- """ - pass - - def on_batch_begin(self, epoch): - pass - - def on_batch_end(self, epoch): - pass - - -class LRSchedulerCallback(Callback): - """ - Calls scheduler on every epoch ends. - - Parameters - ---------- - scheduler : LRScheduler - Scheduler to be called. - """ - def __init__(self, scheduler, mode="epoch"): - super().__init__() - assert mode == "epoch" - self.scheduler = scheduler - self.mode = mode - - def on_epoch_end(self, epoch): - """ - Call ``self.scheduler.step()`` on epoch end. - """ - self.scheduler.step() - - -class ArchitectureCheckpoint(Callback): - """ - Calls ``trainer.export()`` on every epoch ends. - - Parameters - ---------- - checkpoint_dir : str - Location to save checkpoints. - """ - def __init__(self, checkpoint_dir): - super().__init__() - self.checkpoint_dir = checkpoint_dir - os.makedirs(self.checkpoint_dir, exist_ok=True) - - def on_epoch_end(self, epoch): - """ - Dump to ``/checkpoint_dir/epoch_{number}.json`` on epoch end. - """ - dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch)) - _logger.info("Saving architecture to %s", dest_path) - self.trainer.export(dest_path) - - -class ModelCheckpoint(Callback): - """ - Calls ``trainer.export()`` on every epoch ends. - - Parameters - ---------- - checkpoint_dir : str - Location to save checkpoints. - """ - def __init__(self, checkpoint_dir): - super().__init__() - self.checkpoint_dir = checkpoint_dir - os.makedirs(self.checkpoint_dir, exist_ok=True) - - def on_epoch_end(self, epoch): - """ - Dump to ``/checkpoint_dir/epoch_{number}.pth.tar`` on every epoch end. - ``DataParallel`` object will have their inside modules exported. - """ - if isinstance(self.model, nn.DataParallel): - state_dict = self.model.module.state_dict() - else: - state_dict = self.model.state_dict() - dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.pth.tar".format(epoch)) - _logger.info("Saving model to %s", dest_path) - torch.save(state_dict, dest_path) diff --git a/nni/nas/pytorch/fixed.py b/nni/nas/pytorch/fixed.py deleted file mode 100644 index 9bfa933e8..000000000 --- a/nni/nas/pytorch/fixed.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging - -from .mutables import InputChoice, LayerChoice, MutableScope -from .mutator import Mutator -from .utils import to_list - - -_logger = logging.getLogger(__name__) - - -class FixedArchitecture(Mutator): - """ - Fixed architecture mutator that always selects a certain graph. - - Parameters - ---------- - model : nn.Module - A mutable network. - fixed_arc : dict - Preloaded architecture object. - strict : bool - Force everything that appears in ``fixed_arc`` to be used at least once. 
- verbose : bool - Print log messages if set to True - """ - - def __init__(self, model, fixed_arc, strict=True, verbose=True): - super().__init__(model) - self._fixed_arc = fixed_arc - self.verbose = verbose - - mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)]) - fixed_arc_keys = set(self._fixed_arc.keys()) - if fixed_arc_keys - mutable_keys: - raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys)) - if mutable_keys - fixed_arc_keys: - raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys)) - self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc) - - def _from_human_readable_architecture(self, human_arc): - # convert from an exported architecture - result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc. - # First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"}, - # which means {"op1": [0, ]} ir {"op1": ["conv", ]} - result_arc = {k: v if isinstance(v, list) else [v] for k, v in result_arc.items()} - # Second, infer which ones are multi-hot arrays and which ones are in human-readable format. - # This is non-trivial, since if an array in [0, 1], we cannot know for sure it means [false, true] or [true, true]. - # Here, we assume an multihot array has to be a boolean array or a float array and matches the length. - for mutable in self.mutables: - if mutable.key not in result_arc: - continue # skip silently - choice_arr = result_arc[mutable.key] - if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr): - if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \ - (isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)): - # multihot, do nothing - continue - if isinstance(mutable, LayerChoice): - choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr] - choice_arr = [i in choice_arr for i in range(len(mutable))] - elif isinstance(mutable, InputChoice): - choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr] - choice_arr = [i in choice_arr for i in range(mutable.n_candidates)] - result_arc[mutable.key] = choice_arr - return result_arc - - def sample_search(self): - """ - Always returns the fixed architecture. - """ - return self._fixed_arc - - def sample_final(self): - """ - Always returns the fixed architecture. - """ - return self._fixed_arc - - def replace_layer_choice(self, module=None, prefix=""): - """ - Replace layer choices with selected candidates. It's done with best effort. - In case of weighted choices or multiple choices. if some of the choices on weighted with zero, delete them. - If single choice, replace the module with a normal module. - - Parameters - ---------- - module : nn.Module - Module to be processed. - prefix : str - Module name under global namespace. - """ - if module is None: - module = self.model - for name, mutable in module.named_children(): - global_name = (prefix + "." 
if prefix else "") + name - if isinstance(mutable, LayerChoice): - chosen = self._fixed_arc[mutable.key] - if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask: - # sum is one, max is one, there has to be an only one - # this is compatible with both integer arrays, boolean arrays and float arrays - if self.verbose: - _logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1)) - setattr(module, name, mutable[chosen.index(1)]) - else: - if mutable.return_mask and self.verbose: - _logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, " \ - "LayerChoice will not be replaced.") - # remove unused parameters - for ch, n in zip(chosen, mutable.names): - if ch == 0 and not isinstance(ch, float): - setattr(mutable, n, None) - else: - self.replace_layer_choice(mutable, global_name) - - -def apply_fixed_architecture(model, fixed_arc, verbose=True): - """ - Load architecture from `fixed_arc` and apply to model. - - Parameters - ---------- - model : torch.nn.Module - Model with mutables. - fixed_arc : str or dict - Path to the JSON that stores the architecture, or dict that stores the exported architecture. - verbose : bool - Print log messages if set to True - - Returns - ------- - FixedArchitecture - Mutator that is responsible for fixes the graph. - """ - - if isinstance(fixed_arc, str): - with open(fixed_arc) as f: - fixed_arc = json.load(f) - architecture = FixedArchitecture(model, fixed_arc, verbose) - architecture.reset() - - # for the convenience of parameters counting - architecture.replace_layer_choice() - return architecture diff --git a/nni/nas/pytorch/mutables.py b/nni/nas/pytorch/mutables.py deleted file mode 100644 index 7fbb655e5..000000000 --- a/nni/nas/pytorch/mutables.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import warnings -from collections import OrderedDict - -import torch.nn as nn - -from nni.nas.pytorch.utils import global_mutable_counting - -logger = logging.getLogger(__name__) - - -class Mutable(nn.Module): - """ - Mutable is designed to function as a normal layer, with all necessary operators' weights. - States and weights of architectures should be included in mutator, instead of the layer itself. - - Mutable has a key, which marks the identity of the mutable. This key can be used by users to share - decisions among different mutables. In mutator's implementation, mutators should use the key to - distinguish different mutables. Mutables that share the same key should be "similar" to each other. - - Currently the default scope for keys is global. By default, the keys uses a global counter from 1 to - produce unique ids. - - Parameters - ---------- - key : str - The key of mutable. - - Notes - ----- - The counter is program level, but mutables are model level. In case multiple models are defined, and - you want to have `counter` starting from 1 in the second model, it's recommended to assign keys manually - instead of using automatic keys. 
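# Sketch of the manual-key recommendation above (legacy nni.nas.pytorch API removed
# by this diff): mutables constructed with the same key share a single decision,
# which is how choice sharing across layers was expressed.
import torch.nn as nn
from nni.nas.pytorch.mutables import LayerChoice

class TwoBlockNet(nn.Module):
    def __init__(self):
        super().__init__()
        candidates = lambda: [nn.Conv2d(16, 16, 3, padding=1), nn.Conv2d(16, 16, 5, padding=2)]
        # identical key => the mutator samples one decision and applies it to both blocks
        self.block1 = LayerChoice(candidates(), key='shared_conv')
        self.block2 = LayerChoice(candidates(), key='shared_conv')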
- """ - - def __init__(self, key=None): - super().__init__() - if key is not None: - if not isinstance(key, str): - key = str(key) - logger.warning("Warning: key \"%s\" is not string, converted to string.", key) - self._key = key - else: - self._key = self.__class__.__name__ + str(global_mutable_counting()) - self.init_hook = self.forward_hook = None - - def __deepcopy__(self, memodict=None): - raise NotImplementedError("Deep copy doesn't work for mutables.") - - def __call__(self, *args, **kwargs): - self._check_built() - return super().__call__(*args, **kwargs) - - def set_mutator(self, mutator): - if "mutator" in self.__dict__: - raise RuntimeError("`set_mutator` is called more than once. Did you parse the search space multiple times? " - "Or did you apply multiple fixed architectures?") - self.__dict__["mutator"] = mutator - - @property - def key(self): - """ - Read-only property of key. - """ - return self._key - - @property - def name(self): - """ - After the search space is parsed, it will be the module name of the mutable. - """ - return self._name if hasattr(self, "_name") else self._key - - @name.setter - def name(self, name): - self._name = name - - def _check_built(self): - if not hasattr(self, "mutator"): - raise ValueError( - "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " - "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " - "so that trainer can locate all your mutables. See NNI docs for more details.".format(self)) - - -class MutableScope(Mutable): - """ - Mutable scope marks a subgraph/submodule to help mutators make better decisions. - - If not annotated with mutable scope, search space will be flattened as a list. However, some mutators might - need to leverage the concept of a "cell". So if a module is defined as a mutable scope, everything in it will - look like "sub-search-space" in the scope. Scopes can be nested. - - There are two ways mutators can use mutable scope. One is to traverse the search space as a tree during initialization - and reset. The other is to implement `enter_mutable_scope` and `exit_mutable_scope`. They are called before and after - the forward method of the class inheriting mutable scope. - - Mutable scopes are also mutables that are listed in the mutator.mutables (search space), but they are not supposed - to appear in the dict of choices. - - Parameters - ---------- - key : str - Key of mutable scope. - """ - def __init__(self, key): - super().__init__(key=key) - - def _check_built(self): - return True # bypass the test because it's deprecated - - def __call__(self, *args, **kwargs): - if not hasattr(self, 'mutator'): - return super().__call__(*args, **kwargs) - warnings.warn("`MutableScope` is deprecated in Retiarii.", DeprecationWarning) - try: - self._check_built() - self.mutator.enter_mutable_scope(self) - return super().__call__(*args, **kwargs) - finally: - self.mutator.exit_mutable_scope(self) - - -class LayerChoice(Mutable): - """ - Layer choice selects one of the ``op_candidates``, then apply it on inputs and return results. - In rare cases, it can also select zero or many. - - Layer choice does not allow itself to be nested. - - Parameters - ---------- - op_candidates : list of nn.Module or OrderedDict - A module list to be selected from. - reduction : str - ``mean``, ``concat``, ``sum`` or ``none``. Policy if multiples are selected. - If ``none``, a list is returned. ``mean`` returns the average. ``sum`` returns the sum. 
- ``concat`` concatenate the list at dimension 1. - return_mask : bool - If ``return_mask``, return output tensor and a mask. Otherwise return tensor only. - key : str - Key of the input choice. - - Attributes - ---------- - length : int - Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended. - names : list of str - Names of candidates. - choices : list of Module - Deprecated. A list of all candidate modules in the layer choice module. - ``list(layer_choice)`` is recommended, which will serve the same purpose. - - Notes - ----- - ``op_candidates`` can be a list of modules or a ordered dict of named modules, for example, - - .. code-block:: python - - self.op_choice = LayerChoice(OrderedDict([ - ("conv3x3", nn.Conv2d(3, 16, 128)), - ("conv5x5", nn.Conv2d(5, 16, 128)), - ("conv7x7", nn.Conv2d(7, 16, 128)) - ])) - - Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or - ``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet. - """ - - def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None): - super().__init__(key=key) - self.names = [] - if isinstance(op_candidates, OrderedDict): - for name, module in op_candidates.items(): - assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ - "Please don't use a reserved name '{}' for your module.".format(name) - self.add_module(name, module) - self.names.append(name) - elif isinstance(op_candidates, list): - for i, module in enumerate(op_candidates): - self.add_module(str(i), module) - self.names.append(str(i)) - else: - raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) - self.reduction = reduction - self.return_mask = return_mask - - def __getitem__(self, idx): - if isinstance(idx, str): - return self._modules[idx] - return list(self)[idx] - - def __setitem__(self, idx, module): - key = idx if isinstance(idx, str) else self.names[idx] - return setattr(self, key, module) - - def __delitem__(self, idx): - if isinstance(idx, slice): - for key in self.names[idx]: - delattr(self, key) - else: - if isinstance(idx, str): - key, idx = idx, self.names.index(idx) - else: - key = self.names[idx] - delattr(self, key) - del self.names[idx] - - @property - def length(self): - warnings.warn("layer_choice.length is deprecated. Use `len(layer_choice)` instead.", DeprecationWarning) - return len(self) - - def __len__(self): - return len(self.names) - - def __iter__(self): - return map(lambda name: self._modules[name], self.names) - - @property - def choices(self): - warnings.warn("layer_choice.choices is deprecated. Use `list(layer_choice)` instead.", DeprecationWarning) - return list(self) - - def forward(self, *args, **kwargs): - """ - Returns - ------- - tuple of tensors - Output and selection mask. If ``return_mask`` is ``False``, only output is returned. - """ - out, mask = self.mutator.on_forward_layer_choice(self, *args, **kwargs) - if self.return_mask: - return out, mask - return out - - -class InputChoice(Mutable): - """ - Input choice selects ``n_chosen`` inputs from ``choose_from`` (contains ``n_candidates`` keys). For beginners, - use ``n_candidates`` instead of ``choose_from`` is a safe option. To get the most power out of it, you might want to - know about ``choose_from``. - - The keys in ``choose_from`` can be keys that appear in past mutables, or ``NO_KEY`` if there are no suitable ones. - The keys are designed to be the keys of the sources. 
To help mutators make better decisions, - mutators might be interested in how the tensors to choose from come into place. For example, the tensor is the - output of some operator, some node, some cell, or some module. If this operator happens to be a mutable (e.g., - ``LayerChoice`` or ``InputChoice``), it has a key naturally that can be used as a source key. If it's a - module/submodule, it needs to be annotated with a key: that's where a :class:`MutableScope` is needed. - - In the example below, ``input_choice`` is a 4-choose-any. The first 3 is semantically output of cell1, output of cell2, - output of cell3 with respectively. Notice that an extra max pooling is followed by cell1, indicating x1 is not - "actually" the direct output of cell1. - - .. code-block:: python - - class Cell(MutableScope): - pass - - class Net(nn.Module): - def __init__(self): - self.cell1 = Cell("cell1") - self.cell2 = Cell("cell2") - self.op = LayerChoice([conv3x3(), conv5x5()], key="op") - self.input_choice = InputChoice(choose_from=["cell1", "cell2", "op", InputChoice.NO_KEY]) - - def forward(self, x): - x1 = max_pooling(self.cell1(x)) - x2 = self.cell2(x) - x3 = self.op(x) - x4 = torch.zeros_like(x) - return self.input_choice([x1, x2, x3, x4]) - - Parameters - ---------- - n_candidates : int - Number of inputs to choose from. - choose_from : list of str - List of source keys to choose from. At least of one of ``choose_from`` and ``n_candidates`` must be fulfilled. - If ``n_candidates`` has a value but ``choose_from`` is None, it will be automatically treated as ``n_candidates`` - number of empty string. - n_chosen : int - Recommended inputs to choose. If None, mutator is instructed to select any. - reduction : str - ``mean``, ``concat``, ``sum`` or ``none``. See :class:`LayerChoice`. - return_mask : bool - If ``return_mask``, return output tensor and a mask. Otherwise return tensor only. - key : str - Key of the input choice. - """ - - NO_KEY = "" - - def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, - reduction="sum", return_mask=False, key=None): - super().__init__(key=key) - # precondition check - assert n_candidates is not None or choose_from is not None, "At least one of `n_candidates` and `choose_from`" \ - "must be not None." - if choose_from is not None and n_candidates is None: - n_candidates = len(choose_from) - elif choose_from is None and n_candidates is not None: - choose_from = [self.NO_KEY] * n_candidates - assert n_candidates == len(choose_from), "Number of candidates must be equal to the length of `choose_from`." - assert n_candidates > 0, "Number of candidates must be greater than 0." - assert n_chosen is None or 0 <= n_chosen <= n_candidates, "Expected selected number must be None or no more " \ - "than number of candidates." - - self.n_candidates = n_candidates - self.choose_from = choose_from.copy() - self.n_chosen = n_chosen - self.reduction = reduction - self.return_mask = return_mask - - def forward(self, optional_inputs): - """ - Forward method of LayerChoice. - - Parameters - ---------- - optional_inputs : list or dict - Recommended to be a dict. As a dict, inputs will be converted to a list that follows the order of - ``choose_from`` in initialization. As a list, inputs must follow the semantic order that is the same as - ``choose_from``. - - Returns - ------- - tuple of tensors - Output and selection mask. If ``return_mask`` is ``False``, only output is returned. 
- """ - optional_input_list = optional_inputs - if isinstance(optional_inputs, dict): - optional_input_list = [optional_inputs[tag] for tag in self.choose_from] - assert isinstance(optional_input_list, list), \ - "Optional input list must be a list, not a {}.".format(type(optional_input_list)) - assert len(optional_inputs) == self.n_candidates, \ - "Length of the input list must be equal to number of candidates." - out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) - if self.return_mask: - return out, mask - return out diff --git a/nni/nas/pytorch/mutator.py b/nni/nas/pytorch/mutator.py deleted file mode 100644 index e1894b524..000000000 --- a/nni/nas/pytorch/mutator.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from collections import defaultdict - -import numpy as np -import torch - -from .base_mutator import BaseMutator -from .mutables import LayerChoice, InputChoice -from .utils import to_list - -logger = logging.getLogger(__name__) - - -class Mutator(BaseMutator): - - def __init__(self, model): - super().__init__(model) - self._cache = dict() - self._connect_all = False - - def sample_search(self): - """ - Override to implement this method to iterate over mutables and make decisions. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - raise NotImplementedError - - def sample_final(self): - """ - Override to implement this method to iterate over mutables and make decisions that is final - for export and retraining. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - raise NotImplementedError - - def reset(self): - """ - Reset the mutator by call the `sample_search` to resample (for search). Stores the result in a local - variable so that `on_forward_layer_choice` and `on_forward_input_choice` can use the decision directly. - """ - self._cache = self.sample_search() - - def export(self): - """ - Resample (for final) and return results. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - sampled = self.sample_final() - result = dict() - for mutable in self.mutables: - if not isinstance(mutable, (LayerChoice, InputChoice)): - # not supported as built-in - continue - result[mutable.key] = self._convert_mutable_decision_to_human_readable(mutable, sampled.pop(mutable.key)) - if sampled: - raise ValueError("Unexpected keys returned from 'sample_final()': %s", list(sampled.keys())) - return result - - def status(self): - """ - Return current selection status of mutator. - - Returns - ------- - dict - A mapping from key of mutables to decisions. All weights (boolean type and float type) - are converted into real number values. Numpy arrays and tensors are converted into list. - """ - data = dict() - for k, v in self._cache.items(): - if torch.is_tensor(v): - v = v.detach().cpu().numpy().tolist() - if isinstance(v, np.ndarray): - v = v.astype(np.float32).tolist() - data[k] = v - return data - - def graph(self, inputs): - """ - Return model supernet graph. - - Parameters - ---------- - inputs: tuple of tensor - Inputs that will be feeded into the network. - - Returns - ------- - dict - Containing ``node``, in Tensorboard GraphDef format. - Additional key ``mutable`` is a map from key to list of modules. - """ - if not torch.__version__.startswith("1.4"): - logger.warning("Graph is only tested with PyTorch 1.4. 
Other versions might not work.") - from nni.common.graph_utils import build_graph - from google.protobuf import json_format - # protobuf should be installed as long as tensorboard is installed - try: - self._connect_all = True - graph_def, _ = build_graph(self.model, inputs, verbose=False) - result = json_format.MessageToDict(graph_def) - finally: - self._connect_all = False - - # `mutable` is to map the keys to a list of corresponding modules. - # A key can be linked to multiple modules, use `dedup=False` to find them all. - result["mutable"] = defaultdict(list) - for mutable in self.mutables.traverse(deduplicate=False): - # A module will be represent in the format of - # [{"type": "Net", "name": ""}, {"type": "Cell", "name": "cell1"}, {"type": "Conv2d": "name": "conv"}] - # which will be concatenated into Net/Cell[cell1]/Conv2d[conv] in frontend. - # This format is aligned with the scope name jit gives. - modules = mutable.name.split(".") - path = [ - {"type": self.model.__class__.__name__, "name": ""} - ] - m = self.model - for module in modules: - m = getattr(m, module) - path.append({ - "type": m.__class__.__name__, - "name": module - }) - result["mutable"][mutable.key].append(path) - return result - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - On default, this method retrieves the decision obtained previously, and select certain operations. - Only operations with non-zero weight will be executed. The results will be added to a list. - Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.LayerChoice - Layer choice module. - args : list of torch.Tensor - Inputs - kwargs : dict - Inputs - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output and mask. - """ - if self._connect_all: - return self._all_connect_tensor_reduction(mutable.reduction, - [op(*args, **kwargs) for op in mutable]), \ - torch.ones(len(mutable)).bool() - - def _map_fn(op, args, kwargs): - return op(*args, **kwargs) - - mask = self._get_decision(mutable) - assert len(mask) == len(mutable), \ - "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable)) - out, mask = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def on_forward_input_choice(self, mutable, tensor_list): - """ - On default, this method retrieves the decision obtained previously, and select certain tensors. - Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.InputChoice - Input choice module. - tensor_list : list of torch.Tensor - Tensor list to apply the decision on. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output and mask. - """ - if self._connect_all: - return self._all_connect_tensor_reduction(mutable.reduction, tensor_list), \ - torch.ones(mutable.n_candidates).bool() - mask = self._get_decision(mutable) - assert len(mask) == mutable.n_candidates, \ - "Invalid mask, expected {} to be of length {}.".format(mask, mutable.n_candidates) - out, mask = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def _select_with_mask(self, map_fn, candidates, mask): - """ - Select masked tensors and return a list of tensors. 
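For example (a small sketch; ``a``, ``b``, ``c`` are hypothetical tensors of identical shape and ``mutator`` is an instance of a concrete subclass):

        .. code-block:: python

            # boolean mask: keep the selected candidates unchanged
            out, mask = mutator._select_with_mask(lambda t: t, [(a,), (b,), (c,)],
                                                  torch.tensor([True, False, True]))  # out == [a, c]
            # float mask: zero entries are dropped, the rest are scaled by their weight
            out, mask = mutator._select_with_mask(lambda t: t, [(a,), (b,)],
                                                  torch.tensor([0.3, 0.7]))           # out == [0.3 * a, 0.7 * b]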
- - Parameters - ---------- - map_fn : function - Convert candidates to target candidates. Can be simply identity. - candidates : list of torch.Tensor - Tensor list to apply the decision on. - mask : list-like object - Can be a list, an numpy array or a tensor (recommended). Needs to - have the same length as ``candidates``. - - Returns - ------- - tuple of list of torch.Tensor and torch.Tensor - Output and mask. - """ - if (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], bool)) or \ - (isinstance(mask, np.ndarray) and mask.dtype == np.bool) or \ - "BoolTensor" in mask.type(): - out = [map_fn(*cand) for cand, m in zip(candidates, mask) if m] - elif (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], (float, int))) or \ - (isinstance(mask, np.ndarray) and mask.dtype in (np.float32, np.float64, np.int32, np.int64)) or \ - "FloatTensor" in mask.type(): - out = [map_fn(*cand) * m for cand, m in zip(candidates, mask) if m] - else: - raise ValueError("Unrecognized mask '%s'" % mask) - if not torch.is_tensor(mask): - mask = torch.tensor(mask) # pylint: disable=not-callable - return out, mask - - def _tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == "none": - return tensor_list - if not tensor_list: - return None # empty. return None for now - if len(tensor_list) == 1: - return tensor_list[0] - if reduction_type == "sum": - return sum(tensor_list) - if reduction_type == "mean": - return sum(tensor_list) / len(tensor_list) - if reduction_type == "concat": - return torch.cat(tensor_list, dim=1) - raise ValueError("Unrecognized reduction policy: \"{}\"".format(reduction_type)) - - def _all_connect_tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == "none": - return tensor_list - if reduction_type == "concat": - return torch.cat(tensor_list, dim=1) - return torch.stack(tensor_list).sum(0) - - def _get_decision(self, mutable): - """ - By default, this method checks whether `mutable.key` is already in the decision cache, - and returns the result without double-check. - - Parameters - ---------- - mutable : Mutable - - Returns - ------- - object - """ - if mutable.key not in self._cache: - raise ValueError("\"{}\" not found in decision cache.".format(mutable.key)) - result = self._cache[mutable.key] - logger.debug("Decision %s: %s", mutable.key, result) - return result - - def _convert_mutable_decision_to_human_readable(self, mutable, sampled): - # Assert the existence of mutable.key in returned architecture. - # Also check if there is anything extra. - multihot_list = to_list(sampled) - converted = None - # If it's a boolean array, we can do optimization. - if all([t == 0 or t == 1 for t in multihot_list]): - if isinstance(mutable, LayerChoice): - assert len(multihot_list) == len(mutable), \ - "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ - % (mutable.key, multihot_list) - # check if all modules have different names and they indeed have names - if len(set(mutable.names)) == len(mutable) and not all(d.isdigit() for d in mutable.names): - converted = [name for i, name in enumerate(mutable.names) if multihot_list[i]] - else: - converted = [i for i in range(len(multihot_list)) if multihot_list[i]] - if isinstance(mutable, InputChoice): - assert len(multihot_list) == mutable.n_candidates, \ - "Results returned from 'sample_final()' (%s: %s) either too short or too long." 
\ - % (mutable.key, multihot_list) - # check if all input candidates have different names - if len(set(mutable.choose_from)) == mutable.n_candidates: - converted = [name for i, name in enumerate(mutable.choose_from) if multihot_list[i]] - else: - converted = [i for i in range(len(multihot_list)) if multihot_list[i]] - if converted is not None: - # if only one element, then remove the bracket - if len(converted) == 1: - converted = converted[0] - else: - # do nothing - converted = multihot_list - return converted diff --git a/nni/nas/pytorch/nasbench201/__init__.py b/nni/nas/pytorch/nasbench201/__init__.py deleted file mode 100644 index df3f68e0b..000000000 --- a/nni/nas/pytorch/nasbench201/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .nasbench201 import NASBench201Cell diff --git a/nni/nas/pytorch/nasbench201/nasbench201.py b/nni/nas/pytorch/nasbench201/nasbench201.py deleted file mode 100644 index 753d9b3db..000000000 --- a/nni/nas/pytorch/nasbench201/nasbench201.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from collections import OrderedDict -import torch.nn as nn -from nni.nas.pytorch.mutables import LayerChoice - -from .nasbench201_ops import Pooling, ReLUConvBN, Zero, FactorizedReduce - - -class NASBench201Cell(nn.Module): - """ - Builtin cell structure of NAS Bench 201. One cell contains four nodes. The First node serves as an input node - accepting the output of the previous cell. And other nodes connect to all previous nodes with an edge that - represents an operation chosen from a set to transform the tensor from the source node to the target node. - Every node accepts all its inputs and adds them as its output. - - Parameters - --- - cell_id: str - the name of this cell - C_in: int - the number of input channels of the cell - C_out: int - the number of output channels of the cell - stride: int - stride of all convolution operations in the cell - bn_affine: bool - If set to ``True``, all ``torch.nn.BatchNorm2d`` in this cell will have learnable affine parameters. Default: True - bn_momentum: float - the value used for the running_mean and running_var computation. Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, all ``torch.nn.BatchNorm2d`` in this cell tracks the running mean and variance. 
Default: True - """ - - def __init__(self, cell_id, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(NASBench201Cell, self).__init__() - - self.NUM_NODES = 4 - self.layers = nn.ModuleList() - - OPS = lambda layer_idx: OrderedDict([ - ("none", Zero(C_in, C_out, stride)), - ("avg_pool_3x3", Pooling(C_in, C_out, stride if layer_idx == 0 else 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("conv_3x3", ReLUConvBN(C_in, C_out, 3, stride if layer_idx == 0 else 1, 1, 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("conv_1x1", ReLUConvBN(C_in, C_out, 1, stride if layer_idx == 0 else 1, 0, 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("skip_connect", nn.Identity() if stride == 1 and C_in == C_out - else FactorizedReduce(C_in, C_out, stride if layer_idx == 0 else 1, bn_affine, bn_momentum, - bn_track_running_stats)) - ]) - - for i in range(self.NUM_NODES): - node_ops = nn.ModuleList() - for j in range(0, i): - node_ops.append(LayerChoice(OPS(j), key="%d_%d" % (j, i), reduction="mean")) - self.layers.append(node_ops) - self.in_dim = C_in - self.out_dim = C_out - self.cell_id = cell_id - - def forward(self, input): # pylint: disable=W0622 - """ - Parameters - --- - input: torch.tensor - the output of the previous layer - """ - nodes = [input] - for i in range(1, self.NUM_NODES): - node_feature = sum(self.layers[i][k](nodes[k]) for k in range(i)) - nodes.append(node_feature) - return nodes[-1] diff --git a/nni/nas/pytorch/nasbench201/nasbench201_ops.py b/nni/nas/pytorch/nasbench201/nasbench201_ops.py deleted file mode 100644 index 633be3220..000000000 --- a/nni/nas/pytorch/nasbench201/nasbench201_ops.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class ReLUConvBN(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements - bn_affine: bool - If set to ``True``, ``torch.nn.BatchNorm2d`` will have learnable affine parameters. Default: True - bn_momentun: float - the value used for the running_mean and running_var computation. Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, ``torch.nn.BatchNorm2d`` tracks the running mean and variance. Default: True - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, - bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(ReLUConvBN, self).__init__() - self.op = nn.Sequential( - nn.ReLU(inplace=False), - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, - padding=padding, dilation=dilation, bias=False), - nn.BatchNorm2d(C_out, affine=bn_affine, momentum=bn_momentum, - track_running_stats=bn_track_running_stats) - ) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - return self.op(x) - - -class Pooling(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - bn_affine: bool - If set to ``True``, ``torch.nn.BatchNorm2d`` will have learnable affine parameters. Default: True - bn_momentun: float - the value used for the running_mean and running_var computation. 
Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, ``torch.nn.BatchNorm2d`` tracks the running mean and variance. Default: True - """ - def __init__(self, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(Pooling, self).__init__() - if C_in == C_out: - self.preprocess = None - else: - self.preprocess = ReLUConvBN(C_in, C_out, 1, 1, 0, 0, - bn_affine, bn_momentum, bn_track_running_stats) - self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - if self.preprocess: - x = self.preprocess(x) - return self.op(x) - - -class Zero(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - """ - def __init__(self, C_in, C_out, stride): - super(Zero, self).__init__() - self.C_in = C_in - self.C_out = C_out - self.stride = stride - self.is_zero = True - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - if self.C_in == self.C_out: - if self.stride == 1: - return x.mul(0.) - else: - return x[:, :, ::self.stride, ::self.stride].mul(0.) - else: - shape = list(x.shape) - shape[1] = self.C_out - zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device) - return zeros - - -class FactorizedReduce(nn.Module): - def __init__(self, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, - bn_track_running_stats=True): - super(FactorizedReduce, self).__init__() - self.stride = stride - self.C_in = C_in - self.C_out = C_out - self.relu = nn.ReLU(inplace=False) - if stride == 2: - C_outs = [C_out // 2, C_out - C_out // 2] - self.convs = nn.ModuleList() - for i in range(2): - self.convs.append(nn.Conv2d(C_in, C_outs[i], 1, stride=stride, padding=0, bias=False)) - self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0) - else: - raise ValueError("Invalid stride : {:}".format(stride)) - self.bn = nn.BatchNorm2d(C_out, affine=bn_affine, momentum=bn_momentum, - track_running_stats=bn_track_running_stats) - - def forward(self, x): - x = self.relu(x) - y = self.pad(x) - out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/nni/nas/pytorch/search_space_zoo/__init__.py b/nni/nas/pytorch/search_space_zoo/__init__.py deleted file mode 100644 index b31d4199a..000000000 --- a/nni/nas/pytorch/search_space_zoo/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .darts_cell import DartsCell -from .enas_cell import ENASMicroLayer -from .enas_cell import ENASMacroLayer -from .enas_cell import ENASMacroGeneralModel diff --git a/nni/nas/pytorch/search_space_zoo/darts_cell.py b/nni/nas/pytorch/search_space_zoo/darts_cell.py deleted file mode 100644 index 53fca5940..000000000 --- a/nni/nas/pytorch/search_space_zoo/darts_cell.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
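As a quick recap of the NAS-Bench-201 cell above, a hypothetical construction sketch (not part of the original file; the import path is the legacy one shown in this diff):

.. code-block:: python

    import torch
    from nni.nas.pytorch.nasbench201 import NASBench201Cell

    # one cell: 4 nodes, every edge j->i is a 5-way LayerChoice keyed "j_i", reduced by "mean"
    cell = NASBench201Cell("cell_0", C_in=16, C_out=16, stride=1)
    x = torch.randn(2, 16, 32, 32)
    # a mutator (or a fixed architecture) must pick one op per edge before forwarding:
    # out = cell(x)   # -> shape (2, 16, 32, 32)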
- -from collections import OrderedDict - -import torch -import torch.nn as nn -from nni.nas.pytorch import mutables - -from .darts_ops import PoolBN, SepConv, DilConv, FactorizedReduce, DropPath, StdConv - - -class Node(nn.Module): - def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect): - """ - builtin Darts Node structure - - Parameters - --- - node_id: str - num_prev_nodes: int - the number of previous nodes in this cell - channels: int - output channels - num_downsample_connect: int - downsample the input node if this cell is reduction cell - """ - super().__init__() - self.ops = nn.ModuleList() - choice_keys = [] - for i in range(num_prev_nodes): - stride = 2 if i < num_downsample_connect else 1 - choice_keys.append("{}_p{}".format(node_id, i)) - self.ops.append( - mutables.LayerChoice(OrderedDict([ - ("maxpool", PoolBN('max', channels, 3, stride, 1, affine=False)), - ("avgpool", PoolBN('avg', channels, 3, stride, 1, affine=False)), - ("skipconnect", - nn.Identity() if stride == 1 else FactorizedReduce(channels, channels, affine=False)), - ("sepconv3x3", SepConv(channels, channels, 3, stride, 1, affine=False)), - ("sepconv5x5", SepConv(channels, channels, 5, stride, 2, affine=False)), - ("dilconv3x3", DilConv(channels, channels, 3, stride, 2, 2, affine=False)), - ("dilconv5x5", DilConv(channels, channels, 5, stride, 4, 2, affine=False)) - ]), key=choice_keys[-1])) - self.drop_path = DropPath() - self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id)) - - def forward(self, prev_nodes): - assert len(self.ops) == len(prev_nodes) - out = [op(node) for op, node in zip(self.ops, prev_nodes)] - out = [self.drop_path(o) if o is not None else None for o in out] - return self.input_switch(out) - - -class DartsCell(nn.Module): - """ - Builtin Darts Cell structure. There are ``n_nodes`` nodes in one cell, in which the first two nodes' values are - fixed to the results of previous previous cell and previous cell respectively. One node will connect all - the nodes after with predefined operations in a mutable way. The last node accepts five inputs from nodes - before and it concats all inputs in channels as the output of the current cell, and the number of output - channels is ``n_nodes`` times ``channels``. - - Parameters - --- - n_nodes: int - the number of nodes contained in this cell - channels_pp: int - the number of previous previous cell's output channels - channels_p: int - the number of previous cell's output channels - channels: int - the number of output channels for each node - reduction_p: bool - Is previous cell a reduction cell - reduction: bool - is current cell a reduction cell - """ - def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction): - super().__init__() - self.reduction = reduction - self.n_nodes = n_nodes - - # If previous cell is reduction cell, current input size does not match with - # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. 
- if reduction_p: - self.preproc0 = FactorizedReduce(channels_pp, channels, affine=False) - else: - self.preproc0 = StdConv(channels_pp, channels, 1, 1, 0, affine=False) - self.preproc1 = StdConv(channels_p, channels, 1, 1, 0, affine=False) - - # generate dag - self.mutable_ops = nn.ModuleList() - for depth in range(2, self.n_nodes + 2): - self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), - depth, channels, 2 if reduction else 0)) - - def forward(self, pprev, prev): - """ - Parameters - --- - pprev: torch.Tensor - the output of the previous previous layer - prev: torch.Tensor - the output of the previous layer - """ - tensors = [self.preproc0(pprev), self.preproc1(prev)] - for node in self.mutable_ops: - cur_tensor = node(tensors) - tensors.append(cur_tensor) - - output = torch.cat(tensors[2:], dim=1) - return output diff --git a/nni/nas/pytorch/search_space_zoo/darts_ops.py b/nni/nas/pytorch/search_space_zoo/darts_ops.py deleted file mode 100644 index ce5410cfb..000000000 --- a/nni/nas/pytorch/search_space_zoo/darts_ops.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class DropPath(nn.Module): - def __init__(self, p=0.): - """ - Drop path with probability. - - Parameters - ---------- - p : float - Probability of an path to be zeroed. - """ - super().__init__() - self.p = p - - def forward(self, x): - if self.training and self.p > 0.: - keep_prob = 1. - self.p - # per data point mask - mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob) - return x / keep_prob * mask - - return x - - -class PoolBN(nn.Module): - """ - AvgPool or MaxPool with BN. ``pool_type`` must be ``max`` or ``avg``. 
- - Parameters - --- - pool_type: str - choose operation - C: int - number of channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - self.bn = nn.BatchNorm2d(C, affine=affine) - - def forward(self, x): - out = self.pool(x) - out = self.bn(out) - return out - - -class StdConv(nn.Sequential): - """ - Standard conv: ReLU - Conv - BN - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernel_size: int - size of the convolution kernel - padding: - zero-padding added to both sides of the input - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential - for idx, ops in enumerate((nn.ReLU(), nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine))): - self.add_module(str(idx), ops) - - -class FacConv(nn.Module): - """ - Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN - """ - - def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), - nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class DilConv(nn.Module): - """ - (Dilated) depthwise separable conv. - ReLU - (Dilated) depthwise separable - Pointwise - BN. - If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: - size of the convolving kernel - padding: - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements. - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, - bias=False), - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class SepConv(nn.Module): - """ - Depthwise separable conv. - DilConv(dilation=1) * 2. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: - size of the convolving kernel - padding: - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements. 
- affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), - DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class FactorizedReduce(nn.Module): - """ - Reduce feature map size by factorized pointwise (stride=2). - """ - - def __init__(self, C_in, C_out, affine=True): - super().__init__() - self.relu = nn.ReLU() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - x = self.relu(x) - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/nni/nas/pytorch/search_space_zoo/enas_cell.py b/nni/nas/pytorch/search_space_zoo/enas_cell.py deleted file mode 100644 index de57d55e2..000000000 --- a/nni/nas/pytorch/search_space_zoo/enas_cell.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch import mutables -from .enas_ops import FactorizedReduce, StdConv, SepConvBN, Pool, ConvBranch, PoolBranch - - -class Cell(nn.Module): - def __init__(self, cell_name, prev_labels, channels): - super().__init__() - self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True, - key=cell_name + "_input") - self.op_choice = mutables.LayerChoice([ - SepConvBN(channels, channels, 3, 1), - SepConvBN(channels, channels, 5, 2), - Pool("avg", 3, 1, 1), - Pool("max", 3, 1, 1), - nn.Identity() - ], key=cell_name + "_op") - - def forward(self, prev_layers): - chosen_input, chosen_mask = self.input_choice(prev_layers) - cell_out = self.op_choice(chosen_input) - return cell_out, chosen_mask - - -class Node(mutables.MutableScope): - def __init__(self, node_name, prev_node_names, channels): - super().__init__(node_name) - self.cell_x = Cell(node_name + "_x", prev_node_names, channels) - self.cell_y = Cell(node_name + "_y", prev_node_names, channels) - - def forward(self, prev_layers): - out_x, mask_x = self.cell_x(prev_layers) - out_y, mask_y = self.cell_y(prev_layers) - return out_x + out_y, mask_x | mask_y - - -class Calibration(nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - self.process = None - if in_channels != out_channels: - self.process = StdConv(in_channels, out_channels) - - def forward(self, x): - if self.process is None: - return x - return self.process(x) - - -class ENASMicroLayer(nn.Module): - """ - Builtin EnasMicroLayer. Micro search designs only one building block whose architecture is repeated - throughout the final architecture. A cell has ``num_nodes`` nodes and searches the topology and - operations among them in RL way. The first two nodes in a layer stand for the outputs from previous - previous layer and previous layer respectively. For the following nodes, the controller chooses - two previous nodes and applies two operations respectively for each node. Nodes that are not served - as input for any other node are viewed as the output of the layer. If there are multiple output nodes, - the model will calculate the average of these nodes as the layer output. 
Every node's output has ``out_channels`` - channels so the result of the layer has the same number of channels as each node. - - Parameters - --- - num_nodes: int - the number of nodes contained in this layer - in_channles_pp: int - the number of previous previous layer's output channels - in_channels_p: int - the number of previous layer's output channels - out_channels: int - output channels of this layer - reduction: bool - is reduction operation empolyed before this layer - """ - def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction): - super().__init__() - self.reduction = reduction - if self.reduction: - self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False) - self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False) - in_channels_pp = in_channels_p = out_channels - self.preproc0 = Calibration(in_channels_pp, out_channels) - self.preproc1 = Calibration(in_channels_p, out_channels) - - self.num_nodes = num_nodes - name_prefix = "reduce" if reduction else "normal" - self.nodes = nn.ModuleList() - node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY] - for i in range(num_nodes): - node_labels.append("{}_node_{}".format(name_prefix, i)) - self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels)) - self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1), - requires_grad=True) - self.bn = nn.BatchNorm2d(out_channels, affine=False) - self.reset_parameters() - - def reset_parameters(self): - nn.init.kaiming_normal_(self.final_conv_w) - - def forward(self, pprev, prev): - """ - Parameters - --- - pprev: torch.Tensor - the output of the previous previous layer - prev: torch.Tensor - the output of the previous layer - """ - if self.reduction: - pprev, prev = self.reduce0(pprev), self.reduce1(prev) - pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev) - - prev_nodes_out = [pprev_, prev_] - nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device) - for i in range(self.num_nodes): - node_out, mask = self.nodes[i](prev_nodes_out) - nodes_used_mask[:mask.size(0)] |= mask.to(node_out.device) - prev_nodes_out.append(node_out) - - unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1) - unused_nodes = F.relu(unused_nodes) - conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :] - conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1) - out = F.conv2d(unused_nodes, conv_weight) - return prev, self.bn(out) - - -class ENASMacroLayer(mutables.MutableScope): - """ - Builtin ENAS Marco Layer. With search space changing to layer level, the controller decides - what operation is employed and the previous layer to connect to for skip connections. The model - is made up of the same layers but the choice of each layer may be different. 
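For instance, a single layer could be declared as follows (hypothetical keys; the actual wiring is done by ``ENASMacroGeneralModel`` further below):

    .. code-block:: python

        # the third layer may draw skip connections from the first two layers
        layer = ENASMacroLayer("layer_2", ["layer_0", "layer_1"], in_filters=24, out_filters=24)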
- - Parameters - --- - key: str - the name of this layer - prev_labels: str - names of all previous layers - in_filters: int - the number of input channels - out_filters: - the number of output channels - """ - def __init__(self, key, prev_labels, in_filters, out_filters): - super().__init__(key) - self.in_filters = in_filters - self.out_filters = out_filters - self.mutable = mutables.LayerChoice([ - ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False), - ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True), - ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False), - ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True), - PoolBranch('avg', in_filters, out_filters, 3, 1, 1), - PoolBranch('max', in_filters, out_filters, 3, 1, 1) - ]) - if prev_labels: - self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None) - else: - self.skipconnect = None - self.batch_norm = nn.BatchNorm2d(out_filters, affine=False) - - def forward(self, prev_list): - """ - Parameters - --- - prev_list: list - The cell selects the last element of the list as input and applies an operation on it. - The cell chooses none/one/multiple tensor(s) as SkipConnect(s) from the list excluding - the last element. - """ - out = self.mutable(prev_list[-1]) - if self.skipconnect is not None: - connection = self.skipconnect(prev_list[:-1]) - if connection is not None: - out += connection - return self.batch_norm(out) - - -class ENASMacroGeneralModel(nn.Module): - """ - The network is made up by stacking ENASMacroLayer. The Macro search space contains these layers. - Each layer chooses an operation from predefined ones and SkipConnect then forms a network. - - Parameters - --- - num_layers: int - The number of layers contained in the network. - out_filters: int - The number of each layer's output channels. - in_channel: int - The number of input's channels. - num_classes: int - The number of classes for classification. - dropout_rate: float - Dropout layer's dropout rate before the final dense layer. 
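A minimal usage sketch (CIFAR-10-like shapes; the search itself is normally driven by an ENAS trainer, which attaches and resets a mutator first):

    .. code-block:: python

        model = ENASMacroGeneralModel(num_layers=12, out_filters=24, in_channels=3, num_classes=10)
        # with a mutator attached and reset:
        # logits = model(torch.randn(8, 3, 32, 32))   # -> shape (8, 10)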
- """ - def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10, - dropout_rate=0.0): - super().__init__() - self.num_layers = num_layers - self.num_classes = num_classes - self.out_filters = out_filters - - self.stem = nn.Sequential( - nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False), - nn.BatchNorm2d(out_filters) - ) - - pool_distance = self.num_layers // 3 - self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1] - self.dropout_rate = dropout_rate - self.dropout = nn.Dropout(self.dropout_rate) - - self.layers = nn.ModuleList() - self.pool_layers = nn.ModuleList() - labels = [] - for layer_id in range(self.num_layers): - labels.append("layer_{}".format(layer_id)) - if layer_id in self.pool_layers_idx: - self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters)) - self.layers.append(ENASMacroLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters)) - - self.gap = nn.AdaptiveAvgPool2d(1) - self.dense = nn.Linear(self.out_filters, self.num_classes) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - the input of the network - """ - bs = x.size(0) - cur = self.stem(x) - - layers = [cur] - - for layer_id in range(self.num_layers): - cur = self.layers[layer_id](layers) - layers.append(cur) - if layer_id in self.pool_layers_idx: - for i, layer in enumerate(layers): - layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer) - cur = layers[-1] - - cur = self.gap(cur).view(bs, -1) - cur = self.dropout(cur) - logits = self.dense(cur) - return logits diff --git a/nni/nas/pytorch/search_space_zoo/enas_ops.py b/nni/nas/pytorch/search_space_zoo/enas_ops.py deleted file mode 100644 index 21ecc2da7..000000000 --- a/nni/nas/pytorch/search_space_zoo/enas_ops.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class StdConv(nn.Module): - def __init__(self, C_in, C_out): - super(StdConv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=False), - nn.ReLU() - ) - - def forward(self, x): - return self.conv(x) - - -class PoolBranch(nn.Module): - """ - Pooling structure for Macro search. First pass through a 1x1 Conv, then pooling operation followed by BatchNorm2d. 
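For example (a standalone sketch, assuming ``import torch``):

    .. code-block:: python

        branch = PoolBranch("max", C_in=24, C_out=36, kernel_size=3, stride=1, padding=1)
        y = branch(torch.randn(4, 24, 16, 16))  # 1x1 conv to 36 channels, 3x3 max-pool, BN -> (4, 36, 16, 16)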
- - Parameters - --- - pool_type: str - only accept ``max`` for MaxPool and ``avg`` for AvgPool - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False): - super().__init__() - self.preproc = StdConv(C_in, C_out) - self.pool = Pool(pool_type, kernel_size, stride, padding) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - out = self.preproc(x) - out = self.pool(out) - out = self.bn(out) - return out - - -class SeparableConv(nn.Module): - def __init__(self, C_in, C_out, kernel_size, stride, padding): - super(SeparableConv, self).__init__() - self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride, - groups=C_in, bias=False) - self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False) - - def forward(self, x): - out = self.depthwise(x) - out = self.pointwise(out) - return out - - -class ConvBranch(nn.Module): - """ - Conv structure for Macro search. First pass through a 1x1 Conv, - then Conv operation with kernal_size equals 3 or 5 followed by BatchNorm and ReLU. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - separable: True - is separable Conv is used - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, separable): - super(ConvBranch, self).__init__() - self.preproc = StdConv(C_in, C_out) - if separable: - self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding) - else: - self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding) - self.postproc = nn.Sequential( - nn.BatchNorm2d(C_out, affine=False), - nn.ReLU() - ) - - def forward(self, x): - out = self.preproc(x) - out = self.conv(out) - out = self.postproc(out) - return out - - -class FactorizedReduce(nn.Module): - def __init__(self, C_in, C_out, affine=False): - super().__init__() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out - - -class Pool(nn.Module): - """ - Pooling structure - - Parameters - --- - pool_type: str - only accept ``max`` for MaxPool and ``avg`` for AvgPool - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, pool_type, kernel_size, stride, padding): - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - def forward(self, x): - return self.pool(x) - - -class SepConvBN(nn.Module): - """ - Implement SepConv followed by BatchNorm. The structure is ReLU ==> SepConv ==> BN. 
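For example (stride is fixed to 1 internally, so spatial size is preserved when ``padding == kernel_size // 2``; assumes ``import torch``):

    .. code-block:: python

        op = SepConvBN(C_in=20, C_out=20, kernel_size=3, padding=1)
        y = op(torch.randn(2, 20, 8, 8))  # -> (2, 20, 8, 8)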
- - Parameters - --- - C_in: int - the number of imput channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, C_in, C_out, kernel_size, padding): - super().__init__() - self.relu = nn.ReLU() - self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding) - self.bn = nn.BatchNorm2d(C_out, affine=True) - - def forward(self, x): - x = self.relu(x) - x = self.conv(x) - x = self.bn(x) - return x diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py deleted file mode 100644 index 6a3881177..000000000 --- a/nni/nas/pytorch/trainer.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import time -from abc import abstractmethod - -import torch - -from .base_trainer import BaseTrainer - -_logger = logging.getLogger(__name__) - - -class TorchTensorEncoder(json.JSONEncoder): - def default(self, o): # pylint: disable=method-hidden - if isinstance(o, torch.Tensor): - olist = o.tolist() - if "bool" not in o.type().lower() and all(map(lambda d: d == 0 or d == 1, olist)): - _logger.warning("Every element in %s is either 0 or 1. " - "You might consider convert it into bool.", olist) - return olist - return super().default(o) - - -class Trainer(BaseTrainer): - """ - A trainer with some helper functions implemented. To implement a new trainer, - users need to implement :meth:`train_one_epoch`, :meth:`validate_one_epoch` and :meth:`checkpoint`. - - Parameters - ---------- - model : nn.Module - Model with mutables. - mutator : BaseMutator - A mutator object that has been initialized with the model. - loss : callable - Called with logits and targets. Returns a loss tensor. - See `PyTorch loss functions`_ for examples. - metrics : callable - Called with logits and targets. Returns a dict that maps metrics keys to metrics data. For example, - - .. code-block:: python - - def metrics_fn(output, target): - return {"acc1": accuracy(output, target, topk=1), "acc5": accuracy(output, target, topk=5)} - - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - dataset_train : torch.utils.data.Dataset - Dataset of training. If not otherwise specified, ``dataset_train`` and ``dataset_valid`` should be standard - PyTorch Dataset. See `torch.utils.data`_ for examples. - dataset_valid : torch.utils.data.Dataset - Dataset of validation/testing. - batch_size : int - Batch size. - workers : int - Number of workers used in data preprocessing. - device : torch.device - Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will - automatic detects GPU and selects GPU first. - log_frequency : int - Number of mini-batches to log metrics. - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. - - - .. _`PyTorch loss functions`: https://pytorch.org/docs/stable/nn.html#loss-functions - .. 
_`torch.utils.data`: https://pytorch.org/docs/stable/data.html - """ - def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs, - dataset_train, dataset_valid, batch_size, workers, device, log_frequency, callbacks): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device - self.model = model - self.mutator = mutator - self.loss = loss - - self.metrics = metrics - self.optimizer = optimizer - - self.model.to(self.device) - self.mutator.to(self.device) - self.loss.to(self.device) - - self.num_epochs = num_epochs - self.dataset_train = dataset_train - self.dataset_valid = dataset_valid - self.batch_size = batch_size - self.workers = workers - self.log_frequency = log_frequency - self.log_dir = os.path.join("logs", str(time.time())) - os.makedirs(self.log_dir, exist_ok=True) - self.status_writer = open(os.path.join(self.log_dir, "log"), "w") - self.callbacks = callbacks if callbacks is not None else [] - for callback in self.callbacks: - callback.build(self.model, self.mutator, self) - - @abstractmethod - def train_one_epoch(self, epoch): - """ - Train one epoch. - - Parameters - ---------- - epoch : int - Epoch number starting from 0. - """ - pass - - @abstractmethod - def validate_one_epoch(self, epoch): - """ - Validate one epoch. - - Parameters - ---------- - epoch : int - Epoch number starting from 0. - """ - pass - - def train(self, validate=True): - """ - Train ``num_epochs``. - Trigger callbacks at the start and the end of each epoch. - - Parameters - ---------- - validate : bool - If ``true``, will do validation every epoch. - """ - for epoch in range(self.num_epochs): - for callback in self.callbacks: - callback.on_epoch_begin(epoch) - - # training - _logger.info("Epoch %d Training", epoch + 1) - self.train_one_epoch(epoch) - - if validate: - # validation - _logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) - - for callback in self.callbacks: - callback.on_epoch_end(epoch) - - def validate(self): - """ - Do one validation. - """ - self.validate_one_epoch(-1) - - def export(self, file): - """ - Call ``mutator.export()`` and dump the architecture to ``file``. - - Parameters - ---------- - file : str - A file path. Expected to be a JSON. - """ - mutator_export = self.mutator.export() - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def checkpoint(self): - """ - Return trainer checkpoint. - """ - raise NotImplementedError("Not implemented yet") - - def enable_visualization(self): - """ - Enable visualization. Write graph and training log to folder ``logs/``. - """ - sample = None - for x, _ in self.train_loader: - sample = x.to(self.device)[:2] - break - if sample is None: - _logger.warning("Sample is %s.", sample) - _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir) - with open(os.path.join(self.log_dir, "graph.json"), "w") as f: - json.dump(self.mutator.graph(sample), f) - self.visualization_enabled = True - - def _write_graph_status(self): - if hasattr(self, "visualization_enabled") and self.visualization_enabled: - print(json.dumps(self.mutator.status()), file=self.status_writer, flush=True) diff --git a/nni/nas/pytorch/utils.py b/nni/nas/pytorch/utils.py deleted file mode 100644 index a3f5aabfb..000000000 --- a/nni/nas/pytorch/utils.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
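To make the trainer contract above concrete, here is a minimal, hypothetical subclass sketch of ``Trainer`` (data loading kept deliberately simple; metrics and logging elided):

.. code-block:: python

    from torch.utils.data import DataLoader

    class MyTrainer(Trainer):
        def train_one_epoch(self, epoch):
            loader = DataLoader(self.dataset_train, batch_size=self.batch_size,
                                num_workers=self.workers, shuffle=True)
            self.model.train()
            for x, y in loader:
                x, y = x.to(self.device), y.to(self.device)
                self.mutator.reset()               # resample an architecture for this step
                loss = self.loss(self.model(x), y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        def validate_one_epoch(self, epoch):
            pass  # validation elided in this sketch

        def checkpoint(self):
            return {"model": self.model.state_dict()}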
- -import logging -from collections import OrderedDict - -import numpy as np -import torch - -_counter = 0 - -_logger = logging.getLogger(__name__) - - -def global_mutable_counting(): - """ - A program level counter starting from 1. - """ - global _counter - _counter += 1 - return _counter - - -def _reset_global_mutable_counting(): - """ - Reset the global mutable counting to count from 1. Useful when defining multiple models with default keys. - """ - global _counter - _counter = 0 - - -def to_device(obj, device): - """ - Move a tensor, tuple, list, or dict onto device. - """ - if torch.is_tensor(obj): - return obj.to(device) - if isinstance(obj, tuple): - return tuple(to_device(t, device) for t in obj) - if isinstance(obj, list): - return [to_device(t, device) for t in obj] - if isinstance(obj, dict): - return {k: to_device(v, device) for k, v in obj.items()} - if isinstance(obj, (int, float, str)): - return obj - raise ValueError("'%s' has unsupported type '%s'" % (obj, type(obj))) - - -def to_list(arr): - if torch.is_tensor(arr): - return arr.cpu().numpy().tolist() - if isinstance(arr, np.ndarray): - return arr.tolist() - if isinstance(arr, (list, tuple)): - return list(arr) - return arr - - -class AverageMeterGroup: - """ - Average meter group for multiple average meters. - """ - - def __init__(self): - self.meters = OrderedDict() - - def update(self, data): - """ - Update the meter group with a dict of metrics. - Non-exist average meters will be automatically created. - """ - for k, v in data.items(): - if k not in self.meters: - self.meters[k] = AverageMeter(k, ":4f") - self.meters[k].update(v) - - def __getattr__(self, item): - return self.meters[item] - - def __getitem__(self, item): - return self.meters[item] - - def __str__(self): - return " ".join(str(v) for v in self.meters.values()) - - def summary(self): - """ - Return a summary string of group data. - """ - return " ".join(v.summary() for v in self.meters.values()) - - -class AverageMeter: - """ - Computes and stores the average and current value. - - Parameters - ---------- - name : str - Name to display. - fmt : str - Format string to print the values. - """ - - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - """ - Reset the meter. - """ - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - """ - Update with value and weight. - - Parameters - ---------- - val : float or int - The new value to be accounted in. - n : int - The weight of the new value. - """ - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def summary(self): - fmtstr = '{name}: {avg' + self.fmt + '}' - return fmtstr.format(**self.__dict__) - - -class StructuredMutableTreeNode: - """ - A structured representation of a search space. - A search space comes with a root (with `None` stored in its `mutable`), and a bunch of children in its `children`. - This tree can be seen as a "flattened" version of the module tree. Since nested mutable entity is not supported yet, - the following must be true: each subtree corresponds to a ``MutableScope`` and each leaf corresponds to a - ``Mutable`` (other than ``MutableScope``). - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.Mutable - The mutable that current node is linked with. 
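A small sketch of how the tree is built and walked (``lc1``/``lc2`` stand for hypothetical mutables, e.g. layer choices):

    .. code-block:: python

        root = StructuredMutableTreeNode(None)    # the root carries no mutable
        node = root.add_child(lc1)                # add_child returns the newly created node
        node.add_child(lc2)                       # nested under lc1 (e.g. lc1 is a MutableScope)
        keys = [m.key for m in root.traverse()]   # pre-order, deduplicated -> [lc1.key, lc2.key]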
- """ - - def __init__(self, mutable): - self.mutable = mutable - self.children = [] - - def add_child(self, mutable): - """ - Add a tree node to the children list of current node. - """ - self.children.append(StructuredMutableTreeNode(mutable)) - return self.children[-1] - - def type(self): - """ - Return the ``type`` of mutable content. - """ - return type(self.mutable) - - def __iter__(self): - return self.traverse() - - def traverse(self, order="pre", deduplicate=True, memo=None): - """ - Return a generator that generates a list of mutables in this tree. - - Parameters - ---------- - order : str - pre or post. If pre, current mutable is yield before children. Otherwise after. - deduplicate : bool - If true, mutables with the same key will not appear after the first appearance. - memo : dict - An auxiliary dict that memorize keys seen before, so that deduplication is possible. - - Returns - ------- - generator of Mutable - """ - if memo is None: - memo = set() - assert order in ["pre", "post"] - if order == "pre": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - for child in self.children: - for m in child.traverse(order=order, deduplicate=deduplicate, memo=memo): - yield m - if order == "post": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable diff --git a/nni/retiarii/strategy/base.py b/nni/nas/strategy/base.py similarity index 100% rename from nni/retiarii/strategy/base.py rename to nni/nas/strategy/base.py diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/nas/strategy/bruteforce.py similarity index 100% rename from nni/retiarii/strategy/bruteforce.py rename to nni/nas/strategy/bruteforce.py diff --git a/nni/retiarii/strategy/local_debug_strategy.py b/nni/nas/strategy/debug.py similarity index 100% rename from nni/retiarii/strategy/local_debug_strategy.py rename to nni/nas/strategy/debug.py diff --git a/nni/retiarii/strategy/evolution.py b/nni/nas/strategy/evolution.py similarity index 100% rename from nni/retiarii/strategy/evolution.py rename to nni/nas/strategy/evolution.py diff --git a/nni/retiarii/strategy/tpe_strategy.py b/nni/nas/strategy/hpo.py similarity index 100% rename from nni/retiarii/strategy/tpe_strategy.py rename to nni/nas/strategy/hpo.py diff --git a/nni/retiarii/strategy/oneshot.py b/nni/nas/strategy/oneshot.py similarity index 100% rename from nni/retiarii/strategy/oneshot.py rename to nni/nas/strategy/oneshot.py diff --git a/nni/retiarii/strategy/rl.py b/nni/nas/strategy/rl.py similarity index 100% rename from nni/retiarii/strategy/rl.py rename to nni/nas/strategy/rl.py diff --git a/nni/retiarii/strategy/utils.py b/nni/nas/strategy/utils.py similarity index 100% rename from nni/retiarii/strategy/utils.py rename to nni/nas/strategy/utils.py diff --git a/nni/nas/tensorflow/__init__.py b/nni/nas/tensorflow/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/nas/tensorflow/base_mutator.py b/nni/nas/tensorflow/base_mutator.py deleted file mode 100644 index 860680f19..000000000 --- a/nni/nas/tensorflow/base_mutator.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from tensorflow.keras import Model - -from .mutables import Mutable, MutableScope, InputChoice -from .utils import StructuredMutableTreeNode - - -class BaseMutator(Model): - def __init__(self, model): - super().__init__() - self.__dict__['model'] = model - self._structured_mutables = self._parse_search_space(self.model) - - def _parse_search_space(self, module, root=None, prefix='', memo=None, nested_detection=None): - if memo is None: - memo = set() - if root is None: - root = StructuredMutableTreeNode(None) - if module not in memo: - memo.add(module) - if isinstance(module, Mutable): - if nested_detection is not None: - raise RuntimeError('Cannot have nested search space. Error at {} in {}' - .format(module, nested_detection)) - module.name = prefix - module.set_mutator(self) - root = root.add_child(module) - if not isinstance(module, MutableScope): - nested_detection = module - if isinstance(module, InputChoice): - for k in module.choose_from: - if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: - raise RuntimeError('"{}" required by "{}" not found in keys that appeared before, and is not NO_KEY.' - .format(k, module.key)) - for submodule in module.layers: - if not isinstance(submodule, Model): - continue - submodule_prefix = prefix + ('.' if prefix else '') + submodule.name - self._parse_search_space(submodule, root, submodule_prefix, memo=memo, nested_detection=nested_detection) - return root - - @property - def mutables(self): - return self._structured_mutables - - def undedup_mutables(self): - return self._structured_mutables.traverse(deduplicate=False) - - def call(self, *inputs): - raise RuntimeError('Call is undefined for mutators.') - - def __setattr__(self, name, value): - if name == 'model': - raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " - "include your network, as it will include all parameters in model into the mutator.") - return super().__setattr__(name, value) - - def enter_mutable_scope(self, mutable_scope): - pass - - def exit_mutable_scope(self, mutable_scope): - pass - - def on_forward_layer_choice(self, mutable, *inputs): - raise NotImplementedError - - def on_forward_input_choice(self, mutable, tensor_list): - raise NotImplementedError - - def export(self): - raise NotImplementedError diff --git a/nni/nas/tensorflow/mutables.py b/nni/nas/tensorflow/mutables.py deleted file mode 100644 index 06183a34c..000000000 --- a/nni/nas/tensorflow/mutables.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from collections import OrderedDict - -from tensorflow.keras import Model - -from .utils import global_mutable_counting - - -_logger = logging.getLogger(__name__) - - -class Mutable(Model): - def __init__(self, key=None): - super().__init__() - if key is None: - self._key = '{}_{}'.format(type(self).__name__, global_mutable_counting()) - elif isinstance(key, str): - self._key = key - else: - self._key = str(key) - _logger.warning('Key "%s" is not string, converted to string.', key) - self.init_hook = None - self.forward_hook = None - - def __deepcopy__(self, memodict=None): - raise NotImplementedError("Deep copy doesn't work for mutables.") - - def set_mutator(self, mutator): - if hasattr(self, 'mutator'): - raise RuntimeError('`set_mutator is called more than once. ' - 'Did you parse the search space multiple times? 
' - 'Or did you apply multiple fixed architectures?') - self.mutator = mutator - - def call(self, *inputs): - raise NotImplementedError('Method `call` of Mutable must be overridden') - - def build(self, input_shape): - self._check_built() - - @property - def key(self): - return self._key - - @property - def name(self): - return self._name if hasattr(self, '_name') else self._key - - @name.setter - def name(self, name): - self._name = name - - def _check_built(self): - if not hasattr(self, 'mutator'): - raise ValueError( - "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " - "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " - "so that trainer can locate all your mutables. See NNI docs for more details.".format(self)) - - def __repr__(self): - return '{} ({})'.format(self.name, self.key) - - -class MutableScope(Mutable): - def __call__(self, *args, **kwargs): - try: - self.mutator.enter_mutable_scope(self) - return super().__call__(*args, **kwargs) - finally: - self.mutator.exit_mutable_scope(self) - - -class LayerChoice(Mutable): - def __init__(self, op_candidates, reduction='sum', return_mask=False, key=None): - super().__init__(key=key) - self.names = [] - if isinstance(op_candidates, OrderedDict): - for name in op_candidates: - assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ - "Please don't use a reserved name '{}' for your module.".format(name) - self.names.append(name) - elif isinstance(op_candidates, list): - for i, _ in enumerate(op_candidates): - self.names.append(str(i)) - else: - raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) - - self.length = len(op_candidates) - self.choices = op_candidates - self.reduction = reduction - self.return_mask = return_mask - - def call(self, *inputs): - out, mask = self.mutator.on_forward_layer_choice(self, *inputs) - if self.return_mask: - return out, mask - return out - - def build(self, input_shape): - self._check_built() - for op in self.choices: - op.build(input_shape) - - def __len__(self): - return len(self.choices) - - -class InputChoice(Mutable): - NO_KEY = '' - - def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, reduction='sum', return_mask=False, key=None): - super().__init__(key=key) - assert n_candidates is not None or choose_from is not None, \ - 'At least one of `n_candidates` and `choose_from` must be not None.' - if choose_from is not None and n_candidates is None: - n_candidates = len(choose_from) - elif choose_from is None and n_candidates is not None: - choose_from = [self.NO_KEY] * n_candidates - assert n_candidates == len(choose_from), 'Number of candidates must be equal to the length of `choose_from`.' - assert n_candidates > 0, 'Number of candidates must be greater than 0.' - assert n_chosen is None or 0 <= n_chosen <= n_candidates, \ - 'Expected selected number must be None or no more than number of candidates.' 
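Because the TensorFlow LayerChoice and InputChoice above are being removed wholesale, a short usage sketch may help when reading old models written against them. It assumes a pre-removal NNI installation where nni.nas.tensorflow is still importable; the layer sizes and keys are illustrative, and a concrete mutator still has to parse the model and drive the decisions before the cell can actually run.

from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, SeparableConv2D

from nni.nas.tensorflow.mutables import InputChoice, LayerChoice


class SearchCell(Model):
    def __init__(self):
        super().__init__()
        # Candidate ops for one searchable position; an OrderedDict of
        # name -> op is also accepted if the candidates should carry names.
        # Candidates are expected to produce same-shaped outputs so that
        # reductions such as 'sum' stay well defined.
        self.op = LayerChoice([
            Conv2D(16, 3, padding='same'),
            SeparableConv2D(16, 3, padding='same'),
            MaxPool2D(3, strides=1, padding='same'),
        ], key='cell_op')
        # Pick exactly one of two candidate inputs and also return the mask.
        self.skip = InputChoice(n_candidates=2, n_chosen=1,
                                return_mask=True, key='cell_skip')

    def call(self, x, prev):
        out = self.op(x)                        # dispatched to mutator.on_forward_layer_choice
        chosen, mask = self.skip([out, prev])   # dispatched to mutator.on_forward_input_choice
        return out if chosen is None else out + chosen

The key passed to each mutable is what a mutator's decision cache and export dictionary refer to; when omitted, Mutable.__init__ falls back to an auto-generated '<ClassName>_<counter>' key via global_mutable_counting().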
- - self.n_candidates = n_candidates - self.choose_from = choose_from.copy() - self.n_chosen = n_chosen - self.reduction = reduction - self.return_mask = return_mask - - def call(self, optional_inputs): - optional_input_list = optional_inputs - if isinstance(optional_inputs, dict): - optional_input_list = [optional_inputs[tag] for tag in self.choose_from] - assert isinstance(optional_input_list, list), \ - 'Optional input list must be a list, not a {}.'.format(type(optional_input_list)) - assert len(optional_inputs) == self.n_candidates, \ - 'Length of the input list must be equal to number of candidates.' - out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) - if self.return_mask: - return out, mask - return out diff --git a/nni/nas/tensorflow/mutator.py b/nni/nas/tensorflow/mutator.py deleted file mode 100644 index b0d2aed68..000000000 --- a/nni/nas/tensorflow/mutator.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import tensorflow as tf - -from .base_mutator import BaseMutator - - -_logger = logging.getLogger(__name__) - - -class Mutator(BaseMutator): - def __init__(self, model): - super().__init__(model) - self._cache = {} - - def sample_search(self): - raise NotImplementedError('Method `sample_search` must be overridden') - - def sample_final(self): - raise NotImplementedError('Method `sample_final` must be overriden for exporting') - - def reset(self): - self._cache = self.sample_search() - - def export(self): - return self.sample_final() - - # TODO: status - # TODO: graph - - def on_forward_layer_choice(self, mutable, *inputs): - mask = self._get_decision(mutable) - assert len(mask) == len(mutable), \ - 'Invalid mask, expected {} to be of length {}.'.format(mask, len(mutable)) - out = self._select_with_mask(lambda choice: choice(*inputs), mutable.choices, mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def on_forward_input_choice(self, mutable, tensor_list): - mask = self._get_decision(mutable) - assert len(mask) == mutable.n_candidates, \ - 'Invalid mask, expected {} to be of length {}.'.format(mask, mutable.n_candidates) - out = self._select_with_mask(lambda tensor: tensor, tensor_list, mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def _select_with_mask(self, map_fn, candidates, mask): - if mask.dtype.is_bool: - out = [map_fn(cand) for cand, m in zip(candidates, mask) if m] - elif mask.dtype.is_floating: - out = [map_fn(cand) * m for cand, m in zip(candidates, mask) if m] - else: - raise ValueError('Unrecognized mask, dtype is {}'.format(mask.dtype.name)) - return out - - def _tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == 'none': - return tensor_list - if not tensor_list: - return None - if len(tensor_list) == 1: - return tensor_list[0] - if reduction_type == 'sum': - return sum(tensor_list) - if reduction_type == 'mean': - return sum(tensor_list) / len(tensor_list) - if reduction_type == 'concat': - image_data_format = tf.keras.backend.image_data_format() - if image_data_format == "channels_first": - axis = 0 - else: - axis = -1 - return tf.concat(tensor_list, axis=axis) # pylint: disable=E1120,E1123 - # pylint issue #3613 - raise ValueError('Unrecognized reduction policy: "{}'.format(reduction_type)) - - def _get_decision(self, mutable): - if mutable.key not in self._cache: - raise ValueError('"{}" not found in decision cache.'.format(mutable.key)) - result = self._cache[mutable.key] - 
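The _select_with_mask / _tensor_reduction pair above defines the contract between a decision mask and the candidate outputs: a boolean mask keeps the selected candidates verbatim (hard selection), a floating-point mask scales every kept candidate by its weight (soft selection), and the reduction collapses whatever survived. A plain-TensorFlow sketch of that contract with made-up values, no NNI imports required:

import tensorflow as tf

# Three stand-in candidate outputs of identical shape.
candidates = [tf.fill((1, 4), 1.0), tf.fill((1, 4), 2.0), tf.fill((1, 4), 3.0)]

# Boolean mask: hard selection, the kept candidate passes through unchanged.
bool_mask = tf.constant([False, True, False])
selected = [c for c, m in zip(candidates, bool_mask) if m]
print(sum(selected))                         # reduction='sum' over the single survivor

# Floating mask: soft selection, every kept candidate is scaled by its weight
# (zero-weighted candidates are skipped entirely, as in the deleted code).
float_mask = tf.constant([0.2, 0.5, 0.3])
weighted = [c * m for c, m in zip(candidates, float_mask) if m]
print(sum(weighted))                         # reduction='sum'
print(sum(weighted) / len(weighted))         # reduction='mean'
print(tf.concat(weighted, axis=-1))          # reduction='concat' with channels_last
# reduction='none' would simply hand the list back to the caller.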
_logger.debug('Decision %s: %s', mutable.key, result) - return result diff --git a/nni/nas/tensorflow/utils.py b/nni/nas/tensorflow/utils.py deleted file mode 100644 index 0cfc6e815..000000000 --- a/nni/nas/tensorflow/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import tensorflow as tf - -_counter = 0 - -def global_mutable_counting(): - global _counter - _counter += 1 - return _counter - - -class AverageMeter: - def __init__(self, name): - self.name = name - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val): - self.val = val - self.sum += val - self.count += 1 - self.avg = self.sum / self.count - - def __str__(self): - return '{name} {val:4f} ({avg:4f})'.format(**self.__dict__) - - def summary(self): - return '{name}: {avg:4f}'.format(**self.__dict__) - - -class AverageMeterGroup: - def __init__(self): - self.meters = {} - - def update(self, data): - for k, v in data.items(): - if k not in self.meters: - self.meters[k] = AverageMeter(k) - self.meters[k].update(v) - - def __str__(self): - return ' '.join(str(v) for v in self.meters.values()) - - def summary(self): - return ' '.join(v.summary() for v in self.meters.values()) - - -class StructuredMutableTreeNode: - def __init__(self, mutable): - self.mutable = mutable - self.children = [] - - def add_child(self, mutable): - self.children.append(StructuredMutableTreeNode(mutable)) - return self.children[-1] - - def type(self): - return type(self.mutable) - - def __iter__(self): - return self.traverse() - - def traverse(self, order="pre", deduplicate=True, memo=None): - if memo is None: - memo = set() - assert order in ["pre", "post"] - if order == "pre": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - for child in self.children: - for m in child.traverse(order=order, deduplicate=deduplicate, memo=memo): - yield m - if order == "post": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - - -def fill_zero_grads(grads, weights): - ret = [] - for grad, weight in zip(grads, weights): - if grad is not None: - ret.append(grad) - else: - ret.append(tf.zeros_like(weight)) - return ret diff --git a/nni/retiarii/utils.py b/nni/nas/utils/misc.py similarity index 100% rename from nni/retiarii/utils.py rename to nni/nas/utils/misc.py diff --git a/nni/retiarii/serializer.py b/nni/nas/utils/serializer.py similarity index 100% rename from nni/retiarii/serializer.py rename to nni/nas/utils/serializer.py diff --git a/nni/retiarii/oneshot/pytorch/enas.py b/nni/retiarii/oneshot/pytorch/enas.py index 15382589f..75362161c 100644 --- a/nni/retiarii/oneshot/pytorch/enas.py +++ b/nni/retiarii/oneshot/pytorch/enas.py @@ -18,148 +18,6 @@ from .utils import AverageMeterGroup, replace_layer_choice, replace_input_choice _logger = logging.getLogger(__name__) -class StackedLSTMCell(nn.Module): - def __init__(self, layers, size, bias): - super().__init__() - self.lstm_num_layers = layers - self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) - for _ in range(self.lstm_num_layers)]) - - def forward(self, inputs, hidden): - prev_h, prev_c = hidden - next_h, next_c = [], [] - for i, m in enumerate(self.lstm_modules): - curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) - next_c.append(curr_c) - next_h.append(curr_h) - # current implementation only supports batch size equals 1, - # but 
the algorithm does not necessarily have this limitation - inputs = curr_h[-1].view(1, -1) - return next_h, next_c - - -class ReinforceField: - """ - A field with ``name``, with ``total`` choices. ``choose_one`` is true if one and only one is meant to be - selected. Otherwise, any number of choices can be chosen. - """ - - def __init__(self, name, total, choose_one): - self.name = name - self.total = total - self.choose_one = choose_one - - def __repr__(self): - return f'ReinforceField(name={self.name}, total={self.total}, choose_one={self.choose_one})' - - -class ReinforceController(nn.Module): - """ - A controller that mutates the graph with RL. - - Parameters - ---------- - fields : list of ReinforceField - List of fields to choose. - lstm_size : int - Controller LSTM hidden units. - lstm_num_layers : int - Number of layers for stacked LSTM. - tanh_constant : float - Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. - skip_target : float - Target probability that skipconnect (chosen by InputChoice) will appear. - If the chosen number of inputs is away from the ``skip_connect``, there will be - a sample skip penalty which is a KL divergence added. - temperature : float - Temperature constant that divides the logits. - entropy_reduction : str - Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. - """ - - def __init__(self, fields, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, - skip_target=0.4, temperature=None, entropy_reduction='sum'): - super(ReinforceController, self).__init__() - self.fields = fields - self.lstm_size = lstm_size - self.lstm_num_layers = lstm_num_layers - self.tanh_constant = tanh_constant - self.temperature = temperature - self.skip_target = skip_target - - self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) - self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) - self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) - self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), # pylint: disable=not-callable - requires_grad=False) - assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' 
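Since ReinforceField and ReinforceController are being dropped from this module, a usage sketch may be useful for readers tracking where the RL controller went. It targets the pre-change nni.retiarii.oneshot.pytorch.enas module; the field names, reward value, and learning rate below are purely illustrative.

import torch
from nni.retiarii.oneshot.pytorch.enas import ReinforceController, ReinforceField

fields = [
    ReinforceField('op_1', total=4, choose_one=True),     # a LayerChoice with 4 candidates
    ReinforceField('skip_1', total=2, choose_one=False),  # an InputChoice over 2 inputs
]
controller = ReinforceController(fields, lstm_size=64, lstm_num_layers=1)
ctrl_optim = torch.optim.Adam(controller.parameters(), lr=3.5e-4)

sample = controller.resample()        # e.g. {'op_1': 2, 'skip_1': [0, 1]}
reward = torch.tensor(0.73)           # stand-in for the sampled architecture's validation accuracy

# Plain REINFORCE step: maximize the reward-weighted log-probability of the sample.
loss = -(reward * controller.sample_log_prob)
ctrl_optim.zero_grad()
loss.backward()
ctrl_optim.step()

A fuller update would also fold in the entropy bonus (sample_entropy) and the skip penalty (sample_skip_penalty) that the controller accumulates while sampling.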
- self.entropy_reduction = torch.sum if entropy_reduction == 'sum' else torch.mean - self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') - self.soft = nn.ModuleDict({ - field.name: nn.Linear(self.lstm_size, field.total, bias=False) for field in fields - }) - self.embedding = nn.ModuleDict({ - field.name: nn.Embedding(field.total, self.lstm_size) for field in fields - }) - - def resample(self): - self._initialize() - result = dict() - for field in self.fields: - result[field.name] = self._sample_single(field) - return result - - def _initialize(self): - self._inputs = self.g_emb.data - self._c = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self._h = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self.sample_log_prob: torch.Tensor = cast(torch.Tensor, 0) - self.sample_entropy: torch.Tensor = cast(torch.Tensor, 0) - self.sample_skip_penalty: torch.Tensor = cast(torch.Tensor, 0) - - def _lstm_next_step(self): - self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) - - def _sample_single(self, field): - self._lstm_next_step() - logit = self.soft[field.name](self._h[-1]) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * torch.tanh(logit) - if field.choose_one: - sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - log_prob = self.cross_entropy_loss(logit, sampled) - self._inputs = self.embedding[field.name](sampled) - else: - logit = logit.view(-1, 1) - logit = torch.cat([-logit, logit], 1) # pylint: disable=invalid-unary-operand-type - sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip_prob = torch.sigmoid(logit) - kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(logit, sampled) - sampled = sampled.nonzero().view(-1) - if sampled.sum().item(): - self._inputs = (torch.sum(self.embedding[field.name](sampled.view(-1)), 0) / (1. + torch.sum(sampled))).unsqueeze(0) - else: - self._inputs = torch.zeros(1, self.lstm_size, device=self.embedding[field.name].weight.device) # type: ignore - - sampled = sampled.detach().cpu().numpy().tolist() - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - if len(sampled) == 1: - sampled = sampled[0] - return sampled - - class EnasTrainer(BaseOneShotTrainer): """ ENAS trainer.