diff --git a/nni/algorithms/nas/__init__.py b/nni/algorithms/nas/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/pytorch/__init__.py b/nni/algorithms/nas/pytorch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/pytorch/cdarts/__init__.py b/nni/algorithms/nas/pytorch/cdarts/__init__.py deleted file mode 100644 index ab34902e0..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator -from .trainer import CdartsTrainer diff --git a/nni/algorithms/nas/pytorch/cdarts/mutator.py b/nni/algorithms/nas/pytorch/cdarts/mutator.py deleted file mode 100644 index a0bf79040..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/mutator.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch - -from apex.parallel import DistributedDataParallel # pylint: disable=import-error -from nni.algorithms.nas.pytorch.darts import DartsMutator # pylint: disable=wrong-import-order -from nni.nas.pytorch.mutables import LayerChoice # pylint: disable=wrong-import-order -from nni.nas.pytorch.mutator import Mutator # pylint: disable=wrong-import-order - - -class RegularizedDartsMutator(DartsMutator): - """ - This is :class:`~nni.algorithms.nas.pytorch.darts.DartsMutator` basically, with two differences. - - 1. Choices can be cut (bypassed). This is done by ``cut_choices``. Cutted choices will not be used in - forward pass and thus consumes no memory. - - 2. Regularization on choices, to prevent the mutator from overfitting on some choices. - """ - - def reset(self): - """ - Warnings - -------- - Renamed :func:`~reset_with_loss` to return regularization loss on reset. - """ - raise ValueError("You should probably call `reset_with_loss`.") - - def cut_choices(self, cut_num=2): - """ - Cut the choices with the smallest weights. - ``cut_num`` should be the accumulative number of cutting, e.g., if first time cutting - is 2, the second time should be 4 to cut another two. - - Parameters - ---------- - cut_num : int - Number of choices to cut, so far. - - Warnings - -------- - Though the parameters are set to :math:`-\infty` to be bypassed, they will still receive gradient of 0, - which introduced ``nan`` problem when calling ``optimizer.step()``. To solve this issue, a simple way is to - reset nan to :math:`-\infty` each time after the parameters are updated. - """ - # `cut_choices` is implemented but not used in current implementation of CdartsTrainer - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - _, idx = torch.topk(-self.choices[mutable.key], cut_num) - with torch.no_grad(): - for i in idx: - self.choices[mutable.key][i] = -float("inf") - - def reset_with_loss(self): - """ - Resample and return loss. If loss is 0, to avoid device issue, it will return ``None``. - - Currently loss penalty are proportional to the L1-norm of parameters corresponding - to modules if their type name contains certain substrings. These substrings include: ``poolwithoutbn``, - ``identity``, ``dilconv``. 
- """ - self._cache, reg_loss = self.sample_search() - return reg_loss - - def sample_search(self): - result = super().sample_search() - loss = [] - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - def need_reg(choice): - return any(t in str(type(choice)).lower() for t in ["poolwithoutbn", "identity", "dilconv"]) - - for i, choice in enumerate(mutable.choices): - if need_reg(choice): - norm = torch.abs(self.choices[mutable.key][i]) - if norm < 1E10: - loss.append(norm) - if not loss: - return result, None - return result, sum(loss) - - def export(self, logger=None): - """ - Export an architecture with logger. Genotype will be printed with logger. - - Returns - ------- - dict - A mapping from mutable keys to decisions. - """ - result = self.sample_final() - if hasattr(self.model, "plot_genotype") and logger is not None: - genotypes = self.model.plot_genotype(result, logger) - return result, genotypes - - -class RegularizedMutatorParallel(DistributedDataParallel): - """ - Parallelize :class:`~RegularizedDartsMutator`. - - This makes :func:`~RegularizedDartsMutator.reset_with_loss` method parallelized, - also allowing :func:`~RegularizedDartsMutator.cut_choices` and :func:`~RegularizedDartsMutator.export` - to be easily accessible. - """ - def reset_with_loss(self): - """ - Parallelized :func:`~RegularizedDartsMutator.reset_with_loss`. - """ - result = self.module.reset_with_loss() - self.callback_queued = False - return result - - def cut_choices(self, *args, **kwargs): - """ - Parallelized :func:`~RegularizedDartsMutator.cut_choices`. - """ - self.module.cut_choices(*args, **kwargs) - - def export(self, logger): - """ - Parallelized :func:`~RegularizedDartsMutator.export`. - """ - return self.module.export(logger) - - -class DartsDiscreteMutator(Mutator): - """ - A mutator that applies the final sampling result of a parent mutator on another model to train. - - Parameters - ---------- - model : nn.Module - The model to apply the mutator. - parent_mutator : nni.nas.pytorch.mutator.Mutator - The mutator that provides ``sample_final`` method, that will be called to get the architecture. - """ - def __init__(self, model, parent_mutator): - super().__init__(model) - self.__dict__["parent_mutator"] = parent_mutator # avoid parameters to be included - - def sample_search(self): - return self.parent_mutator.sample_final() diff --git a/nni/algorithms/nas/pytorch/cdarts/trainer.py b/nni/algorithms/nas/pytorch/cdarts/trainer.py deleted file mode 100644 index 1a5174216..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/trainer.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import json -import logging -import os - -import torch -import torch.nn as nn -import torch.nn.functional as F -import apex # pylint: disable=import-error -from apex.parallel import DistributedDataParallel # pylint: disable=import-error -from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator # pylint: disable=wrong-import-order -from nni.nas.pytorch.utils import AverageMeterGroup # pylint: disable=wrong-import-order - -from .utils import CyclicIterator, TorchTensorEncoder, accuracy, reduce_metrics - -PHASE_SMALL = "small" -PHASE_LARGE = "large" - - -class InteractiveKLLoss(nn.Module): - def __init__(self, temperature): - super().__init__() - self.temperature = temperature - # self.kl_loss = nn.KLDivLoss(reduction = 'batchmean') - self.kl_loss = nn.KLDivLoss() - - def forward(self, student, teacher): - return self.kl_loss(F.log_softmax(student / self.temperature, dim=1), - F.softmax(teacher / self.temperature, dim=1)) - - -class CdartsTrainer(object): - """ - CDARTS trainer. - - Parameters - ---------- - model_small : nn.Module - PyTorch model to be trained. This is the search network of CDARTS. - model_large : nn.Module - PyTorch model to be trained. This is the evaluation network of CDARTS. - criterion : callable - Receives logits and ground truth label, return a loss tensor, e.g., ``nn.CrossEntropyLoss()``. - loaders : list of torch.utils.data.DataLoader - List of train data and valid data loaders, for training weights and architecture weights respectively. - samplers : list of torch.utils.data.Sampler - List of train data and valid data samplers. This can be PyTorch standard samplers if not distributed. - In distributed mode, sampler needs to have ``set_epoch`` method. Refer to data utils in CDARTS example for details. - logger : logging.Logger - The logger for logging. Will use nni logger by default (if logger is ``None``). - regular_coeff : float - The coefficient of regular loss. - regular_ratio : float - The ratio of regular loss. - warmup_epochs : int - The epochs to warmup the search network - fix_head : bool - ``True`` if fixing the paramters of auxiliary heads, else unfix the paramters of auxiliary heads. - epochs : int - Number of epochs planned for training. - steps_per_epoch : int - Steps of one epoch. - loss_alpha : float - The loss coefficient. - loss_T : float - The loss coefficient. - distributed : bool - ``True`` if using distributed training, else non-distributed training. - log_frequency : int - Step count per logging. - grad_clip : float - Gradient clipping for weights. - interactive_type : string - ``kl`` or ``smoothl1``. - output_path : string - Log storage path. - w_lr : float - Learning rate of the search network parameters. - w_momentum : float - Momentum of the search and the evaluation network. - w_weight_decay : float - The weight decay the search and the evaluation network parameters. - alpha_lr : float - Learning rate of the architecture parameters. - alpha_weight_decay : float - The weight decay the architecture parameters. - nasnet_lr : float - Learning rate of the evaluation network parameters. - local_rank : int - The number of thread. - share_module : bool - ``True`` if sharing the stem and auxiliary heads, else not sharing these modules. 
- """ - def __init__(self, model_small, model_large, criterion, loaders, samplers, logger=None, - regular_coeff=5, regular_ratio=0.2, warmup_epochs=2, fix_head=True, - epochs=32, steps_per_epoch=None, loss_alpha=2, loss_T=2, distributed=True, - log_frequency=10, grad_clip=5.0, interactive_type='kl', output_path='./outputs', - w_lr=0.2, w_momentum=0.9, w_weight_decay=3e-4, alpha_lr=0.2, alpha_weight_decay=1e-4, - nasnet_lr=0.2, local_rank=0, share_module=True): - if logger is None: - logger = logging.getLogger(__name__) - train_loader, valid_loader = loaders - train_sampler, valid_sampler = samplers - self.train_loader = CyclicIterator(train_loader, train_sampler, distributed) - self.valid_loader = CyclicIterator(valid_loader, valid_sampler, distributed) - - self.regular_coeff = regular_coeff - self.regular_ratio = regular_ratio - self.warmup_epochs = warmup_epochs - self.fix_head = fix_head - self.epochs = epochs - self.steps_per_epoch = steps_per_epoch - if self.steps_per_epoch is None: - self.steps_per_epoch = min(len(self.train_loader), len(self.valid_loader)) - self.loss_alpha = loss_alpha - self.grad_clip = grad_clip - if interactive_type == "kl": - self.interactive_loss = InteractiveKLLoss(loss_T) - elif interactive_type == "smoothl1": - self.interactive_loss = nn.SmoothL1Loss() - self.loss_T = loss_T - self.distributed = distributed - self.log_frequency = log_frequency - self.main_proc = not distributed or local_rank == 0 - - self.logger = logger - self.checkpoint_dir = output_path - if self.main_proc: - os.makedirs(self.checkpoint_dir, exist_ok=True) - if distributed: - torch.distributed.barrier() - - self.model_small = model_small - self.model_large = model_large - if self.fix_head: - for param in self.model_small.aux_head.parameters(): - param.requires_grad = False - for param in self.model_large.aux_head.parameters(): - param.requires_grad = False - - self.mutator_small = RegularizedDartsMutator(self.model_small).cuda() - self.mutator_large = DartsDiscreteMutator(self.model_large, self.mutator_small).cuda() - self.criterion = criterion - - self.optimizer_small = torch.optim.SGD(self.model_small.parameters(), w_lr, - momentum=w_momentum, weight_decay=w_weight_decay) - self.optimizer_large = torch.optim.SGD(self.model_large.parameters(), nasnet_lr, - momentum=w_momentum, weight_decay=w_weight_decay) - self.optimizer_alpha = torch.optim.Adam(self.mutator_small.parameters(), alpha_lr, - betas=(0.5, 0.999), weight_decay=alpha_weight_decay) - - if distributed: - apex.parallel.convert_syncbn_model(self.model_small) - apex.parallel.convert_syncbn_model(self.model_large) - self.model_small = DistributedDataParallel(self.model_small, delay_allreduce=True) - self.model_large = DistributedDataParallel(self.model_large, delay_allreduce=True) - self.mutator_small = RegularizedMutatorParallel(self.mutator_small, delay_allreduce=True) - if share_module: - self.model_small.callback_queued = True - self.model_large.callback_queued = True - # mutator large never gets optimized, so do not need parallelized - - def _warmup(self, phase, epoch): - assert phase in [PHASE_SMALL, PHASE_LARGE] - if phase == PHASE_SMALL: - model, optimizer = self.model_small, self.optimizer_small - elif phase == PHASE_LARGE: - model, optimizer = self.model_large, self.optimizer_large - model.train() - meters = AverageMeterGroup() - for step in range(self.steps_per_epoch): - x, y = next(self.train_loader) - x, y = x.cuda(), y.cuda() - - optimizer.zero_grad() - logits_main, _ = model(x) - loss = self.criterion(logits_main, 
y) - loss.backward() - - self._clip_grad_norm(model) - optimizer.step() - prec1, prec5 = accuracy(logits_main, y, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics, self.distributed) - meters.update(metrics) - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - self.logger.info("Epoch [%d/%d] Step [%d/%d] (%s) %s", epoch + 1, self.epochs, - step + 1, self.steps_per_epoch, phase, meters) - - def _clip_grad_norm(self, model): - if isinstance(model, DistributedDataParallel): - nn.utils.clip_grad_norm_(model.module.parameters(), self.grad_clip) - else: - nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip) - - def _reset_nan(self, parameters): - with torch.no_grad(): - for param in parameters: - for i, p in enumerate(param): - if p != p: # equivalent to `isnan(p)` - param[i] = float("-inf") - - def _joint_train(self, epoch): - self.model_large.train() - self.model_small.train() - meters = AverageMeterGroup() - for step in range(self.steps_per_epoch): - trn_x, trn_y = next(self.train_loader) - val_x, val_y = next(self.valid_loader) - trn_x, trn_y = trn_x.cuda(), trn_y.cuda() - val_x, val_y = val_x.cuda(), val_y.cuda() - - # step 1. optimize architecture - self.optimizer_alpha.zero_grad() - self.optimizer_large.zero_grad() - reg_decay = max(self.regular_coeff * (1 - float(epoch - self.warmup_epochs) / ( - (self.epochs - self.warmup_epochs) * self.regular_ratio)), 0) - loss_regular = self.mutator_small.reset_with_loss() - if loss_regular: - loss_regular *= reg_decay - logits_search, emsemble_logits_search = self.model_small(val_x) - logits_main, emsemble_logits_main = self.model_large(val_x) - loss_cls = (self.criterion(logits_search, val_y) + self.criterion(logits_main, val_y)) / self.loss_alpha - loss_interactive = self.interactive_loss(emsemble_logits_search, emsemble_logits_main) * (self.loss_T ** 2) * self.loss_alpha - loss = loss_cls + loss_interactive + loss_regular - loss.backward() - self._clip_grad_norm(self.model_large) - self.optimizer_large.step() - self.optimizer_alpha.step() - # NOTE: need to call here `self._reset_nan(self.mutator_small.parameters())` if `cut_choices` - - # step 2. 
optimize op weights - self.optimizer_small.zero_grad() - with torch.no_grad(): - # resample architecture since parameters have been changed - self.mutator_small.reset_with_loss() - logits_search_train, _ = self.model_small(trn_x) - loss_weight = self.criterion(logits_search_train, trn_y) - loss_weight.backward() - self._clip_grad_norm(self.model_small) - self.optimizer_small.step() - - metrics = {"loss_cls": loss_cls, "loss_interactive": loss_interactive, - "loss_regular": loss_regular, "loss_weight": loss_weight} - metrics = reduce_metrics(metrics, self.distributed) - meters.update(metrics) - - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - self.logger.info("Epoch [%d/%d] Step [%d/%d] (joint) %s", epoch + 1, self.epochs, - step + 1, self.steps_per_epoch, meters) - - def train(self): - for epoch in range(self.epochs): - if epoch < self.warmup_epochs: - with torch.no_grad(): # otherwise grads will be retained on the architecture params - self.mutator_small.reset_with_loss() - self._warmup(PHASE_SMALL, epoch) - else: - with torch.no_grad(): - self.mutator_large.reset() - self._warmup(PHASE_LARGE, epoch) - self._joint_train(epoch) - - self.export(os.path.join(self.checkpoint_dir, "epoch_{:02d}.json".format(epoch)), - os.path.join(self.checkpoint_dir, "epoch_{:02d}.genotypes".format(epoch))) - - def export(self, file, genotype_file): - if self.main_proc: - mutator_export, genotypes = self.mutator_small.export(self.logger) - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - with open(genotype_file, "w") as f: - f.write(str(genotypes)) diff --git a/nni/algorithms/nas/pytorch/cdarts/utils.py b/nni/algorithms/nas/pytorch/cdarts/utils.py deleted file mode 100644 index 96afa9425..000000000 --- a/nni/algorithms/nas/pytorch/cdarts/utils.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
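The interactive term in ``_joint_train`` above is a standard temperature-scaled distillation loss between the two networks' ensemble logits. A self-contained sketch of just that piece, with random logits and illustrative hyper-parameter values:

```python
# Sketch of the temperature-scaled KL term used by CdartsTrainer._joint_train.
# Logits and hyper-parameter values below are placeholders for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F

class InteractiveKLLoss(nn.Module):
    def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        self.kl_loss = nn.KLDivLoss()

    def forward(self, student, teacher):
        # KL divergence between softened student and teacher distributions.
        return self.kl_loss(F.log_softmax(student / self.temperature, dim=1),
                            F.softmax(teacher / self.temperature, dim=1))

loss_T, loss_alpha = 2.0, 2.0
student_logits, teacher_logits = torch.randn(8, 10), torch.randn(8, 10)
# The trainer multiplies by T**2 to keep the gradient scale roughly independent
# of the temperature, then by the alpha coefficient.
loss_interactive = InteractiveKLLoss(loss_T)(student_logits, teacher_logits) * loss_T ** 2 * loss_alpha
print(loss_interactive.item())
```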
- -import json -import os - -import torch -import torch.distributed as dist - - -class CyclicIterator: - def __init__(self, loader, sampler, distributed): - self.loader = loader - self.sampler = sampler - self.epoch = 0 - self.distributed = distributed - self._next_epoch() - - def _next_epoch(self): - if self.distributed: - self.sampler.set_epoch(self.epoch) - self.iterator = iter(self.loader) - self.epoch += 1 - - def __len__(self): - return len(self.loader) - - def __iter__(self): - return self - - def __next__(self): - try: - return next(self.iterator) - except StopIteration: - self._next_epoch() - return next(self.iterator) - - -class TorchTensorEncoder(json.JSONEncoder): - def default(self, o): # pylint: disable=method-hidden - if isinstance(o, torch.Tensor): - return o.tolist() - return super().default(o) - - -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0) - res.append(correct_k.mul_(1.0 / batch_size)) - return res - - -def reduce_tensor(tensor): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= float(os.environ["WORLD_SIZE"]) - return rt - - -def reduce_metrics(metrics, distributed=False): - if distributed: - return {k: reduce_tensor(v).item() for k, v in metrics.items()} - return {k: v.item() for k, v in metrics.items()} diff --git a/nni/algorithms/nas/pytorch/classic_nas/__init__.py b/nni/algorithms/nas/pytorch/classic_nas/__init__.py deleted file mode 100644 index ec3f5a489..000000000 --- a/nni/algorithms/nas/pytorch/classic_nas/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import get_and_apply_next_architecture diff --git a/nni/algorithms/nas/pytorch/classic_nas/mutator.py b/nni/algorithms/nas/pytorch/classic_nas/mutator.py deleted file mode 100644 index 7254a8b0b..000000000 --- a/nni/algorithms/nas/pytorch/classic_nas/mutator.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import sys - -import torch - -import nni -from nni.runtime.env_vars import trial_env_vars -from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope -from nni.nas.pytorch.mutator import Mutator - -logger = logging.getLogger(__name__) - -NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE" -LAYER_CHOICE = "layer_choice" -INPUT_CHOICE = "input_choice" - - -def get_and_apply_next_architecture(model): - """ - Wrapper of :class:`~nni.nas.pytorch.classic_nas.mutator.ClassicMutator` to make it more meaningful, - similar to ``get_next_parameter`` for HPO. - - It will generate search space based on ``model``. - If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for - generating search space for the experiment. - If not, there are still two mode, one is nni experiment mode where users - use ``nnictl`` to start an experiment. The other is standalone mode - where users directly run the trial command, this mode chooses the first - one(s) for each LayerChoice and InputChoice. - - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. 
- """ - ClassicMutator(model) - - -class ClassicMutator(Mutator): - """ - This mutator is to apply the architecture chosen from tuner. - It implements the forward function of LayerChoice and InputChoice, - to only activate the chosen ones. - - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - - def __init__(self, model): - super(ClassicMutator, self).__init__(model) - self._chosen_arch = {} - self._search_space = self._generate_search_space() - if NNI_GEN_SEARCH_SPACE in os.environ: - # dry run for only generating search space - self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE]) - sys.exit(0) - - if trial_env_vars.NNI_PLATFORM is None: - logger.warning("This is in standalone mode, the chosen are the first one(s).") - self._chosen_arch = self._standalone_generate_chosen() - else: - # get chosen arch from tuner - self._chosen_arch = nni.get_next_parameter() - if self._chosen_arch is None: - if trial_env_vars.NNI_PLATFORM == "unittest": - # happens if NNI_PLATFORM is intentionally set, e.g., in UT - logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.") - self._chosen_arch = self._standalone_generate_chosen() - else: - raise RuntimeError("Chosen architecture is None. This may be a platform error.") - self.reset() - - def _sample_layer_choice(self, mutable, idx, value, search_space_item): - """ - Convert layer choice to tensor representation. - - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - # doesn't support multihot for layer choice yet - onehot_list = [False] * len(mutable) - assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) - onehot_list[idx] = True - return torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable - - def _sample_input_choice(self, mutable, idx, value, search_space_item): - """ - Convert input choice to tensor representation. - - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - candidate_repr = search_space_item["candidates"] - multihot_list = [False] * mutable.n_candidates - for i, v in zip(idx, value): - assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \ - "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v) - assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) - multihot_list[i] = True - return torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable - - def sample_search(self): - """ - See :meth:`sample_final`. - """ - return self.sample_final() - - def sample_final(self): - """ - Convert the chosen arch and apply it on model. 
- """ - assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ - "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), - self._chosen_arch.keys()) - result = dict() - for mutable in self.mutables: - if isinstance(mutable, (LayerChoice, InputChoice)): - assert mutable.key in self._chosen_arch, \ - "Expected '{}' in chosen arch, but not found.".format(mutable.key) - data = self._chosen_arch[mutable.key] - assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ - "'{}' is not a valid choice.".format(data) - if isinstance(mutable, LayerChoice): - result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, InputChoice): - result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return result - - def _standalone_generate_chosen(self): - """ - Generate the chosen architecture for standalone mode, - i.e., choose the first one(s) for LayerChoice and InputChoice. - :: - { key_name: {"_value": "conv1", - "_idx": 0} } - { key_name: {"_value": ["in1"], - "_idx": [0]} } - Returns - ------- - dict - the chosen architecture - """ - chosen_arch = {} - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - chosen_arch[key] = {"_value": choices[0], "_idx": 0} - elif val["_type"] == INPUT_CHOICE: - choices = val["_value"]["candidates"] - n_chosen = val["_value"]["n_chosen"] - if n_chosen is None: - n_chosen = len(choices) - chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} - else: - raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) - return chosen_arch - - def _generate_search_space(self): - """ - Generate search space from mutables. - Here is the search space format: - :: - { key_name: {"_type": "layer_choice", - "_value": ["conv1", "conv2"]} } - { key_name: {"_type": "input_choice", - "_value": {"candidates": ["in1", "in2"], - "n_chosen": 1}} } - Returns - ------- - dict - the generated search space - """ - search_space = {} - for mutable in self.mutables: - # for now we only generate flattened search space - if isinstance(mutable, LayerChoice): - key = mutable.key - val = mutable.names - search_space[key] = {"_type": LAYER_CHOICE, "_value": val} - elif isinstance(mutable, InputChoice): - key = mutable.key - search_space[key] = {"_type": INPUT_CHOICE, - "_value": {"candidates": mutable.choose_from, - "n_chosen": mutable.n_chosen}} - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return search_space - - def _dump_search_space(self, file_path): - with open(file_path, "w") as ss_file: - json.dump(self._search_space, ss_file, sort_keys=True, indent=2) diff --git a/nni/algorithms/nas/pytorch/cream/__init__.py b/nni/algorithms/nas/pytorch/cream/__init__.py deleted file mode 100755 index 43a038b46..000000000 --- a/nni/algorithms/nas/pytorch/cream/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from .trainer import CreamSupernetTrainer diff --git a/nni/algorithms/nas/pytorch/cream/trainer.py b/nni/algorithms/nas/pytorch/cream/trainer.py deleted file mode 100644 index b44f40466..000000000 --- a/nni/algorithms/nas/pytorch/cream/trainer.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from copy import deepcopy - -import torch -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .utils import accuracy, reduce_metrics - -logger = logging.getLogger(__name__) - - -class CreamSupernetTrainer(Trainer): - """ - This trainer trains a supernet and output prioritized architectures that can be used for other tasks. - - Parameters - ---------- - model : nn.Module - Model with mutables. - loss : callable - Called with logits and targets. Returns a loss tensor. - val_loss : callable - Called with logits and targets for validation only. Returns a loss tensor. - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - train_loader : iterablez - Data loader of training. Raise ``StopIteration`` when one epoch is exhausted. - valid_loader : iterablez - Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted. - mutator : Mutator - A mutator object that has been initialized with the model. - batch_size : int - Batch size. - log_frequency : int - Number of mini-batches to log metrics. - meta_sta_epoch : int - start epoch of using meta matching network to pick teacher architecture - update_iter : int - interval of updating meta matching networks - slices : int - batch size of mini training data in the process of training meta matching network - pool_size : int - board size - pick_method : basestring - how to pick teacher network - choice_num : int - number of operations in supernet - sta_num : int - layer number of each stage in supernet (5 stage in supernet) - acc_gap : int - maximum accuracy improvement to omit the limitation of flops - flops_dict : Dict - dictionary of each layer's operations in supernet - flops_fixed : int - flops of fixed part in supernet - local_rank : int - index of current rank - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. 
- """ - - def __init__(self, model, loss, val_loss, - optimizer, num_epochs, train_loader, valid_loader, - mutator=None, batch_size=64, log_frequency=None, - meta_sta_epoch=20, update_iter=200, slices=2, - pool_size=10, pick_method='meta', choice_num=6, - sta_num=(4, 4, 4, 4, 4), acc_gap=5, - flops_dict=None, flops_fixed=0, local_rank=0, callbacks=None): - assert torch.cuda.is_available() - super(CreamSupernetTrainer, self).__init__(model, mutator, loss, None, - optimizer, num_epochs, None, None, - batch_size, None, None, log_frequency, callbacks) - self.model = model - self.loss = loss - self.val_loss = val_loss - self.train_loader = train_loader - self.valid_loader = valid_loader - self.log_frequency = log_frequency - self.batch_size = batch_size - self.optimizer = optimizer - self.model = model - self.loss = loss - self.num_epochs = num_epochs - self.meta_sta_epoch = meta_sta_epoch - self.update_iter = update_iter - self.slices = slices - self.pick_method = pick_method - self.pool_size = pool_size - self.local_rank = local_rank - self.choice_num = choice_num - self.sta_num = sta_num - self.acc_gap = acc_gap - self.flops_dict = flops_dict - self.flops_fixed = flops_fixed - - self.current_student_arch = None - self.current_teacher_arch = None - self.main_proc = (local_rank == 0) - self.current_epoch = 0 - - self.prioritized_board = [] - - # size of prioritized board - def _board_size(self): - return len(self.prioritized_board) - - # select teacher architecture according to the logit difference - def _select_teacher(self): - self._replace_mutator_cand(self.current_student_arch) - - if self.pick_method == 'top1': - meta_value, teacher_cand = 0.5, sorted( - self.prioritized_board, reverse=True)[0][3] - elif self.pick_method == 'meta': - meta_value, cand_idx, teacher_cand = -1000000000, -1, None - for now_idx, item in enumerate(self.prioritized_board): - inputx = item[4] - output = torch.nn.functional.softmax(self.model(inputx), dim=1) - weight = self.model.module.forward_meta(output - item[5]) - if weight > meta_value: - meta_value = weight - cand_idx = now_idx - teacher_cand = self.prioritized_board[cand_idx][3] - assert teacher_cand is not None - meta_value = torch.nn.functional.sigmoid(-weight) - else: - raise ValueError('Method Not supported') - - return meta_value, teacher_cand - - # check whether to update prioritized board - def _isUpdateBoard(self, prec1, flops): - if self.current_epoch <= self.meta_sta_epoch: - return False - - if len(self.prioritized_board) < self.pool_size: - return True - - if prec1 > self.prioritized_board[-1][1] + self.acc_gap: - return True - - if prec1 > self.prioritized_board[-1][1] and flops < self.prioritized_board[-1][2]: - return True - - return False - - # update prioritized board - def _update_prioritized_board(self, inputs, teacher_output, outputs, prec1, flops): - if self._isUpdateBoard(prec1, flops): - val_prec1 = prec1 - training_data = deepcopy(inputs[:self.slices].detach()) - if len(self.prioritized_board) == 0: - features = deepcopy(outputs[:self.slices].detach()) - else: - features = deepcopy( - teacher_output[:self.slices].detach()) - self.prioritized_board.append( - (val_prec1, - prec1, - flops, - self.current_student_arch, - training_data, - torch.nn.functional.softmax( - features, - dim=1))) - self.prioritized_board = sorted( - self.prioritized_board, reverse=True) - - if len(self.prioritized_board) > self.pool_size: - del self.prioritized_board[-1] - - # only update student network weights - def _update_student_weights_only(self, grad_1): 
- for weight, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1): - weight.grad = grad_item - torch.nn.utils.clip_grad_norm_( - self.model.module.rand_parameters(self.current_student_arch), 1) - self.optimizer.step() - for weight, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1): - del weight.grad - - # only update meta networks weights - def _update_meta_weights_only(self, teacher_cand, grad_teacher): - for weight, grad_item in zip(self.model.module.rand_parameters( - teacher_cand, self.pick_method == 'meta'), grad_teacher): - weight.grad = grad_item - - # clip gradients - torch.nn.utils.clip_grad_norm_( - self.model.module.rand_parameters( - self.current_student_arch, self.pick_method == 'meta'), 1) - - self.optimizer.step() - for weight, grad_item in zip(self.model.module.rand_parameters( - teacher_cand, self.pick_method == 'meta'), grad_teacher): - del weight.grad - - # simulate sgd updating - def _simulate_sgd_update(self, w, g, optimizer): - return g * optimizer.param_groups[-1]['lr'] + w - - # split training images into several slices - def _get_minibatch_input(self, input): # pylint: disable=redefined-builtin - slice = self.slices # pylint: disable=redefined-builtin - x = deepcopy(input[:slice].clone().detach()) - return x - - # calculate 1st gradient of student architectures - def _calculate_1st_gradient(self, kd_loss): - self.optimizer.zero_grad() - grad = torch.autograd.grad( - kd_loss, - self.model.module.rand_parameters(self.current_student_arch), - create_graph=True) - return grad - - # calculate 2nd gradient of meta networks - def _calculate_2nd_gradient(self, validation_loss, teacher_cand, students_weight): - self.optimizer.zero_grad() - grad_student_val = torch.autograd.grad( - validation_loss, - self.model.module.rand_parameters(self.current_student_arch), - retain_graph=True) - - grad_teacher = torch.autograd.grad( - students_weight[0], - self.model.module.rand_parameters( - teacher_cand, - self.pick_method == 'meta'), - grad_outputs=grad_student_val) - return grad_teacher - - # forward training data - def _forward_training(self, x, meta_value): - self._replace_mutator_cand(self.current_student_arch) - output = self.model(x) - - with torch.no_grad(): - self._replace_mutator_cand(self.current_teacher_arch) - teacher_output = self.model(x) - soft_label = torch.nn.functional.softmax(teacher_output, dim=1) - - kd_loss = meta_value * \ - self._cross_entropy_loss_with_soft_target(output, soft_label) - return kd_loss - - # calculate soft target loss - def _cross_entropy_loss_with_soft_target(self, pred, soft_target): - logsoftmax = torch.nn.LogSoftmax() - return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) - - # forward validation data - def _forward_validation(self, input, target): # pylint: disable=redefined-builtin - slice = self.slices # pylint: disable=redefined-builtin - x = input[slice:slice * 2].clone() - - self._replace_mutator_cand(self.current_student_arch) - output_2 = self.model(x) - - validation_loss = self.loss(output_2, target[slice:slice * 2]) - return validation_loss - - def _isUpdateMeta(self, batch_idx): - isUpdate = True - isUpdate &= (self.current_epoch > self.meta_sta_epoch) - isUpdate &= (batch_idx > 0) - isUpdate &= (batch_idx % self.update_iter == 0) - isUpdate &= (self._board_size() > 0) - return isUpdate - - def _replace_mutator_cand(self, cand): - self.mutator._cache = cand - - # update meta matching networks - def _run_update(self, input, target, 
batch_idx): # pylint: disable=redefined-builtin - if self._isUpdateMeta(batch_idx): - x = self._get_minibatch_input(input) - - meta_value, teacher_cand = self._select_teacher() - - kd_loss = self._forward_training(x, meta_value) - - # calculate 1st gradient - grad_1st = self._calculate_1st_gradient(kd_loss) - - # simulate updated student weights - students_weight = [ - self._simulate_sgd_update( - p, grad_item, self.optimizer) for p, grad_item in zip( - self.model.module.rand_parameters(self.current_student_arch), grad_1st)] - - # update student weights - self._update_student_weights_only(grad_1st) - - validation_loss = self._forward_validation(input, target) - - # calculate 2nd gradient - grad_teacher = self._calculate_2nd_gradient(validation_loss, teacher_cand, students_weight) - - # update meta matching networks - self._update_meta_weights_only(teacher_cand, grad_teacher) - - # delete internal variants - del grad_teacher, grad_1st, x, validation_loss, kd_loss, students_weight - - def _get_cand_flops(self, cand): - flops = 0 - for block_id, block in enumerate(cand): - if block == 'LayerChoice1' or block_id == 'LayerChoice23': - continue - for idx, choice in enumerate(cand[block]): - flops += self.flops_dict[block_id][idx] * (1 if choice else 0) - return flops + self.flops_fixed - - def train_one_epoch(self, epoch): - self.current_epoch = epoch - meters = AverageMeterGroup() - self.steps_per_epoch = len(self.train_loader) - for step, (input_data, target) in enumerate(self.train_loader): - self.mutator.reset() - self.current_student_arch = self.mutator._cache - - input_data, target = input_data.cuda(), target.cuda() - - # calculate flops of current architecture - cand_flops = self._get_cand_flops(self.mutator._cache) - - # update meta matching network - self._run_update(input_data, target, step) - - if self._board_size() > 0: - # select teacher architecture - meta_value, teacher_cand = self._select_teacher() - self.current_teacher_arch = teacher_cand - - # forward supernet - if self._board_size() == 0 or epoch <= self.meta_sta_epoch: - self._replace_mutator_cand(self.current_student_arch) - output = self.model(input_data) - - loss = self.loss(output, target) - kd_loss, teacher_output, teacher_cand = None, None, None - else: - self._replace_mutator_cand(self.current_student_arch) - output = self.model(input_data) - - gt_loss = self.loss(output, target) - - with torch.no_grad(): - self._replace_mutator_cand(self.current_teacher_arch) - teacher_output = self.model(input_data).detach() - - soft_label = torch.nn.functional.softmax(teacher_output, dim=1) - kd_loss = self._cross_entropy_loss_with_soft_target(output, soft_label) - - loss = (meta_value * kd_loss + (2 - meta_value) * gt_loss) / 2 - - # update network - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - - # update metrics - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics) - meters.update(metrics) - - # update prioritized board - self._update_prioritized_board(input_data, teacher_output, output, metrics['prec1'], cand_flops) - - if self.main_proc and (step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch): - logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, self.num_epochs, - step + 1, len(self.train_loader), meters) - - if self.main_proc and self.num_epochs == epoch + 1: - for idx, i in enumerate(self.prioritized_board): - logger.info("No.%s %s", idx, i[:4]) - - def validate_one_epoch(self, epoch): 
- self.model.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - for step, (x, y) in enumerate(self.valid_loader): - self.mutator.reset() - logits = self.model(x) - loss = self.val_loss(logits, y) - prec1, prec5 = accuracy(logits, y, topk=(1, 5)) - metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} - metrics = reduce_metrics(metrics) - meters.update(metrics) - - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.valid_loader), meters) diff --git a/nni/algorithms/nas/pytorch/cream/utils.py b/nni/algorithms/nas/pytorch/cream/utils.py deleted file mode 100644 index 7d71faa71..000000000 --- a/nni/algorithms/nas/pytorch/cream/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - - -import os -import torch.distributed as dist - - -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.reshape(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(1.0 / batch_size)) - return res - - -def reduce_metrics(metrics): - return {k: reduce_tensor(v).item() for k, v in metrics.items()} - - -def reduce_tensor(tensor): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= float(os.environ["WORLD_SIZE"]) - return rt diff --git a/nni/algorithms/nas/pytorch/darts/__init__.py b/nni/algorithms/nas/pytorch/darts/__init__.py deleted file mode 100644 index 1a22790fb..000000000 --- a/nni/algorithms/nas/pytorch/darts/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import DartsMutator -from .trainer import DartsTrainer diff --git a/nni/algorithms/nas/pytorch/darts/mutator.py b/nni/algorithms/nas/pytorch/darts/mutator.py deleted file mode 100644 index a4c3898a9..000000000 --- a/nni/algorithms/nas/pytorch/darts/mutator.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice - -_logger = logging.getLogger(__name__) - - -class DartsMutator(Mutator): - """ - Connects the model in a DARTS (differentiable) way. - - An extra connection is automatically inserted for each LayerChoice, when this connection is selected, there is no - op on this LayerChoice (namely a ``ZeroOp``), in which case, every element in the exported choice list is ``false`` - (not chosen). - - All input choice will be fully connected in the search phase. On exporting, the input choice will choose inputs based - on keys in ``choose_from``. If the keys were to be keys of LayerChoices, the top logit of the corresponding LayerChoice - will join the competition of input choice to compete against other logits. Otherwise, the logit will be assumed 0. - - It's possible to cut branches by setting parameter ``choices`` in a particular position to ``-inf``. After softmax, the - value would be 0. Framework will ignore 0 values and not connect. 
Note that the gradient on the ``-inf`` location will - be 0. Since manipulations with ``-inf`` will be ``nan``, you need to handle the gradient update phase carefully. - - Attributes - ---------- - choices: ParameterDict - dict that maps keys of LayerChoices to weighted-connection float tensors. - """ - def __init__(self, model): - super().__init__(model) - self.choices = nn.ParameterDict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1)) - - def device(self): - for v in self.choices.values(): - return v.device - - def sample_search(self): - result = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1] - elif isinstance(mutable, InputChoice): - result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) - return result - - def sample_final(self): - result = dict() - edges_max = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - max_val, index = torch.max(F.softmax(self.choices[mutable.key], dim=-1)[:-1], 0) - edges_max[mutable.key] = max_val - result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool() - for mutable in self.mutables: - if isinstance(mutable, InputChoice): - if mutable.n_chosen is not None: - weights = [] - for src_key in mutable.choose_from: - if src_key not in edges_max: - _logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key) - weights.append(edges_max.get(src_key, 0.)) - weights = torch.tensor(weights) # pylint: disable=not-callable - _, topk_edge_indices = torch.topk(weights, mutable.n_chosen) - selected_multihot = [] - for i, src_key in enumerate(mutable.choose_from): - if i not in topk_edge_indices and src_key in result: - # If an edge is never selected, there is no need to calculate any op on this edge. - # This is to eliminate redundant calculation. - result[src_key] = torch.zeros_like(result[src_key]) - selected_multihot.append(i in topk_edge_indices) - result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable - else: - result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable - return result diff --git a/nni/algorithms/nas/pytorch/darts/trainer.py b/nni/algorithms/nas/pytorch/darts/trainer.py deleted file mode 100644 index e2d8e1866..000000000 --- a/nni/algorithms/nas/pytorch/darts/trainer.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import copy -import logging - -import torch -import torch.nn as nn -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .mutator import DartsMutator - -logger = logging.getLogger(__name__) - - -class DartsTrainer(Trainer): - """ - DARTS trainer. - - Parameters - ---------- - model : nn.Module - PyTorch model to be trained. - loss : callable - Receives logits and ground truth label, return a loss tensor. - metrics : callable - Receives logits and ground truth label, return a dict of metrics. - optimizer : Optimizer - The optimizer used for optimizing the model. - num_epochs : int - Number of epochs planned for training. - dataset_train : Dataset - Dataset for training. Will be split for training weights and architecture weights. 
- dataset_valid : Dataset - Dataset for testing. - mutator : DartsMutator - Use in case of customizing your own DartsMutator. By default will instantiate a DartsMutator. - batch_size : int - Batch size. - workers : int - Workers for data loading. - device : torch.device - ``torch.device("cpu")`` or ``torch.device("cuda")``. - log_frequency : int - Step count per logging. - callbacks : list of Callback - list of callbacks to trigger at events. - arc_learning_rate : float - Learning rate of architecture parameters. - unrolled : float - ``True`` if using second order optimization, else first order optimization. - """ - def __init__(self, model, loss, metrics, - optimizer, num_epochs, dataset_train, dataset_valid, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, - callbacks=None, arc_learning_rate=3.0E-4, unrolled=False): - super().__init__(model, mutator if mutator is not None else DartsMutator(model), - loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, - batch_size, workers, device, log_frequency, callbacks) - - self.ctrl_optim = torch.optim.Adam(self.mutator.parameters(), arc_learning_rate, betas=(0.5, 0.999), - weight_decay=1.0E-3) - self.unrolled = unrolled - - n_train = len(self.dataset_train) - split = n_train // 2 - indices = list(range(n_train)) - train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split]) - valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:]) - self.train_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=batch_size, - sampler=train_sampler, - num_workers=workers) - self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=batch_size, - sampler=valid_sampler, - num_workers=workers) - self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, - batch_size=batch_size, - num_workers=workers) - - def train_one_epoch(self, epoch): - self.model.train() - self.mutator.train() - meters = AverageMeterGroup() - for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)): - trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device) - val_X, val_y = val_X.to(self.device), val_y.to(self.device) - - # phase 1. architecture step - self.ctrl_optim.zero_grad() - if self.unrolled: - self._unrolled_backward(trn_X, trn_y, val_X, val_y) - else: - self._backward(val_X, val_y) - self.ctrl_optim.step() - - # phase 2: child network step - self.optimizer.zero_grad() - logits, loss = self._logits_and_loss(trn_X, trn_y) - loss.backward() - nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 
# gradient clipping - self.optimizer.step() - - metrics = self.metrics(logits, trn_y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.train_loader), meters) - - def validate_one_epoch(self, epoch): - self.model.eval() - self.mutator.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - self.mutator.reset() - for step, (X, y) in enumerate(self.test_loader): - X, y = X.to(self.device), y.to(self.device) - logits = self.model(X) - metrics = self.metrics(logits, y) - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.test_loader), meters) - - def _logits_and_loss(self, X, y): - self.mutator.reset() - logits = self.model(X) - loss = self.loss(logits, y) - self._write_graph_status() - return logits, loss - - def _backward(self, val_X, val_y): - """ - Simple backward with gradient descent - """ - _, loss = self._logits_and_loss(val_X, val_y) - loss.backward() - - def _unrolled_backward(self, trn_X, trn_y, val_X, val_y): - """ - Compute unrolled loss and backward its gradients - """ - backup_params = copy.deepcopy(tuple(self.model.parameters())) - - # do virtual step on training data - lr = self.optimizer.param_groups[0]["lr"] - momentum = self.optimizer.param_groups[0]["momentum"] - weight_decay = self.optimizer.param_groups[0]["weight_decay"] - self._compute_virtual_model(trn_X, trn_y, lr, momentum, weight_decay) - - # calculate unrolled loss on validation data - # keep gradients for model here for compute hessian - _, loss = self._logits_and_loss(val_X, val_y) - w_model, w_ctrl = tuple(self.model.parameters()), tuple(self.mutator.parameters()) - w_grads = torch.autograd.grad(loss, w_model + w_ctrl) - d_model, d_ctrl = w_grads[:len(w_model)], w_grads[len(w_model):] - - # compute hessian and final gradients - hessian = self._compute_hessian(backup_params, d_model, trn_X, trn_y) - with torch.no_grad(): - for param, d, h in zip(w_ctrl, d_ctrl, hessian): - # gradient = dalpha - lr * hessian - param.grad = d - lr * h - - # restore weights - self._restore_weights(backup_params) - - def _compute_virtual_model(self, X, y, lr, momentum, weight_decay): - """ - Compute unrolled weights w` - """ - # don't need zero_grad, using autograd to calculate gradients - _, loss = self._logits_and_loss(X, y) - gradients = torch.autograd.grad(loss, self.model.parameters()) - with torch.no_grad(): - for w, g in zip(self.model.parameters(), gradients): - m = self.optimizer.state[w].get("momentum_buffer", 0.) - w = w - lr * (momentum * m + g + weight_decay * w) - - def _restore_weights(self, backup_params): - with torch.no_grad(): - for param, backup in zip(self.model.parameters(), backup_params): - param.copy_(backup) - - def _compute_hessian(self, backup_params, dw, trn_X, trn_y): - """ - dw = dw` { L_val(w`, alpha) } - w+ = w + eps * dw - w- = w - eps * dw - hessian = (dalpha { L_trn(w+, alpha) } - dalpha { L_trn(w-, alpha) }) / (2*eps) - eps = 0.01 / ||dw|| - """ - self._restore_weights(backup_params) - norm = torch.cat([w.view(-1) for w in dw]).norm() - eps = 0.01 / norm - if norm < 1E-8: - logger.warning("In computing hessian, norm is smaller than 1E-8, cause eps to be %.6f.", norm.item()) - - dalphas = [] - for e in [eps, -2. 
* eps]: - # w+ = w + eps*dw`, w- = w - eps*dw` - with torch.no_grad(): - for p, d in zip(self.model.parameters(), dw): - p += e * d - - _, loss = self._logits_and_loss(trn_X, trn_y) - dalphas.append(torch.autograd.grad(loss, self.mutator.parameters())) - - dalpha_pos, dalpha_neg = dalphas # dalpha { L_trn(w+) }, # dalpha { L_trn(w-) } - hessian = [(p - n) / (2. * eps) for p, n in zip(dalpha_pos, dalpha_neg)] - return hessian diff --git a/nni/algorithms/nas/pytorch/enas/__init__.py b/nni/algorithms/nas/pytorch/enas/__init__.py deleted file mode 100644 index d3372836e..000000000 --- a/nni/algorithms/nas/pytorch/enas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import EnasMutator -from .trainer import EnasTrainer diff --git a/nni/algorithms/nas/pytorch/enas/mutator.py b/nni/algorithms/nas/pytorch/enas/mutator.py deleted file mode 100644 index 7fdba26b9..000000000 --- a/nni/algorithms/nas/pytorch/enas/mutator.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope - - -class StackedLSTMCell(nn.Module): - def __init__(self, layers, size, bias): - super().__init__() - self.lstm_num_layers = layers - self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) - for _ in range(self.lstm_num_layers)]) - - def forward(self, inputs, hidden): - prev_h, prev_c = hidden - next_h, next_c = [], [] - for i, m in enumerate(self.lstm_modules): - curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) - next_c.append(curr_c) - next_h.append(curr_h) - # current implementation only supports batch size equals 1, - # but the algorithm does not necessarily have this limitation - inputs = curr_h[-1].view(1, -1) - return next_h, next_c - - -class EnasMutator(Mutator): - """ - A mutator that mutates the graph with RL. - - Parameters - ---------- - model : nn.Module - PyTorch model. - lstm_size : int - Controller LSTM hidden units. - lstm_num_layers : int - Number of layers for stacked LSTM. - tanh_constant : float - Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. - cell_exit_extra_step : bool - If true, RL controller will perform an extra step at the exit of each MutableScope, dump the hidden state - and mark it as the hidden state of this MutableScope. This is to align with the original implementation of paper. - skip_target : float - Target probability that skipconnect will appear. - temperature : float - Temperature constant that divides the logits. - branch_bias : float - Manual bias applied to make some operations more likely to be chosen. - Currently this is implemented with a hardcoded match rule that aligns with original repo. - If a mutable has a ``reduce`` in its key, all its op choices - that contains `conv` in their typename will receive a bias of ``+self.branch_bias`` initially; while others - receive a bias of ``-self.branch_bias``. - entropy_reduction : str - Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. 
- """ - - def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, cell_exit_extra_step=False, - skip_target=0.4, temperature=None, branch_bias=0.25, entropy_reduction="sum"): - super().__init__(model) - self.lstm_size = lstm_size - self.lstm_num_layers = lstm_num_layers - self.tanh_constant = tanh_constant - self.temperature = temperature - self.cell_exit_extra_step = cell_exit_extra_step - self.skip_target = skip_target - self.branch_bias = branch_bias - - self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) - self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) - self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) - self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), requires_grad=False) # pylint: disable=not-callable - assert entropy_reduction in ["sum", "mean"], "Entropy reduction must be one of sum and mean." - self.entropy_reduction = torch.sum if entropy_reduction == "sum" else torch.mean - self.cross_entropy_loss = nn.CrossEntropyLoss(reduction="none") - self.bias_dict = nn.ParameterDict() - - self.max_layer_choice = 0 - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - if self.max_layer_choice == 0: - self.max_layer_choice = len(mutable) - assert self.max_layer_choice == len(mutable), \ - "ENAS mutator requires all layer choice have the same number of candidates." - # We are judging by keys and module types to add biases to layer choices. Needs refactor. - if "reduce" in mutable.key: - def is_conv(choice): - return "conv" in str(type(choice)).lower() - bias = torch.tensor([self.branch_bias if is_conv(choice) else -self.branch_bias # pylint: disable=not-callable - for choice in mutable]) - self.bias_dict[mutable.key] = nn.Parameter(bias, requires_grad=False) - - self.embedding = nn.Embedding(self.max_layer_choice + 1, self.lstm_size) - self.soft = nn.Linear(self.lstm_size, self.max_layer_choice, bias=False) - - def sample_search(self): - self._initialize() - self._sample(self.mutables) - return self._choices - - def sample_final(self): - return self.sample_search() - - def _sample(self, tree): - mutable = tree.mutable - if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_layer_choice(mutable) - elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_input_choice(mutable) - for child in tree.children: - self._sample(child) - if isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: - if self.cell_exit_extra_step: - self._lstm_next_step() - self._mark_anchor(mutable.key) - - def _initialize(self): - self._choices = dict() - self._anchors_hid = dict() - self._inputs = self.g_emb.data - self._c = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self._h = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - def _lstm_next_step(self): - self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) - - def _mark_anchor(self, key): - self._anchors_hid[key] = self._h[-1] - - def _sample_layer_choice(self, mutable): - 
self._lstm_next_step() - logit = self.soft(self._h[-1]) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * torch.tanh(logit) - if mutable.key in self.bias_dict: - logit += self.bias_dict[mutable.key] - branch_id = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - log_prob = self.cross_entropy_loss(logit, branch_id) - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - self._inputs = self.embedding(branch_id) - return F.one_hot(branch_id, num_classes=self.max_layer_choice).bool().view(-1) - - def _sample_input_choice(self, mutable): - query, anchors = [], [] - for label in mutable.choose_from: - if label not in self._anchors_hid: - self._lstm_next_step() - self._mark_anchor(label) # empty loop, fill not found - query.append(self.attn_anchor(self._anchors_hid[label])) - anchors.append(self._anchors_hid[label]) - query = torch.cat(query, 0) - query = torch.tanh(query + self.attn_query(self._h[-1])) - query = self.v_attn(query) - if self.temperature is not None: - query /= self.temperature - if self.tanh_constant is not None: - query = self.tanh_constant * torch.tanh(query) - - if mutable.n_chosen is None: - logit = torch.cat([-query, query], 1) # pylint: disable=invalid-unary-operand-type - - skip = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip_prob = torch.sigmoid(logit) - kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(logit, skip) - self._inputs = (torch.matmul(skip.float(), torch.cat(anchors, 0)) / (1. + torch.sum(skip))).unsqueeze(0) - else: - assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." - logit = query.view(1, -1) - index = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip = F.one_hot(index, num_classes=mutable.n_candidates).view(-1) - log_prob = self.cross_entropy_loss(logit, index) - self._inputs = anchors[index.item()] - - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - return skip.bool() diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py deleted file mode 100644 index 5e7a96658..000000000 --- a/nni/algorithms/nas/pytorch/enas/trainer.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from itertools import cycle - -import torch -import torch.nn as nn -import torch.optim as optim - -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup, to_device -from .mutator import EnasMutator - -logger = logging.getLogger(__name__) - - -class EnasTrainer(Trainer): - """ - ENAS trainer. - - Parameters - ---------- - model : nn.Module - PyTorch model to be trained. - loss : callable - Receives logits and ground truth label, return a loss tensor. - metrics : callable - Receives logits and ground truth label, return a dict of metrics. - reward_function : callable - Receives logits and ground truth label, return a tensor, which will be feeded to RL controller as reward. - optimizer : Optimizer - The optimizer used for optimizing the model. 
- num_epochs : int - Number of epochs planned for training. - dataset_train : Dataset - Dataset for training. Will be split for training weights and architecture weights. - dataset_valid : Dataset - Dataset for testing. - mutator : EnasMutator - Use when customizing your own mutator or a mutator with customized parameters. - batch_size : int - Batch size. - workers : int - Workers for data loading. - device : torch.device - ``torch.device("cpu")`` or ``torch.device("cuda")``. - log_frequency : int - Step count per logging. - callbacks : list of Callback - list of callbacks to trigger at events. - entropy_weight : float - Weight of sample entropy loss. - skip_weight : float - Weight of skip penalty loss. - baseline_decay : float - Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``. - child_steps : int - How many mini-batches for model training per epoch. - mutator_lr : float - Learning rate for RL controller. - mutator_steps_aggregate : int - Number of steps that will be aggregated into one mini-batch for RL controller. - mutator_steps : int - Number of mini-batches for each epoch of RL controller learning. - aux_weight : float - Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss. - test_arc_per_epoch : int - How many architectures are chosen for direct test after each epoch. - """ - def __init__(self, model, loss, metrics, reward_function, - optimizer, num_epochs, dataset_train, dataset_valid, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None, - entropy_weight=0.0001, skip_weight=0.8, baseline_decay=0.999, child_steps=500, - mutator_lr=0.00035, mutator_steps_aggregate=20, mutator_steps=50, aux_weight=0.4, - test_arc_per_epoch=1): - super().__init__(model, mutator if mutator is not None else EnasMutator(model), - loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, - batch_size, workers, device, log_frequency, callbacks) - self.reward_function = reward_function - self.mutator_optim = optim.Adam(self.mutator.parameters(), lr=mutator_lr) - self.batch_size = batch_size - self.workers = workers - - self.entropy_weight = entropy_weight - self.skip_weight = skip_weight - self.baseline_decay = baseline_decay - self.baseline = 0. 
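        # The baseline initialized above is maintained in train_one_epoch as an
        # exponential moving average of the controller reward:
        #     baseline = baseline * baseline_decay + reward * (1 - baseline_decay)
        # and the controller minimizes the REINFORCE loss
        #     sample_log_prob * (reward - baseline) + skip_weight * sample_skip_penalty,
        # with the entropy bonus entropy_weight * sample_entropy added to the reward.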
- self.mutator_steps_aggregate = mutator_steps_aggregate - self.mutator_steps = mutator_steps - self.child_steps = child_steps - self.aux_weight = aux_weight - self.test_arc_per_epoch = test_arc_per_epoch - - self.init_dataloader() - - def init_dataloader(self): - n_train = len(self.dataset_train) - split = n_train // 10 - indices = list(range(n_train)) - train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split]) - valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:]) - self.train_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=self.batch_size, - sampler=train_sampler, - num_workers=self.workers) - self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=self.batch_size, - sampler=valid_sampler, - num_workers=self.workers) - self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, - batch_size=self.batch_size, - num_workers=self.workers) - self.train_loader = cycle(self.train_loader) - self.valid_loader = cycle(self.valid_loader) - - def train_one_epoch(self, epoch): - # Sample model and train - self.model.train() - self.mutator.eval() - meters = AverageMeterGroup() - for step in range(1, self.child_steps + 1): - x, y = next(self.train_loader) - x, y = to_device(x, self.device), to_device(y, self.device) - self.optimizer.zero_grad() - - with torch.no_grad(): - self.mutator.reset() - self._write_graph_status() - logits = self.model(x) - - if isinstance(logits, tuple): - logits, aux_logits = logits - aux_loss = self.loss(aux_logits, y) - else: - aux_loss = 0. - metrics = self.metrics(logits, y) - loss = self.loss(logits, y) - loss = loss + self.aux_weight * aux_loss - loss.backward() - nn.utils.clip_grad_norm_(self.model.parameters(), 5.) - self.optimizer.step() - metrics["loss"] = loss.item() - meters.update(metrics) - - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, - self.num_epochs, step, self.child_steps, meters) - - # Train sampler (mutator) - self.model.eval() - self.mutator.train() - meters = AverageMeterGroup() - for mutator_step in range(1, self.mutator_steps + 1): - self.mutator_optim.zero_grad() - for step in range(1, self.mutator_steps_aggregate + 1): - x, y = next(self.valid_loader) - x, y = to_device(x, self.device), to_device(y, self.device) - - self.mutator.reset() - with torch.no_grad(): - logits = self.model(x) - self._write_graph_status() - metrics = self.metrics(logits, y) - reward = self.reward_function(logits, y) - if self.entropy_weight: - reward += self.entropy_weight * self.mutator.sample_entropy.item() - self.baseline = self.baseline * self.baseline_decay + reward * (1 - self.baseline_decay) - loss = self.mutator.sample_log_prob * (reward - self.baseline) - if self.skip_weight: - loss += self.skip_weight * self.mutator.sample_skip_penalty - metrics["reward"] = reward - metrics["loss"] = loss.item() - metrics["ent"] = self.mutator.sample_entropy.item() - metrics["log_prob"] = self.mutator.sample_log_prob.item() - metrics["baseline"] = self.baseline - metrics["skip"] = self.mutator.sample_skip_penalty - - loss /= self.mutator_steps_aggregate - loss.backward() - meters.update(metrics) - - cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate - if self.log_frequency is not None and cur_step % self.log_frequency == 0: - logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs, - mutator_step, self.mutator_steps, step, 
self.mutator_steps_aggregate, - meters) - - nn.utils.clip_grad_norm_(self.mutator.parameters(), 5.) - self.mutator_optim.step() - - def validate_one_epoch(self, epoch): - with torch.no_grad(): - for arc_id in range(self.test_arc_per_epoch): - meters = AverageMeterGroup() - for x, y in self.test_loader: - x, y = to_device(x, self.device), to_device(y, self.device) - self.mutator.reset() - logits = self.model(x) - if isinstance(logits, tuple): - logits, _ = logits - metrics = self.metrics(logits, y) - loss = self.loss(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - - logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", - epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch, - meters.summary()) diff --git a/nni/algorithms/nas/pytorch/fbnet/__init__.py b/nni/algorithms/nas/pytorch/fbnet/__init__.py deleted file mode 100644 index 38d96327e..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from __future__ import absolute_import - -from .mutator import FBNetMutator # noqa: F401 -from .trainer import FBNetTrainer # noqa: F401 -from .utils import ( # noqa: F401 - LookUpTable, - NASConfig, - RegularizerLoss, - model_init, - supernet_sample, -) diff --git a/nni/algorithms/nas/pytorch/fbnet/mutator.py b/nni/algorithms/nas/pytorch/fbnet/mutator.py deleted file mode 100644 index 914063b82..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/mutator.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from __future__ import absolute_import, division, print_function - -import torch -from torch import nn as nn -from torch.nn import functional as F -import numpy as np - -from nni.nas.pytorch.base_mutator import BaseMutator -from nni.nas.pytorch.mutables import LayerChoice - - -class MixedOp(nn.Module): - """ - This class is to instantiate and manage info of one LayerChoice. - It includes architecture weights and member functions for the weights. - """ - - def __init__(self, mutable, latency): - """ - Parameters - ---------- - mutable : LayerChoice - A LayerChoice in user model - latency : List - performance cost for each op in mutable - """ - super(MixedOp, self).__init__() - self.latency = latency - n_choices = len(mutable) - self.path_alpha = nn.Parameter( - torch.FloatTensor([1.0 / n_choices for i in range(n_choices)]) - ) - self.path_alpha.requires_grad = False - self.temperature = 1.0 - - def get_path_alpha(self): - """Return the architecture parameter.""" - return self.path_alpha - - def get_weighted_latency(self): - """Return the weighted perf_cost of current mutable.""" - soft_masks = self.probs_over_ops() - weighted_latency = sum(m * l for m, l in zip(soft_masks, self.latency)) - return weighted_latency - - def set_temperature(self, temperature): - """ - Set the annealed temperature for gumbel softmax. - - Parameters - ---------- - temperature : float - The annealed temperature for gumbel softmax - """ - self.temperature = temperature - - def to_requires_grad(self): - """Enable gradient calculation.""" - self.path_alpha.requires_grad = True - - def to_disable_grad(self): - """Disable gradient calculation.""" - self.path_alpha.requires_grad = False - - def probs_over_ops(self): - """Apply gumbel softmax to generate probability distribution.""" - return F.gumbel_softmax(self.path_alpha, self.temperature) - - def forward(self, mutable, x): - """ - Define forward of LayerChoice. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - x : tensor - inputs of this layer, only support one input - - Returns - ------- - output: tensor - output of this layer - """ - candidate_ops = list(mutable) - soft_masks = self.probs_over_ops() - output = sum(m * op(x) for m, op in zip(soft_masks, candidate_ops)) - - return output - - @property - def chosen_index(self): - """ - choose the op with max prob - - Returns - ------- - int - index of the chosen one - """ - alphas = self.path_alpha.data.detach().cpu().numpy() - index = int(np.argmax(alphas)) - return index - - -class FBNetMutator(BaseMutator): - """ - This mutator initializes and operates all the LayerChoices of the supernet. - It is for the related trainer to control the training flow of LayerChoices, - coordinating with whole training process. - """ - - def __init__(self, model, lookup_table): - """ - Init a MixedOp instance for each mutable i.e., LayerChoice. - And register the instantiated MixedOp in corresponding LayerChoice. - If does not register it in LayerChoice, DataParallel does'nt work then, - for architecture weights are not included in the DataParallel model. - When MixedOPs are registered, we use ```requires_grad``` to control - whether calculate gradients of architecture weights. - - Parameters - ---------- - model : pytorch model - The model that users want to tune, - it includes search space defined with nni nas apis - lookup_table : class - lookup table object to manage model space information, - including candidate ops for each stage as the model space, - input channels/output channels/stride/fm_size as the layer config, - and the performance information for perf_cost accumulation. - - """ - super(FBNetMutator, self).__init__(model) - self.mutable_list = [] - - # Collect the op names of the candidate ops within each mutable - ops_names_mutable = dict() - left = 0 - right = 1 - for stage_name in lookup_table.layer_num: - right = lookup_table.layer_num[stage_name] - stage_ops = lookup_table.lut_ops[stage_name] - ops_names = [op_name for op_name in stage_ops] - - for i in range(left, left + right): - ops_names_mutable[i] = ops_names - left += right - - # Create the mixed op - for i, mutable in enumerate(self.undedup_mutables): - ops_names = ops_names_mutable[i] - latency_mutable = lookup_table.lut_perf[i] - latency = [latency_mutable[op_name] for op_name in ops_names] - self.mutable_list.append(mutable) - mutable.registered_module = MixedOp(mutable, latency) - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callback of layer choice forward. This function defines the forward - logic of the input mutable. So mutable is only interface, its real - implementation is defined in mutator. - - Parameters - ---------- - mutable: LayerChoice - forward logic of this input mutable - args: list of torch.Tensor - inputs of this mutable - kwargs: dict - inputs of this mutable - - Returns - ------- - torch.Tensor - output of this mutable, i.e., LayerChoice - int - index of the chosen op - """ - # FIXME: return mask, to be consistent with other algorithms - idx = mutable.registered_module.chosen_index - return mutable.registered_module(mutable, *args, **kwargs), idx - - def num_arch_params(self): - """ - The number of mutables, i.e., LayerChoice - - Returns - ------- - int - the number of LayerChoice in user model - """ - return len(self.mutable_list) - - def get_architecture_parameters(self): - """ - Get all the architecture parameters. 
- - yield - ----- - PyTorch Parameter - Return path_alpha of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_path_alpha() - - def get_weighted_latency(self): - """ - Get the latency weighted by gumbel softmax coefficients. - - yield - ----- - Tuple - Return the weighted_latency of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_weighted_latency() - - def set_temperature(self, temperature): - """ - Set the annealed temperature of the op for gumbel softmax. - - Parameters - ---------- - temperature : float - The annealed temperature for gumbel softmax - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_temperature(temperature) - - def arch_requires_grad(self): - """ - Make architecture weights require gradient - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_requires_grad() - - def arch_disable_grad(self): - """ - Disable gradient of architecture weights, i.e., does not - calculate gradient for them. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_disable_grad() - - def sample_final(self): - """ - Generate the final chosen architecture. - - Returns - ------- - dict - the choice of each mutable, i.e., LayerChoice - """ - result = dict() - for mutable in self.undedup_mutables: - assert isinstance(mutable, LayerChoice) - index = mutable.registered_module.chosen_index - # pylint: disable=not-callable - result[mutable.key] = ( - F.one_hot(torch.tensor(index), num_classes=len(mutable)) - .view(-1) - .bool(), - ) - return result diff --git a/nni/algorithms/nas/pytorch/fbnet/trainer.py b/nni/algorithms/nas/pytorch/fbnet/trainer.py deleted file mode 100644 index 1eaababef..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/trainer.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
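The MixedOp removed above relaxes each LayerChoice with Gumbel-Softmax weights over its candidate ops and reuses the same weights to form a differentiable latency estimate. A minimal standalone sketch of that idea in plain PyTorch (the names `candidate_ops`, `op_latency` and `alpha` are illustrative placeholders, not NNI objects):

import torch
import torch.nn.functional as F

# Illustrative FBNet-style relaxation: three candidate ops with hypothetical
# per-op latency costs taken from a lookup table.
candidate_ops = [torch.nn.Conv2d(8, 8, 3, padding=1),
                 torch.nn.Conv2d(8, 8, 5, padding=2),
                 torch.nn.Identity()]
op_latency = torch.tensor([1.7, 3.2, 0.1])                   # perf_cost per op
alpha = torch.zeros(len(candidate_ops), requires_grad=True)  # architecture weights
temperature = 5.0                                            # annealed during search

x = torch.randn(2, 8, 16, 16)
gates = F.gumbel_softmax(alpha, tau=temperature)             # soft one-hot sample
output = sum(g * op(x) for g, op in zip(gates, candidate_ops))
expected_latency = torch.dot(gates, op_latency)              # differentiable latency term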
- -from __future__ import absolute_import, division, print_function - -import json -import os -import time -import torch - -import numpy as np - -from torch.autograd import Variable -from nni.nas.pytorch.base_trainer import BaseTrainer -from nni.nas.pytorch.trainer import TorchTensorEncoder -from nni.nas.pytorch.utils import AverageMeter -from .mutator import FBNetMutator -from .utils import RegularizerLoss, accuracy - - -class FBNetTrainer(BaseTrainer): - def __init__( - self, - model, - model_optim, - criterion, - device, - device_ids, - lookup_table, - train_loader, - valid_loader, - n_epochs=120, - load_ckpt=False, - arch_path=None, - logger=None, - ): - """ - Parameters - ---------- - model : pytorch model - the user model, which has mutables - model_optim : pytorch optimizer - the user defined optimizer - criterion : pytorch loss - the main task loss, nn.CrossEntropyLoss() is for classification - device : pytorch device - the devices to train/search the model - device_ids : list of int - the indexes of devices used for training - lookup_table : class - lookup table object for fbnet training - train_loader : pytorch data loader - data loader for the training set - valid_loader : pytorch data loader - data loader for the validation set - n_epochs : int - number of epochs to train/search - load_ckpt : bool - whether load checkpoint - arch_path : str - the path to store chosen architecture - logger : logger - the logger - """ - self.model = model - self.model_optim = model_optim - self.train_loader = train_loader - self.valid_loader = valid_loader - self.device = device - self.dev_num = len(device_ids) - self.n_epochs = n_epochs - self.lookup_table = lookup_table - self.config = lookup_table.config - self.start_epoch = self.config.start_epoch - self.temp = self.config.init_temperature - self.exp_anneal_rate = self.config.exp_anneal_rate - self.mode = self.config.mode - - self.load_ckpt = load_ckpt - self.arch_path = arch_path - self.logger = logger - - # scheduler of learning rate - self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - model_optim, T_max=n_epochs, last_epoch=-1 - ) - - # init mutator - self.mutator = FBNetMutator(model, lookup_table) - self.mutator.set_temperature(self.temp) - - # DataParallel should be put behind the init of mutator - self.model = torch.nn.DataParallel(self.model, device_ids=device_ids) - self.model.to(device) - - # build architecture optimizer - self.arch_optimizer = torch.optim.AdamW( - self.mutator.get_architecture_parameters(), - self.config.nas_lr, - weight_decay=self.config.nas_weight_decay, - ) - self.reg_loss = RegularizerLoss(config=self.config) - - self.criterion = criterion - self.epoch = 0 - - def _layer_choice_sample(self): - """ - Sample the index of network within layer choice - """ - stages = [stage_name for stage_name in self.lookup_table.layer_num] - stage_lnum = [self.lookup_table.layer_num[stage] for stage in stages] - - # get the choice idx in each layer - choice_ids = list() - layer_id = 0 - for param in self.mutator.get_architecture_parameters(): - param_np = param.cpu().detach().numpy() - op_idx = np.argmax(param_np) - choice_ids.append(op_idx) - self.logger.info( - "layer {}: {}, index: {}".format(layer_id, param_np, op_idx) - ) - layer_id += 1 - - # get the arch_sample - choice_names = list() - layer_id = 0 - for i, stage_name in enumerate(stages): - ops_names = [op for op in self.lookup_table.lut_ops[stage_name]] - for _ in range(stage_lnum[i]): - searched_op = ops_names[choice_ids[layer_id]] - 
choice_names.append(searched_op) - layer_id += 1 - - self.logger.info(choice_names) - return choice_names - - def _get_perf_cost(self, requires_grad=True): - """ - Get the accumulated performance cost. - """ - perf_cost = Variable( - torch.zeros(1), requires_grad=requires_grad - ).to(self.device, non_blocking=True) - - for latency in self.mutator.get_weighted_latency(): - perf_cost = perf_cost + latency - - return perf_cost - - def _validate(self): - """ - Do validation. During validation, LayerChoices use the mixed-op. - - Returns - ------- - float, float, float - average loss, average top1 accuracy, average top5 accuracy - """ - self.valid_loader.batch_sampler.drop_last = False - batch_time = AverageMeter("batch_time") - losses = AverageMeter("losses") - top1 = AverageMeter("top1") - top5 = AverageMeter("top5") - - # test on validation set under eval mode - self.model.eval() - - end = time.time() - with torch.no_grad(): - for i, (images, labels) in enumerate(self.valid_loader): - images = images.to(self.device, non_blocking=True) - labels = labels.to(self.device, non_blocking=True) - - output = self.model(images) - - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == len(self.valid_loader): - test_log = ( - "Valid" + ": [{0}/{1}]\t" - "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" - "Loss {loss.val:.4f} ({loss.avg:.4f})\t" - "Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t" - "Top-5 acc {top5.val:.3f} ({top5.avg:.3f})".format( - i, - len(self.valid_loader) - 1, - batch_time=batch_time, - loss=losses, - top1=top1, - top5=top5, - ) - ) - self.logger.info(test_log) - - return losses.avg, top1.avg, top5.avg - - def _train_epoch(self, epoch, optimizer, arch_train=False): - """ - Train one epoch. 
- """ - batch_time = AverageMeter("batch_time") - data_time = AverageMeter("data_time") - losses = AverageMeter("losses") - top1 = AverageMeter("top1") - top5 = AverageMeter("top5") - - # switch to train mode - self.model.train() - - data_loader = self.valid_loader if arch_train else self.train_loader - end = time.time() - for i, (images, labels) in enumerate(data_loader): - data_time.update(time.time() - end) - images = images.to(self.device, non_blocking=True) - labels = labels.to(self.device, non_blocking=True) - - output = self.model(images) - loss = self.criterion(output, labels) - - # hardware-aware loss - perf_cost = self._get_perf_cost(requires_grad=True) - regu_loss = self.reg_loss(perf_cost) - if self.mode.startswith("mul"): - loss = loss * regu_loss - elif self.mode.startswith("add"): - loss = loss + regu_loss - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0].item(), images.size(0)) - top5.update(acc5[0].item(), images.size(0)) - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0: - batch_log = ( - "Warmup Train [{0}][{1}]\t" - "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" - "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" - "Loss {losses.val:.4f} ({losses.avg:.4f})\t" - "Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t" - "Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\t".format( - epoch + 1, - i, - batch_time=batch_time, - data_time=data_time, - losses=losses, - top1=top1, - top5=top5, - ) - ) - self.logger.info(batch_log) - - def _warm_up(self): - """ - Warm up the model, while the architecture weights are not trained. - """ - for epoch in range(self.epoch, self.start_epoch): - self.logger.info("\n--------Warmup epoch: %d--------\n", epoch + 1) - self._train_epoch(epoch, self.model_optim) - # adjust learning rate - self.scheduler.step() - - # validation - val_loss, val_top1, val_top5 = self._validate() - val_log = ( - "Warmup Valid [{0}/{1}]\t" - "loss {2:.3f}\ttop-1 acc {3:.3f}\ttop-5 acc {4:.3f}".format( - epoch + 1, self.warmup_epochs, val_loss, val_top1, val_top5 - ) - ) - self.logger.info(val_log) - - if epoch % 10 == 0: - filename = os.path.join( - self.config.model_dir, "checkpoint_%s.pth" % epoch - ) - self.save_checkpoint(epoch, filename) - - def _train(self): - """ - Train the model, it trains model weights and architecute weights. - Architecture weights are trained according to the schedule. - Before updating architecture weights, ```requires_grad``` is enabled. - Then, it is disabled after the updating, in order not to update - architecture weights when training model weights. 
- """ - arch_param_num = self.mutator.num_arch_params() - self.logger.info("#arch_params: {}".format(arch_param_num)) - self.epoch = max(self.start_epoch, self.epoch) - - ckpt_path = self.config.model_dir - choice_names = None - top1_best = 0.0 - - for epoch in range(self.epoch, self.n_epochs): - self.logger.info("\n--------Train epoch: %d--------\n", epoch + 1) - # update the weight parameters - self._train_epoch(epoch, self.model_optim) - # adjust learning rate - self.scheduler.step() - - self.logger.info("Update architecture parameters") - # update the architecture parameters - self.mutator.arch_requires_grad() - self._train_epoch(epoch, self.arch_optimizer, True) - self.mutator.arch_disable_grad() - # temperature annealing - self.temp = self.temp * self.exp_anneal_rate - self.mutator.set_temperature(self.temp) - # sample the architecture of sub-network - choice_names = self._layer_choice_sample() - - # validate - val_loss, val_top1, val_top5 = self._validate() - val_log = ( - "Valid [{0}]\t" - "loss {1:.3f}\ttop-1 acc {2:.3f} \ttop-5 acc {3:.3f}".format( - epoch + 1, val_loss, val_top1, val_top5 - ) - ) - self.logger.info(val_log) - - if epoch % 10 == 0: - filename = os.path.join(ckpt_path, "checkpoint_%s.pth" % epoch) - self.save_checkpoint(epoch, filename, choice_names) - - val_top1 = val_top1.cpu().as_numpy() - if val_top1 > top1_best: - filename = os.path.join(ckpt_path, "checkpoint_best.pth") - self.save_checkpoint(epoch, filename, choice_names) - top1_best = val_top1 - - def save_checkpoint(self, epoch, filename, choice_names=None): - """ - Save checkpoint of the whole model. - Saving model weights and architecture weights as ```filename```, - and saving currently chosen architecture in ```arch_path```. - """ - state = { - "model": self.model.state_dict(), - "optim": self.model_optim.state_dict(), - "epoch": epoch, - "arch_sample": choice_names, - } - torch.save(state, filename) - self.logger.info("Save checkpoint to {0:}".format(filename)) - - if self.arch_path: - self.export(self.arch_path) - - def load_checkpoint(self, filename): - """ - Load the checkpoint from ```ckpt_path```. - """ - ckpt = torch.load(filename) - self.epoch = ckpt["epoch"] - self.model.load_state_dict(ckpt["model"]) - self.model_optim.load_state_dict(ckpt["optim"]) - - def train(self): - """ - Train the whole model. - """ - if self.load_ckpt: - ckpt_path = self.config.model_dir - filename = os.path.join(ckpt_path, "checkpoint_best.pth") - if os.path.exists(filename): - self.load_checkpoint(filename) - - if self.epoch < self.start_epoch: - self._warm_up() - self._train() - - def export(self, file_name): - """ - Export the chosen architecture into a file - - Parameters - ---------- - file_name : str - the file that stores exported chosen architecture - """ - exported_arch = self.mutator.sample_final() - with open(file_name, "w") as f: - json.dump( - exported_arch, - f, - indent=2, - sort_keys=True, - cls=TorchTensorEncoder, - ) - - def validate(self): - raise NotImplementedError - - def checkpoint(self): - raise NotImplementedError diff --git a/nni/algorithms/nas/pytorch/fbnet/utils.py b/nni/algorithms/nas/pytorch/fbnet/utils.py deleted file mode 100644 index 77e71746b..000000000 --- a/nni/algorithms/nas/pytorch/fbnet/utils.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
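The deleted `_train_epoch`/`_train` methods combine the task loss with a latency regularizer, multiplicatively for mode ``mul`` and additively for ``add``, and anneal the Gumbel-Softmax temperature by `exp_anneal_rate` after every search epoch. A rough sketch of just that arithmetic, assuming the default `alpha=0.25`, `beta=0.6` and `init_temperature=5.0` from `NASConfig` below and made-up loss/latency values (the batch-size normalization defaults to 1 and is omitted):

import math
import torch

alpha, beta = 0.25, 0.6
task_loss = torch.tensor(1.3)        # e.g. cross-entropy on the current batch
perf_cost = torch.tensor(12.5)       # accumulated expected latency or FLOPs

regu_mul = alpha * torch.log(perf_cost) ** beta   # mode "mul": total = task_loss * regu_mul
regu_add = alpha * perf_cost ** beta              # mode "add": total = task_loss + regu_add
total_mul = task_loss * regu_mul
total_add = task_loss + regu_add

# Temperature annealing applied once per search epoch in _train:
temperature = 5.0                    # init_temperature
exp_anneal_rate = math.exp(-0.045)   # NASConfig default
temperature *= exp_anneal_rate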
- -from __future__ import absolute_import, division, print_function - -import ast -import os -import timeit -import torch - -import numpy as np -import torch.nn as nn - -from nni.compression.pytorch.utils import count_flops_params - -LUT_FILE = "lut.npy" -LUT_JSON_FILE = "lut.txt" -LUT_PATH = "lut" - -DATA_TYPE = "float" - -class NASConfig: - def __init__( - self, - perf_metric="flops", - lut_load=False, - lut_load_format="json", - model_dir=None, - nas_lr=0.01, - nas_weight_decay=5e-4, - mode="mul", - alpha=0.25, - beta=0.6, - start_epoch=50, - init_temperature=5.0, - exp_anneal_rate=np.exp(-0.045), - search_space=None, - ): - # LUT of performance metric - # flops means the multiplies, latency means the time cost on platform - self.perf_metric = perf_metric - assert perf_metric in [ - "flops", - "latency", - ], "perf_metric should be ['flops', 'latency']" - # wether load or create lut file - self.lut_load = lut_load - - assert lut_load_format in [ - "json", - "numpy", - ], "lut_load_format should be ['json', 'numpy']" - self.lut_load_format = lut_load_format - - # necessary dirs - self.lut_en = model_dir is not None - if self.lut_en: - self.model_dir = model_dir - os.makedirs(model_dir, exist_ok=True) - self.lut_path = os.path.join(model_dir, LUT_PATH) - os.makedirs(self.lut_path, exist_ok=True) - # NAS learning setting - self.nas_lr = nas_lr - self.nas_weight_decay = nas_weight_decay - # hardware-aware loss setting - self.mode = mode - assert mode in ["mul", "add"], "mode should be ['mul', 'add']" - self.alpha = alpha - self.beta = beta - # NAS training setting - self.start_epoch = start_epoch - self.init_temperature = init_temperature - self.exp_anneal_rate = exp_anneal_rate - # definition of search blocks and space - self.search_space = search_space - - -class RegularizerLoss(nn.Module): - """Auxilliary loss for hardware-aware NAS.""" - - def __init__(self, config): - """ - Parameters - ---------- - config : class - to manage the configuration for NAS training, and search space etc. - """ - super(RegularizerLoss, self).__init__() - self.mode = config.mode - self.alpha = config.alpha - self.beta = config.beta - - def forward(self, perf_cost, batch_size=1): - """ - Parameters - ---------- - perf_cost : tensor - the accumulated performance cost - batch_size : int - batch size for normalization - - Returns - ------- - output: tensor - the hardware-aware constraint loss - """ - if self.mode == "mul": - log_loss = torch.log(perf_cost / batch_size) ** self.beta - return self.alpha * log_loss - elif self.mode == "add": - linear_loss = (perf_cost / batch_size) ** self.beta - return self.alpha * linear_loss - else: - raise NotImplementedError - - -def accuracy(output, target, topk=(1,)): - """ - Computes the precision@k for the specified values of k - - Parameters - ---------- - output : pytorch tensor - output, e.g., predicted value - target : pytorch tensor - label - topk : tuple - specify top1 and top5 - - Returns - ------- - list - accuracy of top1 and top5 - """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def supernet_sample(model, state_dict, sampled_arch=[], lookup_table=None): - """ - Initialize the searched sub-model from supernet. 
- - Parameters - ---------- - model : pytorch model - the created subnet - state_dict : checkpoint - the checkpoint of supernet, including the pre-trained params - sampled_arch : list of str - the searched layer names of the subnet - lookup_table : class - to manage the candidate ops, layer information and layer performance - """ - replace = list() - stages = [stage for stage in lookup_table.layer_num] - stage_lnum = [lookup_table.layer_num[stage] for stage in stages] - - if sampled_arch: - layer_id = 0 - for i, stage in enumerate(stages): - ops_names = [op_name for op_name in lookup_table.lut_ops[stage]] - for _ in range(stage_lnum[i]): - searched_op = sampled_arch[layer_id] - op_i = ops_names.index(searched_op) - replace.append( - [ - "blocks.{}.".format(layer_id), - "blocks.{}.op.".format(layer_id), - "blocks.{}.{}.".format(layer_id, op_i), - ] - ) - layer_id += 1 - model_init(model, state_dict, replace=replace) - - -def model_init(model, state_dict, replace=[]): - """Initialize the model from state_dict.""" - prefix = "module." - param_dict = dict() - for k, v in state_dict.items(): - if k.startswith(prefix): - k = k[7:] - param_dict[k] = v - - for k, (name, m) in enumerate(model.named_modules()): - if replace: - for layer_replace in replace: - assert len(layer_replace) == 3, "The elements should be three." - pre_scope, key, replace_key = layer_replace - if pre_scope in name: - name = name.replace(key, replace_key) - - # Copy the state_dict to current model - if (name + ".weight" in param_dict) or ( - name + ".running_mean" in param_dict - ): - if isinstance(m, nn.BatchNorm2d): - shape = m.running_mean.shape - if shape == param_dict[name + ".running_mean"].shape: - if m.weight is not None: - m.weight.data = param_dict[name + ".weight"] - m.bias.data = param_dict[name + ".bias"] - m.running_mean = param_dict[name + ".running_mean"] - m.running_var = param_dict[name + ".running_var"] - - elif isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - shape = m.weight.data.shape - if shape == param_dict[name + ".weight"].shape: - m.weight.data = param_dict[name + ".weight"] - if m.bias is not None: - m.bias.data = param_dict[name + ".bias"] - - elif isinstance(m, nn.ConvTranspose2d): - m.weight.data = param_dict[name + ".weight"] - if m.bias is not None: - m.bias.data = param_dict[name + ".bias"] - - -class LookUpTable: - """Build look-up table for NAS.""" - - def __init__(self, config, primitives): - """ - Parameters - ---------- - config : class - to manage the configuration for NAS training, and search space etc. 
- """ - self.config = config - # definition of search blocks and space - self.search_space = config.search_space - # layers for NAS - self.cnt_layers = len(self.search_space["input_shape"]) - # constructors for each operation - self.lut_ops = { - stage_name: { - op_name: primitives[op_name] - for op_name in self.search_space["stages"][stage_name]["ops"] - } - for stage_name in self.search_space["stages"] - } - self.layer_num = { - stage_name: self.search_space["stages"][stage_name]["layer_num"] - for stage_name in self.search_space["stages"] - } - - # arguments for the ops constructors, input_shapes just for convinience - self.layer_configs, self.layer_in_shapes = self._layer_configs() - - # lookup_table - self.perf_metric = config.perf_metric - - if config.lut_en: - self.lut_perf = None - self.lut_file = os.path.join(config.lut_path, LUT_FILE) - self.lut_json_file = LUT_JSON_FILE - if config.lut_load: - if config.lut_load_format == "numpy": - # Load data from numpy file - self._load_from_file() - else: - # Load data from json file - self._load_from_json_file() - else: - self._create_perfs() - - def _layer_configs(self): - """Generate basic params for different layers.""" - # layer_configs are : c_in, c_out, stride, fm_size - layer_configs = [ - [ - self.search_space["input_shape"][layer_id][0], - self.search_space["channel_size"][layer_id], - self.search_space["strides"][layer_id], - self.search_space["fm_size"][layer_id], - ] - for layer_id in range(self.cnt_layers) - ] - - # layer_in_shapes are (C_in, input_w, input_h) - layer_in_shapes = self.search_space["input_shape"] - - return layer_configs, layer_in_shapes - - def _create_perfs(self, cnt_of_runs=200): - """Create performance cost for each op.""" - if self.perf_metric == "latency": - self.lut_perf = self._calculate_latency(cnt_of_runs) - elif self.perf_metric == "flops": - self.lut_perf = self._calculate_flops() - - self._write_lut_to_file() - - def _calculate_flops(self, eps=0.001): - """FLOPs cost.""" - flops_lut = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - key_params = {"fm_size": layer_config[3]} - op = stage_ops[op_name](*layer_config[0:3], **key_params) - - # measured in Flops - in_shape = self.layer_in_shapes[layer_id] - x = (1, in_shape[0], in_shape[1], in_shape[2]) - flops, _, _ = count_flops_params(op, x, verbose=False) - flops = eps if flops == 0.0 else flops - flops_lut[layer_id][op_name] = float(flops) - layer_id += 1 - - return flops_lut - - def _calculate_latency(self, cnt_of_runs): - """Latency cost.""" - LATENCY_BATCH_SIZE = 1 - latency_lut = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - key_params = {"fm_size": layer_config[3]} - op = stage_ops[op_name](*layer_config[0:3], **key_params) - input_data = torch.randn( - (LATENCY_BATCH_SIZE, *self.layer_in_shapes[layer_id]) - ) - globals()["op"], globals()["input_data"] = op, input_data - total_time = timeit.timeit( - "output = op(input_data)", - setup="gc.enable()", - globals=globals(), - number=cnt_of_runs, - ) - # measured in micro-second - latency_lut[layer_id][op_name] = ( - total_time / cnt_of_runs / LATENCY_BATCH_SIZE 
* 1e6 - ) - layer_id += 1 - - return latency_lut - - def _write_lut_to_file(self): - """Save lut as numpy file.""" - np.save(self.lut_file, self.lut_perf) - - def _load_from_file(self): - """Load numpy file.""" - self.lut_perf = np.load(self.lut_file, allow_pickle=True) - - def _load_from_json_file(self): - """Load json file.""" - - """ - lut_json_file ('lut.txt') format: - {'op_name': operator_name, - 'op_data_shape': (input_w, input_h, C_in, C_out, stride), - 'op_dtype': data_type, - 'op_latency': latency} - {...} - {...} - """ - latency_file = open(self.lut_json_file, "r") - ops_latency = latency_file.readlines() - - """ops_lut: {'op_name': {'op_data_shape': {'op_dtype': latency}}}""" - ops_lut = {} - - for op_latency in ops_latency: - assert isinstance(op_latency, str) or isinstance(op_latency, dict) - - if isinstance(op_latency, str): - record = ast.literal_eval(op_latency) - elif isinstance(op_latency, dict): - record = op_latency - - op_name = record["op_name"] - """op_data_shape: (input_w, input_h, C_in, C_out, stride)""" - op_data_shape = record["op_data_shape"] - op_dtype = record["op_dtype"] - op_latency = record["op_latency"] - - if op_name not in ops_lut: - ops_lut[op_name] = {} - - if op_data_shape not in ops_lut[op_name]: - ops_lut[op_name][op_data_shape] = {} - - ops_lut[op_name][op_data_shape][op_dtype] = op_latency - - self.lut_perf = [{} for i in range(self.cnt_layers)] - layer_id = 0 - - for stage_name in self.lut_ops: - stage_ops = self.lut_ops[stage_name] - ops_num = self.layer_num[stage_name] - - for _ in range(ops_num): - for op_name in stage_ops: - layer_config = self.layer_configs[layer_id] - layer_in_shape = self.layer_in_shapes[layer_id] - - input_w = layer_in_shape[1] - input_h = layer_in_shape[2] - c_in = layer_config[0] - c_out = layer_config[1] - stride = layer_config[2] - op_data_shape = (input_w, input_h, c_in, c_out, stride) - - if op_name in ops_lut and op_data_shape in ops_lut[op_name]: - self.lut_perf[layer_id][op_name] = \ - ops_lut[op_name][op_data_shape][DATA_TYPE] - - layer_id += 1 diff --git a/nni/algorithms/nas/pytorch/pdarts/__init__.py b/nni/algorithms/nas/pytorch/pdarts/__init__.py deleted file mode 100644 index d1d17764b..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .trainer import PdartsTrainer diff --git a/nni/algorithms/nas/pytorch/pdarts/mutator.py b/nni/algorithms/nas/pytorch/pdarts/mutator.py deleted file mode 100644 index 09ad51c5e..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/mutator.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import copy - -import numpy as np -import torch -from torch import nn - -from nni.algorithms.nas.pytorch.darts import DartsMutator -from nni.nas.pytorch.mutables import LayerChoice - - -class PdartsMutator(DartsMutator): - """ - It works with PdartsTrainer to calculate ops weights, - and drop weights in different PDARTS epochs. - """ - - def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): - self.pdarts_epoch_index = pdarts_epoch_index - self.pdarts_num_to_drop = pdarts_num_to_drop - if switches is None: - self.switches = {} - else: - self.switches = switches - - super(PdartsMutator, self).__init__(model) - - # this loop go through mutables with different keys, - # it's mainly to update length of choices. 
- for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - - switches = self.switches.get(mutable.key, [True for j in range(len(mutable))]) - choices = self.choices[mutable.key] - - operations_count = np.sum(switches) - # +1 and -1 are caused by zero operation in darts network - # the zero operation is not in choices list in network, but its weight are in, - # so it needs one more weights and switch for zero. - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(operations_count + 1)) - self.switches[mutable.key] = switches - - # update LayerChoice instances in model, - # it's physically remove dropped choices operations. - for module in self.model.modules(): - if isinstance(module, LayerChoice): - switches = self.switches.get(module.key) - choices = self.choices[module.key] - if len(module) > len(choices): - # from last to first, so that it won't effect previous indexes after removed one. - for index in range(len(switches)-1, -1, -1): - if switches[index] == False: - del module[index] - assert len(module) <= len(choices), "Failed to remove dropped choices." - - def export(self): - # Cannot rely on super().export() because P-DARTS has deleted some of the choices and has misaligned length. - results = super().sample_final() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - # As some operations are dropped physically, - # so it needs to fill back false to track dropped operations. - trained_result = results[mutable.key] - trained_index = 0 - switches = self.switches[mutable.key] - result = torch.Tensor(switches).bool() - for index in range(len(result)): - if result[index]: - result[index] = trained_result[trained_index] - trained_index += 1 - results[mutable.key] = result - return results - - def drop_paths(self): - """ - This method is called when a PDARTS epoch is finished. - It prepares switches for next epoch. - candidate operations with False switch will be doppped in next epoch. - """ - all_switches = copy.deepcopy(self.switches) - for key in all_switches: - switches = all_switches[key] - idxs = [] - for j in range(len(switches)): - if switches[j]: - idxs.append(j) - sorted_weights = self.choices[key].data.cpu().numpy()[:-1] - drop = np.argsort(sorted_weights)[:self.pdarts_num_to_drop[self.pdarts_epoch_index]] - for idx in drop: - switches[idxs[idx]] = False - return all_switches diff --git a/nni/algorithms/nas/pytorch/pdarts/trainer.py b/nni/algorithms/nas/pytorch/pdarts/trainer.py deleted file mode 100644 index 7f23a6e22..000000000 --- a/nni/algorithms/nas/pytorch/pdarts/trainer.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging - -from nni.nas.pytorch.callbacks import LRSchedulerCallback -from nni.algorithms.nas.pytorch.darts import DartsTrainer -from nni.nas.pytorch.trainer import BaseTrainer, TorchTensorEncoder - -from .mutator import PdartsMutator - -logger = logging.getLogger(__name__) - - -class PdartsTrainer(BaseTrainer): - """ - This trainer implements the PDARTS algorithm. - PDARTS bases on DARTS algorithm, and provides a network growth approach to find deeper and better network. - This class relies on pdarts_num_layers and pdarts_num_to_drop parameters to control how network grows. - pdarts_num_layers means how many layers more than first epoch. - pdarts_num_to_drop means how many candidate operations should be dropped in each epoch. - So that the grew network can in similar size. 
- """ - - def __init__(self, model_creator, init_layers, metrics, - num_epochs, dataset_train, dataset_valid, - pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 1], - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None, unrolled=False): - super(PdartsTrainer, self).__init__() - self.model_creator = model_creator - self.init_layers = init_layers - self.pdarts_num_layers = pdarts_num_layers - self.pdarts_num_to_drop = pdarts_num_to_drop - self.pdarts_epoch = len(pdarts_num_to_drop) - self.darts_parameters = { - "metrics": metrics, - "num_epochs": num_epochs, - "dataset_train": dataset_train, - "dataset_valid": dataset_valid, - "batch_size": batch_size, - "workers": workers, - "device": device, - "log_frequency": log_frequency, - "unrolled": unrolled - } - self.callbacks = callbacks if callbacks is not None else [] - - def train(self): - - switches = None - for epoch in range(self.pdarts_epoch): - - layers = self.init_layers+self.pdarts_num_layers[epoch] - model, criterion, optim, lr_scheduler = self.model_creator(layers) - self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) - - for callback in self.callbacks: - callback.build(model, self.mutator, self) - callback.on_epoch_begin(epoch) - - darts_callbacks = [] - if lr_scheduler is not None: - darts_callbacks.append(LRSchedulerCallback(lr_scheduler)) - - self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, - callbacks=darts_callbacks, **self.darts_parameters) - logger.info("start pdarts training epoch %s...", epoch) - - self.trainer.train() - - switches = self.mutator.drop_paths() - - for callback in self.callbacks: - callback.on_epoch_end(epoch) - - def validate(self): - self.trainer.validate() - - def export(self, file): - mutator_export = self.mutator.export() - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def checkpoint(self): - raise NotImplementedError("Not implemented yet") diff --git a/nni/algorithms/nas/pytorch/proxylessnas/__init__.py b/nni/algorithms/nas/pytorch/proxylessnas/__init__.py deleted file mode 100644 index 3188fbf45..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import ProxylessNasMutator -from .trainer import ProxylessNasTrainer diff --git a/nni/algorithms/nas/pytorch/proxylessnas/mutator.py b/nni/algorithms/nas/pytorch/proxylessnas/mutator.py deleted file mode 100644 index 881a6b440..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/mutator.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
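The `drop_paths` method of the PdartsMutator above discards, per LayerChoice, the candidate ops with the smallest architecture weights before the next, deeper P-DARTS stage; the trailing weight belongs to the zero op and is never dropped. A small NumPy sketch of that selection step, with made-up weights:

import numpy as np

switches = [True] * 8                 # 8 candidate ops still alive for one LayerChoice
arch_weights = np.random.randn(9)     # 8 op weights plus a trailing weight for the zero op
num_to_drop = 3                       # first P-DARTS stage drops 3 ops per choice

alive = [i for i, s in enumerate(switches) if s]
weights_alive = arch_weights[:-1]     # exclude the zero-op weight from dropping
drop = np.argsort(weights_alive)[:num_to_drop]
for idx in drop:
    switches[alive[idx]] = False      # dropped ops are removed before the next stage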
- -import math -import torch -from torch import nn as nn -from torch.nn import functional as F -import numpy as np - -from nni.nas.pytorch.base_mutator import BaseMutator -from nni.nas.pytorch.mutables import LayerChoice -from .utils import detach_variable - -class ArchGradientFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, x, binary_gates, run_func, backward_func): - ctx.run_func = run_func - ctx.backward_func = backward_func - - detached_x = detach_variable(x) - with torch.enable_grad(): - output = run_func(detached_x) - ctx.save_for_backward(detached_x, output) - return output.data - - @staticmethod - def backward(ctx, grad_output): - detached_x, output = ctx.saved_tensors - - grad_x = torch.autograd.grad(output, detached_x, grad_output, only_inputs=True) - # compute gradients w.r.t. binary_gates - binary_grads = ctx.backward_func(detached_x.data, output.data, grad_output.data) - - return grad_x[0], binary_grads, None, None - -class MixedOp(nn.Module): - """ - This class is to instantiate and manage info of one LayerChoice. - It includes architecture weights, binary weights, and member functions - operating the weights. - - forward_mode: - forward/backward mode for LayerChoice: None, two, full, and full_v2. - For training architecture weights, we use full_v2 by default, and for training - model weights, we use None. - """ - forward_mode = None - def __init__(self, mutable): - """ - Parameters - ---------- - mutable : LayerChoice - A LayerChoice in user model - """ - super(MixedOp, self).__init__() - self.ap_path_alpha = nn.Parameter(torch.Tensor(len(mutable))) - self.ap_path_wb = nn.Parameter(torch.Tensor(len(mutable))) - self.ap_path_alpha.requires_grad = False - self.ap_path_wb.requires_grad = False - self.active_index = [0] - self.inactive_index = None - self.log_prob = None - self.current_prob_over_ops = None - self.n_choices = len(mutable) - - def get_ap_path_alpha(self): - return self.ap_path_alpha - - def to_requires_grad(self): - self.ap_path_alpha.requires_grad = True - self.ap_path_wb.requires_grad = True - - def to_disable_grad(self): - self.ap_path_alpha.requires_grad = False - self.ap_path_wb.requires_grad = False - - def forward(self, mutable, x): - """ - Define forward of LayerChoice. For 'full_v2', backward is also defined. - The 'two' mode is explained in section 3.2.1 in the paper. - The 'full_v2' mode is explained in Appendix D in the paper. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - x : tensor - inputs of this layer, only support one input - - Returns - ------- - output: tensor - output of this layer - """ - if MixedOp.forward_mode == 'full' or MixedOp.forward_mode == 'two': - output = 0 - for _i in self.active_index: - oi = self.candidate_ops[_i](x) - output = output + self.ap_path_wb[_i] * oi - for _i in self.inactive_index: - oi = self.candidate_ops[_i](x) - output = output + self.ap_path_wb[_i] * oi.detach() - elif MixedOp.forward_mode == 'full_v2': - def run_function(key, candidate_ops, active_id): - def forward(_x): - return candidate_ops[active_id](_x) - return forward - - def backward_function(key, candidate_ops, active_id, binary_gates): - def backward(_x, _output, grad_output): - binary_grads = torch.zeros_like(binary_gates.data) - with torch.no_grad(): - for k in range(len(candidate_ops)): - if k != active_id: - out_k = candidate_ops[k](_x.data) - else: - out_k = _output.data - grad_k = torch.sum(out_k * grad_output) - binary_grads[k] = grad_k - return binary_grads - return backward - output = ArchGradientFunction.apply( - x, self.ap_path_wb, run_function(mutable.key, list(mutable), self.active_index[0]), - backward_function(mutable.key, list(mutable), self.active_index[0], self.ap_path_wb)) - else: - output = self.active_op(mutable)(x) - return output - - @property - def probs_over_ops(self): - """ - Apply softmax on alpha to generate probability distribution - - Returns - ------- - pytorch tensor - probability distribution - """ - probs = F.softmax(self.ap_path_alpha, dim=0) # softmax to probability - return probs - - @property - def chosen_index(self): - """ - choose the op with max prob - - Returns - ------- - int - index of the chosen one - numpy.float32 - prob of the chosen one - """ - probs = self.probs_over_ops.data.cpu().numpy() - index = int(np.argmax(probs)) - return index, probs[index] - - def active_op(self, mutable): - """ - assume only one path is active - - Returns - ------- - PyTorch module - the chosen operation - """ - return mutable[self.active_index[0]] - - @property - def active_op_index(self): - """ - return active op's index, the active op is sampled - - Returns - ------- - int - index of the active op - """ - return self.active_index[0] - - def set_chosen_op_active(self): - """ - set chosen index, active and inactive indexes - """ - chosen_idx, _ = self.chosen_index - self.active_index = [chosen_idx] - self.inactive_index = [_i for _i in range(0, chosen_idx)] + \ - [_i for _i in range(chosen_idx + 1, self.n_choices)] - - def binarize(self, mutable): - """ - Sample based on alpha, and set binary weights accordingly. - ap_path_wb is set in this function, which is called binarize. 
- - Parameters - ---------- - mutable : LayerChoice - this layer's mutable - """ - self.log_prob = None - # reset binary gates - self.ap_path_wb.data.zero_() - probs = self.probs_over_ops - if MixedOp.forward_mode == 'two': - # sample two ops according to probs - sample_op = torch.multinomial(probs.data, 2, replacement=False) - probs_slice = F.softmax(torch.stack([ - self.ap_path_alpha[idx] for idx in sample_op - ]), dim=0) - self.current_prob_over_ops = torch.zeros_like(probs) - for i, idx in enumerate(sample_op): - self.current_prob_over_ops[idx] = probs_slice[i] - # choose one to be active and the other to be inactive according to probs_slice - c = torch.multinomial(probs_slice.data, 1)[0] # 0 or 1 - active_op = sample_op[c].item() - inactive_op = sample_op[1-c].item() - self.active_index = [active_op] - self.inactive_index = [inactive_op] - # set binary gate - self.ap_path_wb.data[active_op] = 1.0 - else: - sample = torch.multinomial(probs, 1)[0].item() - self.active_index = [sample] - self.inactive_index = [_i for _i in range(0, sample)] + \ - [_i for _i in range(sample + 1, len(mutable))] - self.log_prob = torch.log(probs[sample]) - self.current_prob_over_ops = probs - self.ap_path_wb.data[sample] = 1.0 - # avoid over-regularization - for choice in mutable: - for _, param in choice.named_parameters(): - param.grad = None - - @staticmethod - def delta_ij(i, j): - if i == j: - return 1 - else: - return 0 - - def set_arch_param_grad(self, mutable): - """ - Calculate alpha gradient for this LayerChoice. - It is calculated using gradient of binary gate, probs of ops. - """ - binary_grads = self.ap_path_wb.grad.data - if self.active_op(mutable).is_zero_layer(): - self.ap_path_alpha.grad = None - return - if self.ap_path_alpha.grad is None: - self.ap_path_alpha.grad = torch.zeros_like(self.ap_path_alpha.data) - if MixedOp.forward_mode == 'two': - involved_idx = self.active_index + self.inactive_index - probs_slice = F.softmax(torch.stack([ - self.ap_path_alpha[idx] for idx in involved_idx - ]), dim=0).data - for i in range(2): - for j in range(2): - origin_i = involved_idx[i] - origin_j = involved_idx[j] - self.ap_path_alpha.grad.data[origin_i] += \ - binary_grads[origin_j] * probs_slice[j] * (MixedOp.delta_ij(i, j) - probs_slice[i]) - for _i, idx in enumerate(self.active_index): - self.active_index[_i] = (idx, self.ap_path_alpha.data[idx].item()) - for _i, idx in enumerate(self.inactive_index): - self.inactive_index[_i] = (idx, self.ap_path_alpha.data[idx].item()) - else: - probs = self.probs_over_ops.data - for i in range(self.n_choices): - for j in range(self.n_choices): - self.ap_path_alpha.grad.data[i] += binary_grads[j] * probs[j] * (MixedOp.delta_ij(i, j) - probs[i]) - return - - def rescale_updated_arch_param(self): - """ - rescale architecture weights for the 'two' mode. - """ - if not isinstance(self.active_index[0], tuple): - assert self.active_op.is_zero_layer() - return - involved_idx = [idx for idx, _ in (self.active_index + self.inactive_index)] - old_alphas = [alpha for _, alpha in (self.active_index + self.inactive_index)] - new_alphas = [self.ap_path_alpha.data[idx] for idx in involved_idx] - - offset = math.log( - sum([math.exp(alpha) for alpha in new_alphas]) / sum([math.exp(alpha) for alpha in old_alphas]) - ) - - for idx in involved_idx: - self.ap_path_alpha.data[idx] -= offset - - -class ProxylessNasMutator(BaseMutator): - """ - This mutator initializes and operates all the LayerChoices of the input model. 
- It is for the corresponding trainer to control the training process of LayerChoices, - coordinating with whole training process. - """ - def __init__(self, model): - """ - Init a MixedOp instance for each mutable i.e., LayerChoice. - And register the instantiated MixedOp in corresponding LayerChoice. - If does not register it in LayerChoice, DataParallel does not work then, - because architecture weights are not included in the DataParallel model. - When MixedOPs are registered, we use ```requires_grad``` to control - whether calculate gradients of architecture weights. - - Parameters - ---------- - model : pytorch model - The model that users want to tune, it includes search space defined with nni nas apis - """ - super(ProxylessNasMutator, self).__init__(model) - self._unused_modules = None - self.mutable_list = [] - for mutable in self.undedup_mutables: - self.mutable_list.append(mutable) - mutable.registered_module = MixedOp(mutable) - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callback of layer choice forward. This function defines the forward - logic of the input mutable. So mutable is only interface, its real - implementation is defined in mutator. - - Parameters - ---------- - mutable: LayerChoice - forward logic of this input mutable - args: list of torch.Tensor - inputs of this mutable - kwargs: dict - inputs of this mutable - - Returns - ------- - torch.Tensor - output of this mutable, i.e., LayerChoice - int - index of the chosen op - """ - # FIXME: return mask, to be consistent with other algorithms - idx = mutable.registered_module.active_op_index - return mutable.registered_module(mutable, *args, **kwargs), idx - - def reset_binary_gates(self): - """ - For each LayerChoice, binarize binary weights - based on alpha to only activate one op. - It traverses all the mutables in the model to do this. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.binarize(mutable) - - def set_chosen_op_active(self): - """ - For each LayerChoice, set the op with highest alpha as the chosen op. - Usually used for validation. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_chosen_op_active() - - def num_arch_params(self): - """ - The number of mutables, i.e., LayerChoice - - Returns - ------- - int - the number of LayerChoice in user model - """ - return len(self.mutable_list) - - def set_arch_param_grad(self): - """ - For each LayerChoice, calculate gradients for architecture weights, i.e., alpha - """ - for mutable in self.undedup_mutables: - mutable.registered_module.set_arch_param_grad(mutable) - - def get_architecture_parameters(self): - """ - Get all the architecture parameters. - - yield - ----- - PyTorch Parameter - Return ap_path_alpha of the traversed mutable - """ - for mutable in self.undedup_mutables: - yield mutable.registered_module.get_ap_path_alpha() - - def change_forward_mode(self, mode): - """ - Update forward mode of MixedOps, as training architecture weights and - model weights use different forward modes. - """ - MixedOp.forward_mode = mode - - def get_forward_mode(self): - """ - Get forward mode of MixedOp - - Returns - ------- - string - the current forward mode of MixedOp - """ - return MixedOp.forward_mode - - def rescale_updated_arch_param(self): - """ - Rescale architecture weights in 'two' mode. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.rescale_updated_arch_param() - - def unused_modules_off(self): - """ - Remove unused modules for each mutables. 
- The removed modules are kept in ```self._unused_modules``` for resume later. - """ - self._unused_modules = [] - for mutable in self.undedup_mutables: - mixed_op = mutable.registered_module - unused = {} - if self.get_forward_mode() in ['full', 'two', 'full_v2']: - involved_index = mixed_op.active_index + mixed_op.inactive_index - else: - involved_index = mixed_op.active_index - for i in range(mixed_op.n_choices): - if i not in involved_index: - unused[i] = mutable[i] - mutable[i] = None - self._unused_modules.append(unused) - - def unused_modules_back(self): - """ - Resume the removed modules back. - """ - if self._unused_modules is None: - return - for m, unused in zip(self.mutable_list, self._unused_modules): - for i in unused: - m[i] = unused[i] - self._unused_modules = None - - def arch_requires_grad(self): - """ - Make architecture weights require gradient - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_requires_grad() - - def arch_disable_grad(self): - """ - Disable gradient of architecture weights, i.e., does not - calcuate gradient for them. - """ - for mutable in self.undedup_mutables: - mutable.registered_module.to_disable_grad() - - def sample_final(self): - """ - Generate the final chosen architecture. - - Returns - ------- - dict - the choice of each mutable, i.e., LayerChoice - """ - result = dict() - for mutable in self.undedup_mutables: - assert isinstance(mutable, LayerChoice) - index, _ = mutable.registered_module.chosen_index - # pylint: disable=not-callable - result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=len(mutable)).view(-1).bool() - return result diff --git a/nni/algorithms/nas/pytorch/proxylessnas/trainer.py b/nni/algorithms/nas/pytorch/proxylessnas/trainer.py deleted file mode 100644 index d9c86a6a9..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/trainer.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
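[Editor's note] Before the trainer file below, here is a minimal sketch of how the ProxylessNasMutator hooks defined above are typically sequenced for one architecture-weight update. It mirrors the `_gradient_step` / `_train` flow of the trainer that follows; `model`, `mutator`, `arch_optimizer`, `criterion`, and the validation batch are stand-in names for illustration, not part of the original file.

def arch_update_step(model, mutator, arch_optimizer, criterion,
                     images, labels, binary_mode='full_v2'):
    """Illustrative only: one update of architecture weights with binarized gates."""
    model.train()
    mutator.change_forward_mode(binary_mode)   # select forward mode for the binary gates
    mutator.arch_requires_grad()               # enable gradients on the alpha parameters
    mutator.reset_binary_gates()               # sample active op(s) per LayerChoice
    mutator.unused_modules_off()               # drop un-sampled ops to save memory
    loss = criterion(model(images), labels)
    model.zero_grad()
    loss.backward()                            # gradients land on the binary gates (ap_path_wb)
    mutator.set_arch_param_grad()              # translate gate gradients into alpha gradients
    arch_optimizer.step()
    if mutator.get_forward_mode() == 'two':
        mutator.rescale_updated_arch_param()   # keep the two sampled alphas on a consistent scale
    mutator.unused_modules_back()              # restore the removed ops
    mutator.arch_disable_grad()                # freeze alpha again before training model weights
    mutator.change_forward_mode(None)
    return loss.item()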
- -import math -import time -import json -import logging - -import torch -from torch import nn as nn - -from nni.nas.pytorch.base_trainer import BaseTrainer -from nni.nas.pytorch.trainer import TorchTensorEncoder -from nni.nas.pytorch.utils import AverageMeter -from .mutator import ProxylessNasMutator -from .utils import cross_entropy_with_label_smoothing, accuracy - -logger = logging.getLogger(__name__) - -class ProxylessNasTrainer(BaseTrainer): - def __init__(self, model, model_optim, device, - train_loader, valid_loader, label_smoothing=0.1, - n_epochs=120, init_lr=0.025, binary_mode='full_v2', - arch_init_type='normal', arch_init_ratio=1e-3, - arch_optim_lr=1e-3, arch_weight_decay=0, - grad_update_arch_param_every=5, grad_update_steps=1, - warmup=True, warmup_epochs=25, - arch_valid_frequency=1, - load_ckpt=False, ckpt_path=None, arch_path=None): - """ - Parameters - ---------- - model : pytorch model - the user model, which has mutables - model_optim : pytorch optimizer - the user defined optimizer - device : pytorch device - the devices to train/search the model - train_loader : pytorch data loader - data loader for the training set - valid_loader : pytorch data loader - data loader for the validation set - label_smoothing : float - for label smoothing - n_epochs : int - number of epochs to train/search - init_lr : float - init learning rate for training the model - binary_mode : str - the forward/backward mode for the binary weights in mutator - arch_init_type : str - the way to init architecture parameters - arch_init_ratio : float - the ratio to init architecture parameters - arch_optim_lr : float - learning rate of the architecture parameters optimizer - arch_weight_decay : float - weight decay of the architecture parameters optimizer - grad_update_arch_param_every : int - update architecture weights every this number of minibatches - grad_update_steps : int - during each update of architecture weights, the number of steps to train - warmup : bool - whether to do warmup - warmup_epochs : int - the number of epochs to do during warmup - arch_valid_frequency : int - frequency of printing validation result - load_ckpt : bool - whether load checkpoint - ckpt_path : str - checkpoint path, if load_ckpt is True, ckpt_path cannot be None - arch_path : str - the path to store chosen architecture - """ - self.model = model - self.model_optim = model_optim - self.train_loader = train_loader - self.valid_loader = valid_loader - self.device = device - self.n_epochs = n_epochs - self.init_lr = init_lr - self.warmup = warmup - self.warmup_epochs = warmup_epochs - self.arch_valid_frequency = arch_valid_frequency - self.label_smoothing = label_smoothing - - self.train_batch_size = train_loader.batch_sampler.batch_size - self.valid_batch_size = valid_loader.batch_sampler.batch_size - # update architecture parameters every this number of minibatches - self.grad_update_arch_param_every = grad_update_arch_param_every - # the number of steps per architecture parameter update - self.grad_update_steps = grad_update_steps - self.binary_mode = binary_mode - - self.load_ckpt = load_ckpt - self.ckpt_path = ckpt_path - self.arch_path = arch_path - - # init mutator - self.mutator = ProxylessNasMutator(model) - - # DataParallel should be put behind the init of mutator - self.model = torch.nn.DataParallel(self.model) - self.model.to(self.device) - - # iter of valid dataset for training architecture weights - self._valid_iter = None - # init architecture weights - self._init_arch_params(arch_init_type, 
arch_init_ratio) - # build architecture optimizer - self.arch_optimizer = torch.optim.Adam(self.mutator.get_architecture_parameters(), - arch_optim_lr, - weight_decay=arch_weight_decay, - betas=(0, 0.999), - eps=1e-8) - - self.criterion = nn.CrossEntropyLoss() - self.warmup_curr_epoch = 0 - self.train_curr_epoch = 0 - - def _init_arch_params(self, init_type='normal', init_ratio=1e-3): - """ - Initialize architecture weights - """ - for param in self.mutator.get_architecture_parameters(): - if init_type == 'normal': - param.data.normal_(0, init_ratio) - elif init_type == 'uniform': - param.data.uniform_(-init_ratio, init_ratio) - else: - raise NotImplementedError - - def _validate(self): - """ - Do validation. During validation, LayerChoices use the chosen active op. - - Returns - ------- - float, float, float - average loss, average top1 accuracy, average top5 accuracy - """ - self.valid_loader.batch_sampler.batch_size = self.valid_batch_size - self.valid_loader.batch_sampler.drop_last = False - - self.mutator.set_chosen_op_active() - # remove unused modules to save memory - self.mutator.unused_modules_off() - # test on validation set under train mode - self.model.train() - batch_time = AverageMeter('batch_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - end = time.time() - with torch.no_grad(): - for i, (images, labels) in enumerate(self.valid_loader): - images, labels = images.to(self.device), labels.to(self.device) - output = self.model(images) - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == len(self.valid_loader): - test_log = 'Valid' + ': [{0}/{1}]\t'\ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\ - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'\ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})'.\ - format(i, len(self.valid_loader) - 1, batch_time=batch_time, loss=losses, top1=top1) - # return top5: - test_log += '\tTop-5 acc {top5.val:.3f} ({top5.avg:.3f})'.format(top5=top5) - logger.info(test_log) - self.mutator.unused_modules_back() - return losses.avg, top1.avg, top5.avg - - def _warm_up(self): - """ - Warm up the model, during warm up, architecture weights are not trained. 
- """ - lr_max = 0.05 - data_loader = self.train_loader - nBatch = len(data_loader) - T_total = self.warmup_epochs * nBatch # total num of batches - - for epoch in range(self.warmup_curr_epoch, self.warmup_epochs): - logger.info('\n--------Warmup epoch: %d--------\n', epoch + 1) - batch_time = AverageMeter('batch_time') - data_time = AverageMeter('data_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - # switch to train mode - self.model.train() - - end = time.time() - logger.info('warm_up epoch: %d', epoch) - for i, (images, labels) in enumerate(data_loader): - data_time.update(time.time() - end) - # lr - T_cur = epoch * nBatch + i - warmup_lr = 0.5 * lr_max * (1 + math.cos(math.pi * T_cur / T_total)) - for param_group in self.model_optim.param_groups: - param_group['lr'] = warmup_lr - images, labels = images.to(self.device), labels.to(self.device) - # compute output - self.mutator.reset_binary_gates() # random sample binary gates - self.mutator.unused_modules_off() # remove unused module for speedup - output = self.model(images) - if self.label_smoothing > 0: - loss = cross_entropy_with_label_smoothing(output, labels, self.label_smoothing) - else: - loss = self.criterion(output, labels) - # measure accuracy and record loss - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - # compute gradient and do SGD step - self.model.zero_grad() - loss.backward() - self.model_optim.step() - # unused modules back - self.mutator.unused_modules_back() - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0 or i + 1 == nBatch: - batch_log = 'Warmup Train [{0}][{1}/{2}]\t' \ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \ - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' \ - 'Loss {losses.val:.4f} ({losses.avg:.4f})\t' \ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t' \ - 'Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\tlr {lr:.5f}'. \ - format(epoch + 1, i, nBatch - 1, batch_time=batch_time, data_time=data_time, - losses=losses, top1=top1, top5=top5, lr=warmup_lr) - logger.info(batch_log) - val_loss, val_top1, val_top5 = self._validate() - val_log = 'Warmup Valid [{0}/{1}]\tloss {2:.3f}\ttop-1 acc {3:.3f}\ttop-5 acc {4:.3f}\t' \ - 'Train top-1 {top1.avg:.3f}\ttop-5 {top5.avg:.3f}M'. \ - format(epoch + 1, self.warmup_epochs, val_loss, val_top1, val_top5, top1=top1, top5=top5) - logger.info(val_log) - self.save_checkpoint() - self.warmup_curr_epoch += 1 - - def _get_update_schedule(self, nBatch): - """ - Generate schedule for training architecture weights. Key means after which minibatch - to update architecture weights, value means how many steps for the update. - - Parameters - ---------- - nBatch : int - the total number of minibatches in one epoch - - Returns - ------- - dict - the schedule for updating architecture weights - """ - schedule = {} - for i in range(nBatch): - if (i + 1) % self.grad_update_arch_param_every == 0: - schedule[i] = self.grad_update_steps - return schedule - - def _calc_learning_rate(self, epoch, batch=0, nBatch=None): - """ - Update learning rate. 
- """ - T_total = self.n_epochs * nBatch - T_cur = epoch * nBatch + batch - lr = 0.5 * self.init_lr * (1 + math.cos(math.pi * T_cur / T_total)) - return lr - - def _adjust_learning_rate(self, optimizer, epoch, batch=0, nBatch=None): - """ - Adjust learning of a given optimizer and return the new learning rate - - Parameters - ---------- - optimizer : pytorch optimizer - the used optimizer - epoch : int - the current epoch number - batch : int - the current minibatch - nBatch : int - the total number of minibatches in one epoch - - Returns - ------- - float - the adjusted learning rate - """ - new_lr = self._calc_learning_rate(epoch, batch, nBatch) - for param_group in optimizer.param_groups: - param_group['lr'] = new_lr - return new_lr - - def _train(self): - """ - Train the model, it trains model weights and architecute weights. - Architecture weights are trained according to the schedule. - Before updating architecture weights, ```requires_grad``` is enabled. - Then, it is disabled after the updating, in order not to update - architecture weights when training model weights. - """ - nBatch = len(self.train_loader) - arch_param_num = self.mutator.num_arch_params() - binary_gates_num = self.mutator.num_arch_params() - logger.info('#arch_params: %d\t#binary_gates: %d', arch_param_num, binary_gates_num) - - update_schedule = self._get_update_schedule(nBatch) - - for epoch in range(self.train_curr_epoch, self.n_epochs): - logger.info('\n--------Train epoch: %d--------\n', epoch + 1) - batch_time = AverageMeter('batch_time') - data_time = AverageMeter('data_time') - losses = AverageMeter('losses') - top1 = AverageMeter('top1') - top5 = AverageMeter('top5') - # switch to train mode - self.model.train() - - end = time.time() - for i, (images, labels) in enumerate(self.train_loader): - data_time.update(time.time() - end) - lr = self._adjust_learning_rate(self.model_optim, epoch, batch=i, nBatch=nBatch) - # train weight parameters - images, labels = images.to(self.device), labels.to(self.device) - self.mutator.reset_binary_gates() - self.mutator.unused_modules_off() - output = self.model(images) - if self.label_smoothing > 0: - loss = cross_entropy_with_label_smoothing(output, labels, self.label_smoothing) - else: - loss = self.criterion(output, labels) - acc1, acc5 = accuracy(output, labels, topk=(1, 5)) - losses.update(loss, images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - self.model.zero_grad() - loss.backward() - self.model_optim.step() - self.mutator.unused_modules_back() - if epoch > 0: - for _ in range(update_schedule.get(i, 0)): - start_time = time.time() - # GradientArchSearchConfig - self.mutator.arch_requires_grad() - arch_loss, exp_value = self._gradient_step() - self.mutator.arch_disable_grad() - used_time = time.time() - start_time - log_str = 'Architecture [%d-%d]\t Time %.4f\t Loss %.4f\t null %s' % \ - (epoch + 1, i, used_time, arch_loss, exp_value) - logger.info(log_str) - batch_time.update(time.time() - end) - end = time.time() - # training log - if i % 10 == 0 or i + 1 == nBatch: - batch_log = 'Train [{0}][{1}/{2}]\t' \ - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \ - 'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t' \ - 'Loss {losses.val:.4f} ({losses.avg:.4f})\t' \ - 'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})\t' \ - 'Top-5 acc {top5.val:.3f} ({top5.avg:.3f})\tlr {lr:.5f}'. 
\ - format(epoch + 1, i, nBatch - 1, batch_time=batch_time, data_time=data_time, - losses=losses, top1=top1, top5=top5, lr=lr) - logger.info(batch_log) - # validate - if (epoch + 1) % self.arch_valid_frequency == 0: - val_loss, val_top1, val_top5 = self._validate() - val_log = 'Valid [{0}]\tloss {1:.3f}\ttop-1 acc {2:.3f} \ttop-5 acc {3:.3f}\t' \ - 'Train top-1 {top1.avg:.3f}\ttop-5 {top5.avg:.3f}'. \ - format(epoch + 1, val_loss, val_top1, val_top5, top1=top1, top5=top5) - logger.info(val_log) - self.save_checkpoint() - self.train_curr_epoch += 1 - - def _valid_next_batch(self): - """ - Get next one minibatch from validation set - - Returns - ------- - (tensor, tensor) - the tuple of images and labels - """ - if self._valid_iter is None: - self._valid_iter = iter(self.valid_loader) - try: - data = next(self._valid_iter) - except StopIteration: - self._valid_iter = iter(self.valid_loader) - data = next(self._valid_iter) - return data - - def _gradient_step(self): - """ - This gradient step is for updating architecture weights. - Mutator is intensively used in this function to operate on - architecture weights. - - Returns - ------- - float, None - loss of the model, None - """ - # use the same batch size as train batch size for architecture weights - self.valid_loader.batch_sampler.batch_size = self.train_batch_size - self.valid_loader.batch_sampler.drop_last = True - self.model.train() - self.mutator.change_forward_mode(self.binary_mode) - time1 = time.time() # time - # sample a batch of data from validation set - images, labels = self._valid_next_batch() - images, labels = images.to(self.device), labels.to(self.device) - time2 = time.time() # time - self.mutator.reset_binary_gates() - self.mutator.unused_modules_off() - output = self.model(images) - time3 = time.time() - ce_loss = self.criterion(output, labels) - expected_value = None - loss = ce_loss - self.model.zero_grad() - loss.backward() - self.mutator.set_arch_param_grad() - self.arch_optimizer.step() - if self.mutator.get_forward_mode() == 'two': - self.mutator.rescale_updated_arch_param() - self.mutator.unused_modules_back() - self.mutator.change_forward_mode(None) - time4 = time.time() - logger.info('(%.4f, %.4f, %.4f)', time2 - time1, time3 - time2, time4 - time3) - return loss.data.item(), expected_value.item() if expected_value is not None else None - - def save_checkpoint(self): - """ - Save checkpoint of the whole model. Saving model weights and architecture weights in - ```ckpt_path```, and saving currently chosen architecture in ```arch_path```. - """ - if self.ckpt_path: - state = { - 'warmup_curr_epoch': self.warmup_curr_epoch, - 'train_curr_epoch': self.train_curr_epoch, - 'model': self.model.state_dict(), - 'optim': self.model_optim.state_dict(), - 'arch_optim': self.arch_optimizer.state_dict() - } - torch.save(state, self.ckpt_path) - if self.arch_path: - self.export(self.arch_path) - - def load_checkpoint(self): - """ - Load the checkpoint from ```ckpt_path```. - """ - assert self.ckpt_path is not None, "If load_ckpt is not None, ckpt_path should not be None" - ckpt = torch.load(self.ckpt_path) - self.warmup_curr_epoch = ckpt['warmup_curr_epoch'] - self.train_curr_epoch = ckpt['train_curr_epoch'] - self.model.load_state_dict(ckpt['model']) - self.model_optim.load_state_dict(ckpt['optim']) - self.arch_optimizer.load_state_dict(ckpt['arch_optim']) - - def train(self): - """ - Train the whole model. 
- """ - if self.load_ckpt: - self.load_checkpoint() - if self.warmup: - self._warm_up() - self._train() - - def export(self, file_name): - """ - Export the chosen architecture into a file - - Parameters - ---------- - file_name : str - the file that stores exported chosen architecture - """ - exported_arch = self.mutator.sample_final() - with open(file_name, 'w') as f: - json.dump(exported_arch, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def validate(self): - raise NotImplementedError - - def checkpoint(self): - raise NotImplementedError diff --git a/nni/algorithms/nas/pytorch/proxylessnas/utils.py b/nni/algorithms/nas/pytorch/proxylessnas/utils.py deleted file mode 100644 index c532efc04..000000000 --- a/nni/algorithms/nas/pytorch/proxylessnas/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - -def detach_variable(inputs): - """ - Detach variables - - Parameters - ---------- - inputs : pytorch tensors - pytorch tensors - """ - if isinstance(inputs, tuple): - return tuple([detach_variable(x) for x in inputs]) - else: - x = inputs.detach() - x.requires_grad = inputs.requires_grad - return x - -def cross_entropy_with_label_smoothing(pred, target, label_smoothing=0.1): - """ - Parameters - ---------- - pred : pytorch tensor - predicted value - target : pytorch tensor - label - label_smoothing : float - the degree of label smoothing - - Returns - ------- - pytorch tensor - cross entropy - """ - logsoftmax = nn.LogSoftmax() - n_classes = pred.size(1) - # convert to one-hot - target = torch.unsqueeze(target, 1) - soft_target = torch.zeros_like(pred) - soft_target.scatter_(1, target, 1) - # label smoothing - soft_target = soft_target * (1 - label_smoothing) + label_smoothing / n_classes - return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) - -def accuracy(output, target, topk=(1,)): - """ - Computes the precision@k for the specified values of k - - Parameters - ---------- - output : pytorch tensor - output, e.g., predicted value - target : pytorch tensor - label - topk : tuple - specify top1 and top5 - - Returns - ------- - list - accuracy of top1 and top5 - """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res diff --git a/nni/algorithms/nas/pytorch/random/__init__.py b/nni/algorithms/nas/pytorch/random/__init__.py deleted file mode 100644 index 0ff4a7795..000000000 --- a/nni/algorithms/nas/pytorch/random/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import RandomMutator diff --git a/nni/algorithms/nas/pytorch/random/mutator.py b/nni/algorithms/nas/pytorch/random/mutator.py deleted file mode 100644 index 22ecc0831..000000000 --- a/nni/algorithms/nas/pytorch/random/mutator.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn.functional as F - -from nni.nas.pytorch.mutator import Mutator -from nni.nas.pytorch.mutables import LayerChoice, InputChoice - - -class RandomMutator(Mutator): - """ - Random mutator that samples a random candidate in the search space each time ``reset()``. 
- It uses random function in PyTorch, so users can set seed in PyTorch to ensure deterministic behavior. - """ - - def sample_search(self): - """ - Sample a random candidate. - """ - result = dict() - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - gen_index = torch.randint(high=len(mutable), size=(1, )) - result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool() - elif isinstance(mutable, InputChoice): - if mutable.n_chosen is None: - result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool() - else: - perm = torch.randperm(mutable.n_candidates) - mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)] - result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable - return result - - def sample_final(self): - """ - Same as :meth:`sample_search`. - """ - return self.sample_search() diff --git a/nni/algorithms/nas/pytorch/spos/__init__.py b/nni/algorithms/nas/pytorch/spos/__init__.py deleted file mode 100644 index ed432b084..000000000 --- a/nni/algorithms/nas/pytorch/spos/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .evolution import SPOSEvolution -from .mutator import SPOSSupernetTrainingMutator -from .trainer import SPOSSupernetTrainer diff --git a/nni/algorithms/nas/pytorch/spos/evolution.py b/nni/algorithms/nas/pytorch/spos/evolution.py deleted file mode 100644 index bd099e276..000000000 --- a/nni/algorithms/nas/pytorch/spos/evolution.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import re -from collections import deque - -import numpy as np -from nni.tuner import Tuner -from nni.algorithms.nas.pytorch.classic_nas.mutator import LAYER_CHOICE, INPUT_CHOICE - - -_logger = logging.getLogger(__name__) - - -class SPOSEvolution(Tuner): - """ - SPOS evolution tuner. - - Parameters - ---------- - max_epochs : int - Maximum number of epochs to run. - num_select : int - Number of survival candidates of each epoch. - num_population : int - Number of candidates at the start of each epoch. If candidates generated by - crossover and mutation are not enough, the rest will be filled with random - candidates. - m_prob : float - The probability of mutation. - num_crossover : int - Number of candidates generated by crossover in each epoch. - num_mutation : int - Number of candidates generated by mutation in each epoch. - """ - - def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, - num_crossover=25, num_mutation=25): - assert num_population >= num_select - self.max_epochs = max_epochs - self.num_select = num_select - self.num_population = num_population - self.m_prob = m_prob - self.num_crossover = num_crossover - self.num_mutation = num_mutation - self.epoch = 0 - self.candidates = [] - self.search_space = None - self.random_state = np.random.RandomState(0) - - # async status - self._to_evaluate_queue = deque() - self._sending_parameter_queue = deque() - self._pending_result_ids = set() - self._reward_dict = dict() - self._id2candidate = dict() - self._st_callback = None - - def update_search_space(self, search_space): - """ - Handle the initialization/update event of search space. 
- """ - self._search_space = search_space - self._next_round() - - def _next_round(self): - _logger.info("Epoch %d, generating...", self.epoch) - if self.epoch == 0: - self._get_random_population() - self.export_results(self.candidates) - else: - best_candidates = self._select_top_candidates() - self.export_results(best_candidates) - if self.epoch >= self.max_epochs: - return - self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) - self._get_random_population() - self.epoch += 1 - - def _random_candidate(self): - chosen_arch = dict() - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - index = self.random_state.randint(len(choices)) - chosen_arch[key] = {"_value": choices[index], "_idx": index} - elif val["_type"] == INPUT_CHOICE: - raise NotImplementedError("Input choice is not implemented yet.") - return chosen_arch - - def _add_to_evaluate_queue(self, cand): - _logger.info("Generate candidate %s, adding to eval queue.", self._get_architecture_repr(cand)) - self._reward_dict[self._hashcode(cand)] = 0. - self._to_evaluate_queue.append(cand) - - def _get_random_population(self): - while len(self.candidates) < self.num_population: - cand = self._random_candidate() - if self._is_legal(cand): - _logger.info("Random candidate generated.") - self._add_to_evaluate_queue(cand) - self.candidates.append(cand) - - def _get_crossover(self, best): - result = [] - for _ in range(10 * self.num_crossover): - cand_p1 = best[self.random_state.randint(len(best))] - cand_p2 = best[self.random_state.randint(len(best))] - assert cand_p1.keys() == cand_p2.keys() - cand = {k: cand_p1[k] if self.random_state.randint(2) == 0 else cand_p2[k] - for k in cand_p1.keys()} - if self._is_legal(cand): - result.append(cand) - self._add_to_evaluate_queue(cand) - if len(result) >= self.num_crossover: - break - _logger.info("Found %d architectures with crossover.", len(result)) - return result - - def _get_mutation(self, best): - result = [] - for _ in range(10 * self.num_mutation): - cand = best[self.random_state.randint(len(best))].copy() - mutation_sample = np.random.random_sample(len(cand)) - for s, k in zip(mutation_sample, cand): - if s < self.m_prob: - choices = self._search_space[k]["_value"] - index = self.random_state.randint(len(choices)) - cand[k] = {"_value": choices[index], "_idx": index} - if self._is_legal(cand): - result.append(cand) - self._add_to_evaluate_queue(cand) - if len(result) >= self.num_mutation: - break - _logger.info("Found %d architectures with mutation.", len(result)) - return result - - def _get_architecture_repr(self, cand): - return re.sub(r"\".*?\": \{\"_idx\": (\d+), \"_value\": \".*?\"\}", r"\1", - self._hashcode(cand)) - - def _is_legal(self, cand): - if self._hashcode(cand) in self._reward_dict: - return False - return True - - def _select_top_candidates(self): - reward_query = lambda cand: self._reward_dict[self._hashcode(cand)] - _logger.info("All candidate rewards: %s", list(map(reward_query, self.candidates))) - result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select] - _logger.info("Best candidate rewards: %s", list(map(reward_query, result))) - return result - - @staticmethod - def _hashcode(d): - return json.dumps(d, sort_keys=True) - - def _bind_and_send_parameters(self): - """ - There are two types of resources: parameter ids and candidates. This function is called at - necessary times to bind these resources to send new trials with st_callback. 
- """ - result = [] - while self._sending_parameter_queue and self._to_evaluate_queue: - parameter_id = self._sending_parameter_queue.popleft() - parameters = self._to_evaluate_queue.popleft() - self._id2candidate[parameter_id] = parameters - result.append(parameters) - self._pending_result_ids.add(parameter_id) - self._st_callback(parameter_id, parameters) - _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters)) - return result - - def generate_multiple_parameters(self, parameter_id_list, **kwargs): - """ - Callback function necessary to implement a tuner. This will put more parameter ids into the - parameter id queue. - """ - if "st_callback" in kwargs and self._st_callback is None: - self._st_callback = kwargs["st_callback"] - for parameter_id in parameter_id_list: - self._sending_parameter_queue.append(parameter_id) - self._bind_and_send_parameters() - return [] # always not use this. might induce problem of over-sending - - def receive_trial_result(self, parameter_id, parameters, value, **kwargs): - """ - Callback function. Receive a trial result. - """ - _logger.info("Candidate %d, reported reward %f", parameter_id, value) - self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value - - def trial_end(self, parameter_id, success, **kwargs): - """ - Callback function when a trial is ended and resource is released. - """ - self._pending_result_ids.remove(parameter_id) - if not self._pending_result_ids and not self._to_evaluate_queue: - # a new epoch now - self._next_round() - assert self._st_callback is not None - self._bind_and_send_parameters() - - def export_results(self, result): - """ - Export a number of candidates to `checkpoints` dir. - - Parameters - ---------- - result : dict - Chosen architectures to be exported. - """ - os.makedirs("checkpoints", exist_ok=True) - for i, cand in enumerate(result): - converted = dict() - for cand_key, cand_val in cand.items(): - onehot = [k == cand_val["_idx"] for k in range(len(self._search_space[cand_key]["_value"]))] - converted[cand_key] = onehot - with open(os.path.join("checkpoints", "%03d_%03d.json" % (self.epoch, i)), "w") as fp: - json.dump(converted, fp) diff --git a/nni/algorithms/nas/pytorch/spos/mutator.py b/nni/algorithms/nas/pytorch/spos/mutator.py deleted file mode 100644 index 1a803cb2e..000000000 --- a/nni/algorithms/nas/pytorch/spos/mutator.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import numpy as np -from nni.algorithms.nas.pytorch.random import RandomMutator - -_logger = logging.getLogger(__name__) - - -class SPOSSupernetTrainingMutator(RandomMutator): - """ - A random mutator with flops limit. - - Parameters - ---------- - model : nn.Module - PyTorch model. - flops_func : callable - Callable that takes a candidate from `sample_search` and returns its candidate. When `flops_func` - is None, functions related to flops will be deactivated. - flops_lb : number - Lower bound of flops. - flops_ub : number - Upper bound of flops. - flops_bin_num : number - Number of bins divided for the interval of flops to ensure the uniformity. Bigger number will be more - uniform, but the sampling will be slower. - flops_sample_timeout : int - Maximum number of attempts to sample before giving up and use a random candidate. 
- """ - def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, - flops_bin_num=7, flops_sample_timeout=500): - - super().__init__(model) - self._flops_func = flops_func - if self._flops_func is not None: - self._flops_bin_num = flops_bin_num - self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num + 1)] - self._flops_sample_timeout = flops_sample_timeout - - def sample_search(self): - """ - Sample a candidate for training. When `flops_func` is not None, candidates will be sampled uniformly - relative to flops. - - Returns - ------- - dict - """ - if self._flops_func is not None: - for times in range(self._flops_sample_timeout): - idx = np.random.randint(self._flops_bin_num) - cand = super().sample_search() - if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]: - _logger.debug("Sampled candidate flops %f in %d times.", cand, times) - return cand - _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout) - return super().sample_search() - - def sample_final(self): - """ - Implement only to suffice the interface of Mutator. - """ - return self.sample_search() diff --git a/nni/algorithms/nas/pytorch/spos/trainer.py b/nni/algorithms/nas/pytorch/spos/trainer.py deleted file mode 100644 index 7c954e2ad..000000000 --- a/nni/algorithms/nas/pytorch/spos/trainer.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch -from nni.nas.pytorch.trainer import Trainer -from nni.nas.pytorch.utils import AverageMeterGroup - -from .mutator import SPOSSupernetTrainingMutator - -logger = logging.getLogger(__name__) - - -class SPOSSupernetTrainer(Trainer): - """ - This trainer trains a supernet that can be used for evolution search. - - Parameters - ---------- - model : nn.Module - Model with mutables. - mutator : nni.nas.pytorch.mutator.Mutator - A mutator object that has been initialized with the model. - loss : callable - Called with logits and targets. Returns a loss tensor. - metrics : callable - Returns a dict that maps metrics keys to metrics data. - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - train_loader : iterable - Data loader of training. Raise ``StopIteration`` when one epoch is exhausted. - dataset_valid : iterable - Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted. - batch_size : int - Batch size. - workers: int - Number of threads for data preprocessing. Not used for this trainer. Maybe removed in future. - device : torch.device - Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will - automatic detects GPU and selects GPU first. - log_frequency : int - Number of mini-batches to log metrics. - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. 
- """ - - def __init__(self, model, loss, metrics, - optimizer, num_epochs, train_loader, valid_loader, - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, - callbacks=None): - assert torch.cuda.is_available() - super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model), - loss, metrics, optimizer, num_epochs, None, None, - batch_size, workers, device, log_frequency, callbacks) - - self.train_loader = train_loader - self.valid_loader = valid_loader - - def train_one_epoch(self, epoch): - self.model.train() - meters = AverageMeterGroup() - for step, (x, y) in enumerate(self.train_loader): - x, y = x.to(self.device), y.to(self.device) - self.optimizer.zero_grad() - self.mutator.reset() - logits = self.model(x) - loss = self.loss(logits, y) - loss.backward() - self.optimizer.step() - - metrics = self.metrics(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.train_loader), meters) - - def validate_one_epoch(self, epoch): - self.model.eval() - meters = AverageMeterGroup() - with torch.no_grad(): - for step, (x, y) in enumerate(self.valid_loader): - x, y = x.to(self.device), y.to(self.device) - self.mutator.reset() - logits = self.model(x) - loss = self.loss(logits, y) - metrics = self.metrics(logits, y) - metrics["loss"] = loss.item() - meters.update(metrics) - if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, - self.num_epochs, step + 1, len(self.valid_loader), meters) diff --git a/nni/algorithms/nas/tensorflow/__init__.py b/nni/algorithms/nas/tensorflow/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/algorithms/nas/tensorflow/classic_nas/__init__.py b/nni/algorithms/nas/tensorflow/classic_nas/__init__.py deleted file mode 100644 index ec3f5a489..000000000 --- a/nni/algorithms/nas/tensorflow/classic_nas/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .mutator import get_and_apply_next_architecture diff --git a/nni/algorithms/nas/tensorflow/classic_nas/mutator.py b/nni/algorithms/nas/tensorflow/classic_nas/mutator.py deleted file mode 100644 index cb089c49b..000000000 --- a/nni/algorithms/nas/tensorflow/classic_nas/mutator.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import json -import logging -import os -import sys - -import tensorflow as tf - -import nni -from nni.runtime.env_vars import trial_env_vars -from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope -from nni.nas.tensorflow.mutator import Mutator - -logger = logging.getLogger(__name__) - -NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE" -LAYER_CHOICE = "layer_choice" -INPUT_CHOICE = "input_choice" - - -def get_and_apply_next_architecture(model): - """ - Wrapper of :class:`~nni.nas.tensorflow.classic_nas.mutator.ClassicMutator` to make it more meaningful, - similar to ``get_next_parameter`` for HPO. - Tt will generate search space based on ``model``. - If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for - generating search space for the experiment. - If not, there are still two mode, one is nni experiment mode where users - use ``nnictl`` to start an experiment. 
The other is standalone mode - where users directly run the trial command, this mode chooses the first - one(s) for each LayerChoice and InputChoice. - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - ClassicMutator(model) - - -class ClassicMutator(Mutator): - """ - This mutator is to apply the architecture chosen from tuner. - It implements the forward function of LayerChoice and InputChoice, - to only activate the chosen ones. - Parameters - ---------- - model : nn.Module - User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. - """ - - def __init__(self, model): - super(ClassicMutator, self).__init__(model) - self._chosen_arch = {} - self._search_space = self._generate_search_space() - if NNI_GEN_SEARCH_SPACE in os.environ: - # dry run for only generating search space - self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE]) - sys.exit(0) - - if trial_env_vars.NNI_PLATFORM is None: - logger.warning("This is in standalone mode, the chosen are the first one(s).") - self._chosen_arch = self._standalone_generate_chosen() - else: - # get chosen arch from tuner - self._chosen_arch = nni.get_next_parameter() - if self._chosen_arch is None: - if trial_env_vars.NNI_PLATFORM == "unittest": - # happens if NNI_PLATFORM is intentionally set, e.g., in UT - logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.") - self._chosen_arch = self._standalone_generate_chosen() - else: - raise RuntimeError("Chosen architecture is None. This may be a platform error.") - self.reset() - - def _sample_layer_choice(self, mutable, idx, value, search_space_item): - """ - Convert layer choice to tensor representation. - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - # doesn't support multihot for layer choice yet - assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) - mask = tf.one_hot(idx, len(mutable)) - return tf.cast(tf.reshape(mask, [-1]), tf.bool) - - def _sample_input_choice(self, mutable, idx, value, search_space_item): - """ - Convert input choice to tensor representation. - Parameters - ---------- - mutable : Mutable - idx : int - Number `idx` of list will be selected. - value : str - The verbose representation of the selected value. - search_space_item : list - The list for corresponding search space. - """ - candidate_repr = search_space_item["candidates"] - multihot_list = [False] * mutable.n_candidates - for i, v in zip(idx, value): - assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \ - "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v) - assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) - multihot_list[i] = True - return tf.cast(multihot_list, tf.bool) # pylint: disable=not-callable - - def sample_search(self): - """ - See :meth:`sample_final`. - """ - return self.sample_final() - - def sample_final(self): - """ - Convert the chosen arch and apply it on model. 
- """ - assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ - "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), - self._chosen_arch.keys()) - result = dict() - for mutable in self.mutables: - if isinstance(mutable, (LayerChoice, InputChoice)): - assert mutable.key in self._chosen_arch, \ - "Expected '{}' in chosen arch, but not found.".format(mutable.key) - data = self._chosen_arch[mutable.key] - assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ - "'{}' is not a valid choice.".format(data) - if isinstance(mutable, LayerChoice): - result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, InputChoice): - result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"], - self._search_space[mutable.key]["_value"]) - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return result - - def _standalone_generate_chosen(self): - """ - Generate the chosen architecture for standalone mode, - i.e., choose the first one(s) for LayerChoice and InputChoice. - :: - { key_name: {"_value": "conv1", - "_idx": 0} } - { key_name: {"_value": ["in1"], - "_idx": [0]} } - Returns - ------- - dict - the chosen architecture - """ - chosen_arch = {} - for key, val in self._search_space.items(): - if val["_type"] == LAYER_CHOICE: - choices = val["_value"] - chosen_arch[key] = {"_value": choices[0], "_idx": 0} - elif val["_type"] == INPUT_CHOICE: - choices = val["_value"]["candidates"] - n_chosen = val["_value"]["n_chosen"] - if n_chosen is None: - n_chosen = len(choices) - chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} - else: - raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) - return chosen_arch - - def _generate_search_space(self): - """ - Generate search space from mutables. - Here is the search space format: - :: - { key_name: {"_type": "layer_choice", - "_value": ["conv1", "conv2"]} } - { key_name: {"_type": "input_choice", - "_value": {"candidates": ["in1", "in2"], - "n_chosen": 1}} } - Returns - ------- - dict - the generated search space - """ - search_space = {} - for mutable in self.mutables: - # for now we only generate flattened search space - if isinstance(mutable, LayerChoice): - key = mutable.key - val = mutable.names - search_space[key] = {"_type": LAYER_CHOICE, "_value": val} - elif isinstance(mutable, InputChoice): - key = mutable.key - search_space[key] = {"_type": INPUT_CHOICE, - "_value": {"candidates": mutable.choose_from, - "n_chosen": mutable.n_chosen}} - elif isinstance(mutable, MutableScope): - logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key) - else: - raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) - return search_space - - def _dump_search_space(self, file_path): - with open(file_path, "w") as ss_file: - json.dump(self._search_space, ss_file, sort_keys=True, indent=2) diff --git a/nni/algorithms/nas/tensorflow/enas/__init__.py b/nni/algorithms/nas/tensorflow/enas/__init__.py deleted file mode 100644 index d3372836e..000000000 --- a/nni/algorithms/nas/tensorflow/enas/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
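[Editor's note] The search-space and chosen-architecture formats documented in the TensorFlow ClassicMutator above can be made concrete with a small, self-contained example. The key names "conv_block" and "skip_input" are hypothetical placeholders; the standalone rule of picking the first candidate(s) follows the `_standalone_generate_chosen` logic shown above.

import json

# Search space in the shape _generate_search_space emits (flattened, one entry per mutable).
search_space = {
    "conv_block": {"_type": "layer_choice",
                   "_value": ["conv3x3", "conv5x5", "maxpool"]},
    "skip_input": {"_type": "input_choice",
                   "_value": {"candidates": ["in1", "in2", "in3"], "n_chosen": 1}},
}

# Standalone-mode choice: take the first candidate(s) of every mutable.
chosen_arch = {}
for key, val in search_space.items():
    if val["_type"] == "layer_choice":
        chosen_arch[key] = {"_value": val["_value"][0], "_idx": 0}
    else:  # input_choice
        candidates = val["_value"]["candidates"]
        n_chosen = val["_value"]["n_chosen"] or len(candidates)
        chosen_arch[key] = {"_value": candidates[:n_chosen], "_idx": list(range(n_chosen))}

print(json.dumps(chosen_arch, indent=2, sort_keys=True))
# conv_block picks "conv3x3" (_idx 0); skip_input picks ["in1"] (_idx [0]),
# which is exactly what ClassicMutator would then convert to boolean masks in sample_final.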
- -from .mutator import EnasMutator -from .trainer import EnasTrainer diff --git a/nni/algorithms/nas/tensorflow/enas/mutator.py b/nni/algorithms/nas/tensorflow/enas/mutator.py deleted file mode 100644 index 313c81cc9..000000000 --- a/nni/algorithms/nas/tensorflow/enas/mutator.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import tensorflow as tf -from tensorflow.keras.layers import Dense, Embedding, LSTMCell, RNN -from tensorflow.keras.losses import SparseCategoricalCrossentropy, Reduction - -from nni.nas.tensorflow.mutator import Mutator -from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope - - -class EnasMutator(Mutator): - def __init__(self, model, - lstm_size=64, - lstm_num_layers=1, - tanh_constant=1.5, - cell_exit_extra_step=False, - skip_target=0.4, - temperature=None, - branch_bias=0.25, - entropy_reduction='sum'): - super().__init__(model) - self.tanh_constant = tanh_constant - self.temperature = temperature - self.cell_exit_extra_step = cell_exit_extra_step - - cells = [LSTMCell(units=lstm_size, use_bias=False) for _ in range(lstm_num_layers)] - self.lstm = RNN(cells, stateful=True) - self.g_emb = tf.random.normal((1, 1, lstm_size)) * 0.1 - self.skip_targets = tf.constant([1.0 - skip_target, skip_target]) - - self.max_layer_choice = 0 - self.bias_dict = {} - for mutable in self.mutables: - if isinstance(mutable, LayerChoice): - if self.max_layer_choice == 0: - self.max_layer_choice = len(mutable) - assert self.max_layer_choice == len(mutable), \ - "ENAS mutator requires all layer choice have the same number of candidates." - if 'reduce' in mutable.key: - bias = [] - for choice in mutable.choices: - if 'conv' in str(type(choice)).lower(): - bias.append(branch_bias) - else: - bias.append(-branch_bias) - self.bias_dict[mutable.key] = tf.constant(bias) - - # exposed for trainer - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - # internal nn layers - self.embedding = Embedding(self.max_layer_choice + 1, lstm_size) - self.soft = Dense(self.max_layer_choice, use_bias=False) - self.attn_anchor = Dense(lstm_size, use_bias=False) - self.attn_query = Dense(lstm_size, use_bias=False) - self.v_attn = Dense(1, use_bias=False) - assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' 
- self.entropy_reduction = tf.reduce_sum if entropy_reduction == 'sum' else tf.reduce_mean - self.cross_entropy_loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE) - - self._first_sample = True - - def sample_search(self): - self._initialize() - self._sample(self.mutables) - self._first_sample = False - return self._choices - - def sample_final(self): - return self.sample_search() - - def _sample(self, tree): - mutable = tree.mutable - if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_layer_choice(mutable) - elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: - self._choices[mutable.key] = self._sample_input_choice(mutable) - for child in tree.children: - self._sample(child) - if self.cell_exit_extra_step and isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: - self._anchors_hid[mutable.key] = self.lstm(self._inputs, 1) - - def _initialize(self): - self._choices = {} - self._anchors_hid = {} - self._inputs = self.g_emb - # seems the `input_shape` parameter of RNN does not work - # workaround it by omitting `reset_states` for first run - if not self._first_sample: - self.lstm.reset_states() - self.sample_log_prob = 0 - self.sample_entropy = 0 - self.sample_skip_penalty = 0 - - def _sample_layer_choice(self, mutable): - logit = self.soft(self.lstm(self._inputs)) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * tf.tanh(logit) - if mutable.key in self.bias_dict: - logit += self.bias_dict[mutable.key] - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - branch_id = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [1]) - log_prob = self.cross_entropy_loss(branch_id, logit) - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = log_prob * tf.math.exp(-log_prob) - self.sample_entropy += self.entropy_reduction(entropy) - self._inputs = tf.reshape(self.embedding(branch_id), [1, 1, -1]) - mask = tf.one_hot(branch_id, self.max_layer_choice) - return tf.cast(tf.reshape(mask, [-1]), tf.bool) - - def _sample_input_choice(self, mutable): - query, anchors = [], [] - for label in mutable.choose_from: - if label not in self._anchors_hid: - self._anchors_hid[label] = self.lstm(self._inputs) - query.append(self.attn_anchor(self._anchors_hid[label])) - anchors.append(self._anchors_hid[label]) - query = tf.concat(query, axis=0) - query = tf.tanh(query + self.attn_query(anchors[-1])) - query = self.v_attn(query) - - if self.temperature is not None: - query /= self.temperature - if self.tanh_constant is not None: - query = self.tanh_constant * tf.tanh(query) - - if mutable.n_chosen is None: - logit = tf.concat([-query, query], axis=1) - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - skip = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) - skip_prob = tf.math.sigmoid(logit) - kl = tf.reduce_sum(skip_prob * tf.math.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(skip, logit) - - skip = tf.cast(skip, tf.float32) - inputs = tf.tensordot(skip, tf.concat(anchors, 0), 1) / (1. + tf.reduce_sum(skip)) - self._inputs = tf.reshape(inputs, [1, 1, -1]) - - else: - assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." 
- logit = tf.reshape(query, [1, -1]) - softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) - index = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) - skip = tf.reshape(tf.one_hot(index, mutable.n_candidates), [-1]) - # when the size is 1, tf does not accept tensor here, complaining the shape is wrong - # but using a numpy array seems fine - log_prob = self.cross_entropy_loss(logit, query.numpy()) - self._inputs = tf.reshape(anchors[index.numpy()[0]], [1, 1, -1]) - - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = log_prob * tf.exp(-log_prob) - self.sample_entropy += self.entropy_reduction(entropy) - assert len(skip) == mutable.n_candidates, (skip, mutable.n_candidates, mutable.n_chosen) - return tf.cast(skip, tf.bool) diff --git a/nni/algorithms/nas/tensorflow/enas/trainer.py b/nni/algorithms/nas/tensorflow/enas/trainer.py deleted file mode 100644 index 67df9c7f9..000000000 --- a/nni/algorithms/nas/tensorflow/enas/trainer.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# pylint: skip-file - -import logging - -import tensorflow as tf -from tensorflow.keras.optimizers import Adam - -from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads - -from .mutator import EnasMutator - -logger = logging.getLogger(__name__) - - -class EnasTrainer: - def __init__( - self, - model, - loss, - metrics, - reward_function, - optimizer, - batch_size, - num_epochs, - dataset_train, - dataset_valid, - log_frequency=100, - entropy_weight=0.0001, - skip_weight=0.8, - baseline_decay=0.999, - child_steps=500, - mutator_lr=0.00035, - mutator_steps=50, - mutator_steps_aggregate=20, - aux_weight=0.4, - test_arc_per_epoch=1, - ): - self.model = model - self.loss = loss - self.metrics = metrics - self.reward_function = reward_function - self.optimizer = optimizer - self.batch_size = batch_size - self.num_epochs = num_epochs - - x, y = dataset_train - split = int(len(x) * 0.9) - self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split])) - self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:])) - self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid) - - self.log_frequency = log_frequency - self.entropy_weight = entropy_weight - self.skip_weight = skip_weight - self.baseline_decay = baseline_decay - self.child_steps = child_steps - self.mutator_lr = mutator_lr - self.mutator_steps = mutator_steps - self.mutator_steps_aggregate = mutator_steps_aggregate - self.aux_weight = aux_weight - self.test_arc_per_epoch = test_arc_per_epoch - - self.mutator = EnasMutator(model) - self.mutator_optim = Adam(learning_rate=self.mutator_lr) - - self.baseline = 0.0 - - def train(self, validate=True): - for epoch in range(self.num_epochs): - logger.info("Epoch %d Training", epoch + 1) - self.train_one_epoch(epoch) - logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) - - def validate(self): - self.validate_one_epoch(-1) - - def train_one_epoch(self, epoch): - train_loader, valid_loader = self._create_train_loader() - - # Sample model and train - meters = AverageMeterGroup() - - for step in range(1, self.child_steps + 1): - x, y = next(train_loader) - self.mutator.reset() - - with tf.GradientTape() as tape: - logits = self.model(x, training=True) - if isinstance(logits, tuple): - logits, aux_logits = logits - aux_loss = self.loss(aux_logits, y) - else: - aux_loss = 0.0 - metrics = self.metrics(y, logits) - loss = self.loss(y, logits) + 
self.aux_weight * aux_loss - - grads = tape.gradient(loss, self.model.trainable_weights) - grads = fill_zero_grads(grads, self.model.trainable_weights) - grads, _ = tf.clip_by_global_norm(grads, 5.0) - self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) - - metrics["loss"] = tf.reduce_mean(loss).numpy() - meters.update(metrics) - - if self.log_frequency and step % self.log_frequency == 0: - logger.info( - "Model Epoch [%d/%d] Step [%d/%d] %s", - epoch + 1, - self.num_epochs, - step, - self.child_steps, - meters, - ) - - # Train sampler (mutator) - meters = AverageMeterGroup() - for mutator_step in range(1, self.mutator_steps + 1): - grads_list = [] - for step in range(1, self.mutator_steps_aggregate + 1): - with tf.GradientTape() as tape: - x, y = next(valid_loader) - self.mutator.reset() - - logits = self.model(x, training=False) - metrics = self.metrics(y, logits) - reward = ( - self.reward_function(y, logits) - + self.entropy_weight * self.mutator.sample_entropy - ) - self.baseline = self.baseline * self.baseline_decay + reward * ( - 1 - self.baseline_decay - ) - loss = self.mutator.sample_log_prob * (reward - self.baseline) - loss += self.skip_weight * self.mutator.sample_skip_penalty - - meters.update( - { - "reward": reward, - "loss": tf.reduce_mean(loss).numpy(), - "ent": self.mutator.sample_entropy.numpy(), - "log_prob": self.mutator.sample_log_prob.numpy(), - "baseline": self.baseline, - "skip": self.mutator.sample_skip_penalty, - } - ) - - cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate - if self.log_frequency and cur_step % self.log_frequency == 0: - logger.info( - "RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", - epoch + 1, - self.num_epochs, - mutator_step, - self.mutator_steps, - step, - self.mutator_steps_aggregate, - meters, - ) - - grads = tape.gradient(loss, self.mutator.trainable_weights) - grads = fill_zero_grads(grads, self.mutator.trainable_weights) - grads_list.append(grads) - total_grads = [ - tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list) - ] - total_grads, _ = tf.clip_by_global_norm(total_grads, 5.0) - self.mutator_optim.apply_gradients( - zip(total_grads, self.mutator.trainable_weights) - ) - - def validate_one_epoch(self, epoch): - test_loader = self._create_validate_loader() - - for arc_id in range(self.test_arc_per_epoch): - meters = AverageMeterGroup() - for x, y in test_loader: - self.mutator.reset() - logits = self.model(x, training=False) - if isinstance(logits, tuple): - logits, _ = logits - metrics = self.metrics(y, logits) - loss = self.loss(y, logits) - metrics["loss"] = tf.reduce_mean(loss).numpy() - meters.update(metrics) - - logger.info( - "Test Epoch [%d/%d] Arc [%d/%d] Summary %s", - epoch + 1, - self.num_epochs, - arc_id + 1, - self.test_arc_per_epoch, - meters.summary(), - ) - - def _create_train_loader(self): - train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size) - test_set = self.valid_set.shuffle(1000000).repeat().batch(self.batch_size) - return iter(train_set), iter(test_set) - - def _create_validate_loader(self): - return iter(self.test_set.shuffle(1000000).batch(self.batch_size)) diff --git a/nni/retiarii/evaluator/functional.py b/nni/nas/evaluator/functional.py similarity index 100% rename from nni/retiarii/evaluator/functional.py rename to nni/nas/evaluator/functional.py diff --git a/nni/retiarii/evaluator/pytorch/cgo/evaluator.py b/nni/nas/evaluator/pytorch/cgo/evaluator.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/cgo/evaluator.py 
rename to nni/nas/evaluator/pytorch/cgo/evaluator.py diff --git a/nni/retiarii/evaluator/pytorch/cgo/trainer.py b/nni/nas/evaluator/pytorch/cgo/trainer.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/cgo/trainer.py rename to nni/nas/evaluator/pytorch/cgo/trainer.py diff --git a/nni/retiarii/evaluator/pytorch/lightning.py b/nni/nas/evaluator/pytorch/lightning.py similarity index 100% rename from nni/retiarii/evaluator/pytorch/lightning.py rename to nni/nas/evaluator/pytorch/lightning.py diff --git a/nni/retiarii/execution/api.py b/nni/nas/execution/api.py similarity index 100% rename from nni/retiarii/execution/api.py rename to nni/nas/execution/api.py diff --git a/nni/retiarii/execution/interface.py b/nni/nas/execution/common/engine.py similarity index 100% rename from nni/retiarii/execution/interface.py rename to nni/nas/execution/common/engine.py diff --git a/nni/retiarii/graph.py b/nni/nas/execution/common/graph.py similarity index 100% rename from nni/retiarii/graph.py rename to nni/nas/execution/common/graph.py diff --git a/nni/retiarii/operation.py b/nni/nas/execution/common/graph_op.py similarity index 100% rename from nni/retiarii/operation.py rename to nni/nas/execution/common/graph_op.py diff --git a/nni/retiarii/integration.py b/nni/nas/execution/common/integration.py similarity index 100% rename from nni/retiarii/integration.py rename to nni/nas/execution/common/integration.py diff --git a/nni/retiarii/integration_api.py b/nni/nas/execution/common/integration_api.py similarity index 100% rename from nni/retiarii/integration_api.py rename to nni/nas/execution/common/integration_api.py diff --git a/nni/retiarii/execution/listener.py b/nni/nas/execution/common/listener.py similarity index 100% rename from nni/retiarii/execution/listener.py rename to nni/nas/execution/common/listener.py diff --git a/nni/retiarii/execution/utils.py b/nni/nas/execution/common/utils.py similarity index 100% rename from nni/retiarii/execution/utils.py rename to nni/nas/execution/common/utils.py diff --git a/nni/retiarii/execution/benchmark.py b/nni/nas/execution/pytorch/benchmark.py similarity index 100% rename from nni/retiarii/execution/benchmark.py rename to nni/nas/execution/pytorch/benchmark.py diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/nas/execution/pytorch/cgo.py similarity index 100% rename from nni/retiarii/execution/cgo_engine.py rename to nni/nas/execution/pytorch/cgo.py diff --git a/nni/retiarii/execution/logical_optimizer/interface.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/interface.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/interface.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/interface.py diff --git a/nni/retiarii/execution/logical_optimizer/logical_plan.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/logical_plan.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/logical_plan.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/logical_plan.py diff --git a/nni/retiarii/execution/logical_optimizer/opt_dedup_input.py b/nni/nas/execution/pytorch/cgo/logical_optimizer/opt_dedup_input.py similarity index 100% rename from nni/retiarii/execution/logical_optimizer/opt_dedup_input.py rename to nni/nas/execution/pytorch/cgo/logical_optimizer/opt_dedup_input.py diff --git a/nni/retiarii/codegen/pytorch.py b/nni/nas/execution/pytorch/codegen.py similarity index 100% rename from nni/retiarii/codegen/pytorch.py rename to nni/nas/execution/pytorch/codegen.py diff 
--git a/nni/retiarii/converter/graph_gen.py b/nni/nas/execution/pytorch/converter/graph_gen.py similarity index 100% rename from nni/retiarii/converter/graph_gen.py rename to nni/nas/execution/pytorch/converter/graph_gen.py diff --git a/nni/retiarii/converter/op_types.py b/nni/nas/execution/pytorch/converter/op_types.py similarity index 100% rename from nni/retiarii/converter/op_types.py rename to nni/nas/execution/pytorch/converter/op_types.py diff --git a/nni/retiarii/converter/utils.py b/nni/nas/execution/pytorch/converter/utils.py similarity index 100% rename from nni/retiarii/converter/utils.py rename to nni/nas/execution/pytorch/converter/utils.py diff --git a/nni/retiarii/converter/visualize.py b/nni/nas/execution/pytorch/converter/visualize.py similarity index 100% rename from nni/retiarii/converter/visualize.py rename to nni/nas/execution/pytorch/converter/visualize.py diff --git a/nni/retiarii/execution/base.py b/nni/nas/execution/pytorch/graph.py similarity index 100% rename from nni/retiarii/execution/base.py rename to nni/nas/execution/pytorch/graph.py diff --git a/nni/retiarii/operation_def/torch_op_def.py b/nni/nas/execution/pytorch/op_def.py similarity index 100% rename from nni/retiarii/operation_def/torch_op_def.py rename to nni/nas/execution/pytorch/op_def.py diff --git a/nni/retiarii/execution/python.py b/nni/nas/execution/pytorch/simplified.py similarity index 100% rename from nni/retiarii/execution/python.py rename to nni/nas/execution/pytorch/simplified.py diff --git a/nni/retiarii/operation_def/tf_op_def.py b/nni/nas/execution/tensorflow/op_def.py similarity index 100% rename from nni/retiarii/operation_def/tf_op_def.py rename to nni/nas/execution/tensorflow/op_def.py diff --git a/nni/retiarii/trial_entry.py b/nni/nas/execution/trial_entry.py similarity index 100% rename from nni/retiarii/trial_entry.py rename to nni/nas/execution/trial_entry.py diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/nas/experiment/config/engine_config.py similarity index 100% rename from nni/retiarii/experiment/config/engine_config.py rename to nni/nas/experiment/config/engine_config.py diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py similarity index 100% rename from nni/retiarii/experiment/config/experiment_config.py rename to nni/nas/experiment/config/experiment_config.py diff --git a/nni/retiarii/experiment/pytorch.py b/nni/nas/experiment/pytorch.py similarity index 100% rename from nni/retiarii/experiment/pytorch.py rename to nni/nas/experiment/pytorch.py diff --git a/nni/retiarii/fixed.py b/nni/nas/fixed.py similarity index 100% rename from nni/retiarii/fixed.py rename to nni/nas/fixed.py diff --git a/nni/retiarii/hub/pytorch/autoformer.py b/nni/nas/hub/pytorch/autoformer.py similarity index 100% rename from nni/retiarii/hub/pytorch/autoformer.py rename to nni/nas/hub/pytorch/autoformer.py diff --git a/nni/retiarii/hub/pytorch/mobilenetv3.py b/nni/nas/hub/pytorch/mobilenetv3.py similarity index 100% rename from nni/retiarii/hub/pytorch/mobilenetv3.py rename to nni/nas/hub/pytorch/mobilenetv3.py diff --git a/nni/retiarii/nn/pytorch/hypermodule.py b/nni/nas/hub/pytorch/modules/autoactivation.py similarity index 100% rename from nni/retiarii/nn/pytorch/hypermodule.py rename to nni/nas/hub/pytorch/modules/autoactivation.py diff --git a/nni/retiarii/nn/pytorch/nasbench101.py b/nni/nas/hub/pytorch/modules/nasbench101.py similarity index 100% rename from nni/retiarii/nn/pytorch/nasbench101.py rename to 
nni/nas/hub/pytorch/modules/nasbench101.py diff --git a/nni/nas/hub/pytorch/modules/nasbench201.py b/nni/nas/hub/pytorch/modules/nasbench201.py new file mode 100644 index 000000000..dc9411fdb --- /dev/null +++ b/nni/nas/hub/pytorch/modules/nasbench201.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +__all__ = ['NasBench201Cell'] + +from collections import OrderedDict +from typing import Callable, List, Dict, Union, Optional + +import torch +import torch.nn as nn + +from nni.nas.nn.pytorch import LayerChoice +from nni.nas.nn.pytorch.mutation_utils import generate_new_label + + +class NasBench201Cell(nn.Module): + """ + Cell structure that is proposed in NAS-Bench-201. + + Proposed by `NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search `__. + + This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is tensor. + For every i < j, there is an edge from i-th node to j-th node. + Each edge in this DAG is associated with an operation transforming the hidden state from the source node + to the target node. All possible operations are selected from a predefined operation set, defined in ``op_candidates``. + Each of the ``op_candidates`` should be a callable that accepts input dimension and output dimension, + and returns a ``Module``. + + Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. For example, + + The space size of this cell would be :math:`|op|^{N(N-1)/2}`, where :math:`|op|` is the number of operation candidates, + and :math:`N` is defined by ``num_tensors``. + + Parameters + ---------- + op_candidates : list of callable + Operation candidates. Each should be a function accepts input feature and output feature, returning nn.Module. + in_features : int + Input dimension of cell. + out_features : int + Output dimension of cell. + num_tensors : int + Number of tensors in the cell (input included). Default: 4 + label : str + Identifier of the cell. Cell sharing the same label will semantically share the same choice. + """ + + @staticmethod + def _make_dict(x): + if isinstance(x, list): + return OrderedDict([(str(i), t) for i, t in enumerate(x)]) + return OrderedDict(x) + + def __init__(self, op_candidates: Union[Dict[str, Callable[[int, int], nn.Module]], List[Callable[[int, int], nn.Module]]], + in_features: int, out_features: int, num_tensors: int = 4, + label: Optional[str] = None): + super().__init__() + self._label = generate_new_label(label) + + self.layers = nn.ModuleList() + self.in_features = in_features + self.out_features = out_features + self.num_tensors = num_tensors + + op_candidates = self._make_dict(op_candidates) + + for tid in range(1, num_tensors): + node_ops = nn.ModuleList() + for j in range(tid): + inp = in_features if j == 0 else out_features + op_choices = OrderedDict([(key, cls(inp, out_features)) + for key, cls in op_candidates.items()]) + node_ops.append(LayerChoice(op_choices, label=f'{self._label}__{j}_{tid}')) # put __ here to be compatible with base engine + self.layers.append(node_ops) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + The forward of input choice is simply selecting first on all choices. + It shouldn't be called directly by users in most cases. 
+ """ + tensors: List[torch.Tensor] = [inputs] + for layer in self.layers: + current_tensor: List[torch.Tensor] = [] + for i, op in enumerate(layer): # type: ignore + current_tensor.append(op(tensors[i])) # type: ignore + tensors.append(torch.sum(torch.stack(current_tensor), 0)) + return tensors[-1] diff --git a/nni/retiarii/hub/pytorch/nasbench101.py b/nni/nas/hub/pytorch/nasbench101.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasbench101.py rename to nni/nas/hub/pytorch/nasbench101.py diff --git a/nni/retiarii/hub/pytorch/nasbench201.py b/nni/nas/hub/pytorch/nasbench201.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasbench201.py rename to nni/nas/hub/pytorch/nasbench201.py diff --git a/nni/retiarii/hub/pytorch/nasnet.py b/nni/nas/hub/pytorch/nasnet.py similarity index 100% rename from nni/retiarii/hub/pytorch/nasnet.py rename to nni/nas/hub/pytorch/nasnet.py diff --git a/nni/retiarii/hub/pytorch/proxylessnas.py b/nni/nas/hub/pytorch/proxylessnas.py similarity index 100% rename from nni/retiarii/hub/pytorch/proxylessnas.py rename to nni/nas/hub/pytorch/proxylessnas.py diff --git a/nni/retiarii/hub/pytorch/shufflenet.py b/nni/nas/hub/pytorch/shufflenet.py similarity index 100% rename from nni/retiarii/hub/pytorch/shufflenet.py rename to nni/nas/hub/pytorch/shufflenet.py diff --git a/nni/retiarii/hub/pytorch/utils/fixed.py b/nni/nas/hub/pytorch/utils/fixed.py similarity index 100% rename from nni/retiarii/hub/pytorch/utils/fixed.py rename to nni/nas/hub/pytorch/utils/fixed.py diff --git a/nni/retiarii/hub/pytorch/utils/pretrained.py b/nni/nas/hub/pytorch/utils/pretrained.py similarity index 100% rename from nni/retiarii/hub/pytorch/utils/pretrained.py rename to nni/nas/hub/pytorch/utils/pretrained.py diff --git a/nni/retiarii/mutator.py b/nni/nas/mutable/mutator.py similarity index 100% rename from nni/retiarii/mutator.py rename to nni/nas/mutable/mutator.py diff --git a/nni/retiarii/nn/pytorch/cell.py b/nni/nas/nn/pytorch/cell.py similarity index 100% rename from nni/retiarii/nn/pytorch/cell.py rename to nni/nas/nn/pytorch/cell.py diff --git a/nni/retiarii/nn/pytorch/api.py b/nni/nas/nn/pytorch/choice.py similarity index 100% rename from nni/retiarii/nn/pytorch/api.py rename to nni/nas/nn/pytorch/choice.py diff --git a/nni/retiarii/nn/pytorch/nn.py b/nni/nas/nn/pytorch/layers.py similarity index 100% rename from nni/retiarii/nn/pytorch/nn.py rename to nni/nas/nn/pytorch/layers.py diff --git a/nni/retiarii/nn/pytorch/mutation_utils.py b/nni/nas/nn/pytorch/mutation_utils.py similarity index 100% rename from nni/retiarii/nn/pytorch/mutation_utils.py rename to nni/nas/nn/pytorch/mutation_utils.py diff --git a/nni/retiarii/nn/pytorch/mutator.py b/nni/nas/nn/pytorch/mutator.py similarity index 100% rename from nni/retiarii/nn/pytorch/mutator.py rename to nni/nas/nn/pytorch/mutator.py diff --git a/nni/retiarii/nn/pytorch/component.py b/nni/nas/nn/pytorch/repeat.py similarity index 61% rename from nni/retiarii/nn/pytorch/component.py rename to nni/nas/nn/pytorch/repeat.py index 3340116f4..6168d947a 100644 --- a/nni/retiarii/nn/pytorch/component.py +++ b/nni/nas/nn/pytorch/repeat.py @@ -3,21 +3,17 @@ import copy import warnings -from collections import OrderedDict -from typing import Callable, List, Dict, Union, Tuple, Optional +from typing import Callable, List, Union, Tuple, Optional -import torch import torch.nn as nn -from nni.retiarii.utils import NoContextError, STATE_DICT_PY_MAPPING_PARTIAL +from nni.nas.utils import NoContextError, 
STATE_DICT_PY_MAPPING_PARTIAL -from .api import LayerChoice, ValueChoice, ValueChoiceX, ChoiceOf -from .cell import Cell -from .nasbench101 import NasBench101Cell, NasBench101Mutator -from .mutation_utils import Mutable, generate_new_label, get_fixed_value +from .choice import ValueChoice, ValueChoiceX, ChoiceOf +from .mutation_utils import Mutable, get_fixed_value -__all__ = ['Repeat', 'Cell', 'NasBench101Cell', 'NasBench101Mutator', 'NasBench201Cell'] +__all__ = ['Repeat'] class Repeat(Mutable): @@ -159,77 +155,3 @@ class Repeat(Mutable): def __len__(self): return self.max_depth - - -class NasBench201Cell(nn.Module): - """ - Cell structure that is proposed in NAS-Bench-201. - - Proposed by `NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search `__. - - This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is tensor. - For every i < j, there is an edge from i-th node to j-th node. - Each edge in this DAG is associated with an operation transforming the hidden state from the source node - to the target node. All possible operations are selected from a predefined operation set, defined in ``op_candidates``. - Each of the ``op_candidates`` should be a callable that accepts input dimension and output dimension, - and returns a ``Module``. - - Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. For example, - - The space size of this cell would be :math:`|op|^{N(N-1)/2}`, where :math:`|op|` is the number of operation candidates, - and :math:`N` is defined by ``num_tensors``. - - Parameters - ---------- - op_candidates : list of callable - Operation candidates. Each should be a function accepts input feature and output feature, returning nn.Module. - in_features : int - Input dimension of cell. - out_features : int - Output dimension of cell. - num_tensors : int - Number of tensors in the cell (input included). Default: 4 - label : str - Identifier of the cell. Cell sharing the same label will semantically share the same choice. - """ - - @staticmethod - def _make_dict(x): - if isinstance(x, list): - return OrderedDict([(str(i), t) for i, t in enumerate(x)]) - return OrderedDict(x) - - def __init__(self, op_candidates: Union[Dict[str, Callable[[int, int], nn.Module]], List[Callable[[int, int], nn.Module]]], - in_features: int, out_features: int, num_tensors: int = 4, - label: Optional[str] = None): - super().__init__() - self._label = generate_new_label(label) - - self.layers = nn.ModuleList() - self.in_features = in_features - self.out_features = out_features - self.num_tensors = num_tensors - - op_candidates = self._make_dict(op_candidates) - - for tid in range(1, num_tensors): - node_ops = nn.ModuleList() - for j in range(tid): - inp = in_features if j == 0 else out_features - op_choices = OrderedDict([(key, cls(inp, out_features)) - for key, cls in op_candidates.items()]) - node_ops.append(LayerChoice(op_choices, label=f'{self._label}__{j}_{tid}')) # put __ here to be compatible with base engine - self.layers.append(node_ops) - - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - """ - The forward of input choice is simply selecting first on all choices. - It shouldn't be called directly by users in most cases. 
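# Minimal sketch of instantiating the NasBench201Cell that this diff moves into
# nni/nas/hub/pytorch/modules/nasbench201.py. The candidate names and channel sizes
# are assumptions; each candidate is a callable (in_features, out_features) -> nn.Module.
import torch.nn as nn
from nni.nas.hub.pytorch.modules.nasbench201 import NasBench201Cell

op_candidates = {
    'conv1x1': lambda cin, cout: nn.Conv2d(cin, cout, 1),
    'conv3x3': lambda cin, cout: nn.Conv2d(cin, cout, 3, padding=1),
    'avgpool3x3': lambda cin, cout: nn.Sequential(nn.Conv2d(cin, cout, 1),
                                                  nn.AvgPool2d(3, stride=1, padding=1)),
}
# num_tensors=4 gives 6 edges, so the space size is |op| ** 6 as stated in the docstring.
cell = NasBench201Cell(op_candidates, in_features=16, out_features=16, num_tensors=4)
# The cell is used as a regular submodule inside a larger model space; the operator
# chosen for each edge is decided by whichever NAS strategy explores the space.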
- """ - tensors: List[torch.Tensor] = [inputs] - for layer in self.layers: - current_tensor: List[torch.Tensor] = [] - for i, op in enumerate(layer): # type: ignore - current_tensor.append(op(tensors[i])) # type: ignore - tensors.append(torch.sum(torch.stack(current_tensor), 0)) - return tensors[-1] diff --git a/nni/retiarii/nn/tensorflow/api.py b/nni/nas/nn/tensorflow/api.py similarity index 100% rename from nni/retiarii/nn/tensorflow/api.py rename to nni/nas/nn/tensorflow/api.py diff --git a/nni/retiarii/oneshot/pytorch/base_lightning.py b/nni/nas/oneshot/pytorch/base_lightning.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/base_lightning.py rename to nni/nas/oneshot/pytorch/base_lightning.py diff --git a/nni/retiarii/oneshot/pytorch/dataloader.py b/nni/nas/oneshot/pytorch/dataloader.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/dataloader.py rename to nni/nas/oneshot/pytorch/dataloader.py diff --git a/nni/retiarii/oneshot/pytorch/differentiable.py b/nni/nas/oneshot/pytorch/differentiable.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/differentiable.py rename to nni/nas/oneshot/pytorch/differentiable.py diff --git a/nni/nas/oneshot/pytorch/enas.py b/nni/nas/oneshot/pytorch/enas.py new file mode 100644 index 000000000..7398e4072 --- /dev/null +++ b/nni/nas/oneshot/pytorch/enas.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from typing import cast + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class StackedLSTMCell(nn.Module): + def __init__(self, layers, size, bias): + super().__init__() + self.lstm_num_layers = layers + self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) + for _ in range(self.lstm_num_layers)]) + + def forward(self, inputs, hidden): + prev_h, prev_c = hidden + next_h, next_c = [], [] + for i, m in enumerate(self.lstm_modules): + curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) + next_c.append(curr_c) + next_h.append(curr_h) + # current implementation only supports batch size equals 1, + # but the algorithm does not necessarily have this limitation + inputs = curr_h[-1].view(1, -1) + return next_h, next_c + + +class ReinforceField: + """ + A field with ``name``, with ``total`` choices. ``choose_one`` is true if one and only one is meant to be + selected. Otherwise, any number of choices can be chosen. + """ + + def __init__(self, name, total, choose_one): + self.name = name + self.total = total + self.choose_one = choose_one + + def __repr__(self): + return f'ReinforceField(name={self.name}, total={self.total}, choose_one={self.choose_one})' + + +class ReinforceController(nn.Module): + """ + A controller that mutates the graph with RL. + + Parameters + ---------- + fields : list of ReinforceField + List of fields to choose. + lstm_size : int + Controller LSTM hidden units. + lstm_num_layers : int + Number of layers for stacked LSTM. + tanh_constant : float + Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. + skip_target : float + Target probability that skipconnect (chosen by InputChoice) will appear. + If the chosen number of inputs is away from the ``skip_connect``, there will be + a sample skip penalty which is a KL divergence added. + temperature : float + Temperature constant that divides the logits. + entropy_reduction : str + Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. 
+ """ + + def __init__(self, fields, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, + skip_target=0.4, temperature=None, entropy_reduction='sum'): + super(ReinforceController, self).__init__() + self.fields = fields + self.lstm_size = lstm_size + self.lstm_num_layers = lstm_num_layers + self.tanh_constant = tanh_constant + self.temperature = temperature + self.skip_target = skip_target + + self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) + self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) + self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) + self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) + self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) + self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), # pylint: disable=not-callable + requires_grad=False) + assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' + self.entropy_reduction = torch.sum if entropy_reduction == 'sum' else torch.mean + self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') + self.soft = nn.ModuleDict({ + field.name: nn.Linear(self.lstm_size, field.total, bias=False) for field in fields + }) + self.embedding = nn.ModuleDict({ + field.name: nn.Embedding(field.total, self.lstm_size) for field in fields + }) + + def resample(self): + self._initialize() + result = dict() + for field in self.fields: + result[field.name] = self._sample_single(field) + return result + + def _initialize(self): + self._inputs = self.g_emb.data + self._c = [torch.zeros((1, self.lstm_size), + dtype=self._inputs.dtype, + device=self._inputs.device) for _ in range(self.lstm_num_layers)] + self._h = [torch.zeros((1, self.lstm_size), + dtype=self._inputs.dtype, + device=self._inputs.device) for _ in range(self.lstm_num_layers)] + self.sample_log_prob: torch.Tensor = cast(torch.Tensor, 0) + self.sample_entropy: torch.Tensor = cast(torch.Tensor, 0) + self.sample_skip_penalty: torch.Tensor = cast(torch.Tensor, 0) + + def _lstm_next_step(self): + self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) + + def _sample_single(self, field): + self._lstm_next_step() + logit = self.soft[field.name](self._h[-1]) + if self.temperature is not None: + logit /= self.temperature + if self.tanh_constant is not None: + logit = self.tanh_constant * torch.tanh(logit) + if field.choose_one: + sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) + log_prob = self.cross_entropy_loss(logit, sampled) + self._inputs = self.embedding[field.name](sampled) + else: + logit = logit.view(-1, 1) + logit = torch.cat([-logit, logit], 1) # pylint: disable=invalid-unary-operand-type + sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) + skip_prob = torch.sigmoid(logit) + kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) + self.sample_skip_penalty += kl + log_prob = self.cross_entropy_loss(logit, sampled) + sampled = sampled.nonzero().view(-1) + if sampled.sum().item(): + self._inputs = (torch.sum(self.embedding[field.name](sampled.view(-1)), 0) / (1. 
+ torch.sum(sampled))).unsqueeze(0) + else: + self._inputs = torch.zeros(1, self.lstm_size, device=self.embedding[field.name].weight.device) # type: ignore + + sampled = sampled.detach().cpu().numpy().tolist() + self.sample_log_prob += self.entropy_reduction(log_prob) + entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type + self.sample_entropy += self.entropy_reduction(entropy) + if len(sampled) == 1: + sampled = sampled[0] + return sampled diff --git a/nni/retiarii/oneshot/pytorch/sampling.py b/nni/nas/oneshot/pytorch/sampling.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/sampling.py rename to nni/nas/oneshot/pytorch/sampling.py diff --git a/nni/retiarii/oneshot/pytorch/strategy.py b/nni/nas/oneshot/pytorch/strategy.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/strategy.py rename to nni/nas/oneshot/pytorch/strategy.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py b/nni/nas/oneshot/pytorch/supermodule/_operation_utils.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py rename to nni/nas/oneshot/pytorch/supermodule/_operation_utils.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py b/nni/nas/oneshot/pytorch/supermodule/_singlepathnas.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py rename to nni/nas/oneshot/pytorch/supermodule/_singlepathnas.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py b/nni/nas/oneshot/pytorch/supermodule/_valuechoice_utils.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py rename to nni/nas/oneshot/pytorch/supermodule/_valuechoice_utils.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/base.py b/nni/nas/oneshot/pytorch/supermodule/base.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/base.py rename to nni/nas/oneshot/pytorch/supermodule/base.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/differentiable.py b/nni/nas/oneshot/pytorch/supermodule/differentiable.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/differentiable.py rename to nni/nas/oneshot/pytorch/supermodule/differentiable.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/operation.py b/nni/nas/oneshot/pytorch/supermodule/operation.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/operation.py rename to nni/nas/oneshot/pytorch/supermodule/operation.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/proxyless.py b/nni/nas/oneshot/pytorch/supermodule/proxyless.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/proxyless.py rename to nni/nas/oneshot/pytorch/supermodule/proxyless.py diff --git a/nni/retiarii/oneshot/pytorch/supermodule/sampling.py b/nni/nas/oneshot/pytorch/supermodule/sampling.py similarity index 100% rename from nni/retiarii/oneshot/pytorch/supermodule/sampling.py rename to nni/nas/oneshot/pytorch/supermodule/sampling.py diff --git a/nni/nas/pytorch/__init__.py b/nni/nas/pytorch/__init__.py deleted file mode 100644 index 927935baf..000000000 --- a/nni/nas/pytorch/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
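# Sketch of driving the ReinforceController added above. The field names, sizes,
# reward and skip weight are assumptions; in practice the fields mirror the
# LayerChoice/InputChoice decisions discovered in the model space.
from nni.nas.oneshot.pytorch.enas import ReinforceController, ReinforceField

fields = [
    ReinforceField('op_1', total=4, choose_one=True),     # pick exactly one of 4 operators
    ReinforceField('skip_2', total=2, choose_one=False),  # pick any subset of 2 inputs
]
ctrl = ReinforceController(fields, lstm_size=64, entropy_reduction='sum')
sample = ctrl.resample()                                  # e.g. {'op_1': 2, 'skip_2': [0, 1]}
reward = 0.5                                              # placeholder architecture reward
baseline = 0.0                                            # a real trainer keeps a moving baseline
# sample_log_prob accumulates cross-entropy terms, so this mirrors the trainer convention above.
loss = ctrl.sample_log_prob * (reward - baseline) + 0.8 * ctrl.sample_skip_penalty
loss.backward()                                           # REINFORCE gradient for the controller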
- -from .base_mutator import BaseMutator -from .base_trainer import BaseTrainer -from .fixed import apply_fixed_architecture -from .mutables import Mutable, LayerChoice, InputChoice -from .mutator import Mutator -from .trainer import Trainer diff --git a/nni/nas/pytorch/base_mutator.py b/nni/nas/pytorch/base_mutator.py deleted file mode 100644 index df1a5f9ba..000000000 --- a/nni/nas/pytorch/base_mutator.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import torch.nn as nn -from nni.nas.pytorch.mutables import Mutable, MutableScope, InputChoice -from nni.nas.pytorch.utils import StructuredMutableTreeNode - -logger = logging.getLogger(__name__) - - -class BaseMutator(nn.Module): - """ - A mutator is responsible for mutating a graph by obtaining the search space from the network and implementing - callbacks that are called in ``forward`` in mutables. - - Parameters - ---------- - model : nn.Module - PyTorch model to apply mutator on. - """ - - def __init__(self, model): - super().__init__() - self.__dict__["model"] = model - self._structured_mutables = self._parse_search_space(self.model) - - def _parse_search_space(self, module, root=None, prefix="", memo=None, nested_detection=None): - if memo is None: - memo = set() - if root is None: - root = StructuredMutableTreeNode(None) - if module not in memo: - memo.add(module) - if isinstance(module, Mutable): - if nested_detection is not None: - raise RuntimeError("Cannot have nested search space. Error at {} in {}" - .format(module, nested_detection)) - module.name = prefix - module.set_mutator(self) - root = root.add_child(module) - if not isinstance(module, MutableScope): - nested_detection = module - if isinstance(module, InputChoice): - for k in module.choose_from: - if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: - raise RuntimeError("'{}' required by '{}' not found in keys that appeared before, and is not NO_KEY." - .format(k, module.key)) - for name, submodule in module._modules.items(): - if submodule is None: - continue - submodule_prefix = prefix + ("." if prefix else "") + name - self._parse_search_space(submodule, root, submodule_prefix, memo=memo, - nested_detection=nested_detection) - return root - - @property - def mutables(self): - """ - A generator of all modules inheriting :class:`~nni.nas.pytorch.mutables.Mutable`. - Modules are yielded in the order that they are defined in ``__init__``. - For mutables with their keys appearing multiple times, only the first one will appear. - """ - return self._structured_mutables - - @property - def undedup_mutables(self): - return self._structured_mutables.traverse(deduplicate=False) - - def forward(self, *inputs): - """ - Warnings - -------- - Don't call forward of a mutator. - """ - raise RuntimeError("Forward is undefined for mutators.") - - def __setattr__(self, name, value): - if name == "model": - raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " - "include you network, as it will include all parameters in model into the mutator.") - return super().__setattr__(name, value) - - def enter_mutable_scope(self, mutable_scope): - """ - Callback when forward of a MutableScope is entered. - - Parameters - ---------- - mutable_scope : MutableScope - The mutable scope that is entered. - """ - pass - - def exit_mutable_scope(self, mutable_scope): - """ - Callback when forward of a MutableScope is exited. 
- - Parameters - ---------- - mutable_scope : MutableScope - The mutable scope that is exited. - """ - pass - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - Callbacks of forward in LayerChoice. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.LayerChoice - Module whose forward is called. - args : list of torch.Tensor - The arguments of its forward function. - kwargs : dict - The keyword arguments of its forward function. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output tensor and mask. - """ - raise NotImplementedError - - def on_forward_input_choice(self, mutable, tensor_list): - """ - Callbacks of forward in InputChoice. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.InputChoice - Mutable that is called. - tensor_list : list of torch.Tensor - The arguments mutable is called with. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output tensor and mask. - """ - raise NotImplementedError - - def export(self): - """ - Export the data of all decisions. This should output the decisions of all the mutables, so that the whole - network can be fully determined with these decisions for further training from scratch. - - Returns - ------- - dict - Mappings from mutable keys to decisions. - """ - raise NotImplementedError diff --git a/nni/nas/pytorch/base_trainer.py b/nni/nas/pytorch/base_trainer.py deleted file mode 100644 index 2e7a4a2a2..000000000 --- a/nni/nas/pytorch/base_trainer.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from abc import ABC, abstractmethod - - -class BaseTrainer(ABC): - - @abstractmethod - def train(self): - """ - Override the method to train. - """ - raise NotImplementedError - - @abstractmethod - def validate(self): - """ - Override the method to validate. - """ - raise NotImplementedError - - @abstractmethod - def export(self, file): - """ - Override the method to export to file. - - Parameters - ---------- - file : str - File path to export to. - """ - raise NotImplementedError - - @abstractmethod - def checkpoint(self): - """ - Override to dump a checkpoint. - """ - raise NotImplementedError diff --git a/nni/nas/pytorch/callbacks.py b/nni/nas/pytorch/callbacks.py deleted file mode 100644 index 86a0dc380..000000000 --- a/nni/nas/pytorch/callbacks.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import os - -import torch -import torch.nn as nn - -_logger = logging.getLogger(__name__) - - -class Callback: - """ - Callback provides an easy way to react to events like begin/end of epochs. - """ - - def __init__(self): - self.model = None - self.mutator = None - self.trainer = None - - def build(self, model, mutator, trainer): - """ - Callback needs to be built with model, mutator, trainer, to get updates from them. - - Parameters - ---------- - model : nn.Module - Model to be trained. - mutator : nn.Module - Mutator that mutates the model. - trainer : BaseTrainer - Trainer that is to call the callback. - """ - self.model = model - self.mutator = mutator - self.trainer = trainer - - def on_epoch_begin(self, epoch): - """ - Implement this to do something at the begin of epoch. - - Parameters - ---------- - epoch : int - Epoch number, starting from 0. - """ - pass - - def on_epoch_end(self, epoch): - """ - Implement this to do something at the end of epoch. - - Parameters - ---------- - epoch : int - Epoch number, starting from 0. 
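# Sketch of the Callback hooks described above, written against the legacy
# nni.nas.pytorch API that this diff removes. The callback name and logging
# behaviour are illustrative assumptions.
import logging
from nni.nas.pytorch.callbacks import Callback

class ExportOnEpochEnd(Callback):
    def on_epoch_end(self, epoch):
        # self.model / self.mutator / self.trainer are injected via build()
        logging.getLogger(__name__).info(
            "epoch %d architecture: %s", epoch, self.mutator.export())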
- """ - pass - - def on_batch_begin(self, epoch): - pass - - def on_batch_end(self, epoch): - pass - - -class LRSchedulerCallback(Callback): - """ - Calls scheduler on every epoch ends. - - Parameters - ---------- - scheduler : LRScheduler - Scheduler to be called. - """ - def __init__(self, scheduler, mode="epoch"): - super().__init__() - assert mode == "epoch" - self.scheduler = scheduler - self.mode = mode - - def on_epoch_end(self, epoch): - """ - Call ``self.scheduler.step()`` on epoch end. - """ - self.scheduler.step() - - -class ArchitectureCheckpoint(Callback): - """ - Calls ``trainer.export()`` on every epoch ends. - - Parameters - ---------- - checkpoint_dir : str - Location to save checkpoints. - """ - def __init__(self, checkpoint_dir): - super().__init__() - self.checkpoint_dir = checkpoint_dir - os.makedirs(self.checkpoint_dir, exist_ok=True) - - def on_epoch_end(self, epoch): - """ - Dump to ``/checkpoint_dir/epoch_{number}.json`` on epoch end. - """ - dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch)) - _logger.info("Saving architecture to %s", dest_path) - self.trainer.export(dest_path) - - -class ModelCheckpoint(Callback): - """ - Calls ``trainer.export()`` on every epoch ends. - - Parameters - ---------- - checkpoint_dir : str - Location to save checkpoints. - """ - def __init__(self, checkpoint_dir): - super().__init__() - self.checkpoint_dir = checkpoint_dir - os.makedirs(self.checkpoint_dir, exist_ok=True) - - def on_epoch_end(self, epoch): - """ - Dump to ``/checkpoint_dir/epoch_{number}.pth.tar`` on every epoch end. - ``DataParallel`` object will have their inside modules exported. - """ - if isinstance(self.model, nn.DataParallel): - state_dict = self.model.module.state_dict() - else: - state_dict = self.model.state_dict() - dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.pth.tar".format(epoch)) - _logger.info("Saving model to %s", dest_path) - torch.save(state_dict, dest_path) diff --git a/nni/nas/pytorch/fixed.py b/nni/nas/pytorch/fixed.py deleted file mode 100644 index 9bfa933e8..000000000 --- a/nni/nas/pytorch/fixed.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging - -from .mutables import InputChoice, LayerChoice, MutableScope -from .mutator import Mutator -from .utils import to_list - - -_logger = logging.getLogger(__name__) - - -class FixedArchitecture(Mutator): - """ - Fixed architecture mutator that always selects a certain graph. - - Parameters - ---------- - model : nn.Module - A mutable network. - fixed_arc : dict - Preloaded architecture object. - strict : bool - Force everything that appears in ``fixed_arc`` to be used at least once. 
- verbose : bool - Print log messages if set to True - """ - - def __init__(self, model, fixed_arc, strict=True, verbose=True): - super().__init__(model) - self._fixed_arc = fixed_arc - self.verbose = verbose - - mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)]) - fixed_arc_keys = set(self._fixed_arc.keys()) - if fixed_arc_keys - mutable_keys: - raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys)) - if mutable_keys - fixed_arc_keys: - raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys)) - self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc) - - def _from_human_readable_architecture(self, human_arc): - # convert from an exported architecture - result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc. - # First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"}, - # which means {"op1": [0, ]} ir {"op1": ["conv", ]} - result_arc = {k: v if isinstance(v, list) else [v] for k, v in result_arc.items()} - # Second, infer which ones are multi-hot arrays and which ones are in human-readable format. - # This is non-trivial, since if an array in [0, 1], we cannot know for sure it means [false, true] or [true, true]. - # Here, we assume an multihot array has to be a boolean array or a float array and matches the length. - for mutable in self.mutables: - if mutable.key not in result_arc: - continue # skip silently - choice_arr = result_arc[mutable.key] - if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr): - if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \ - (isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)): - # multihot, do nothing - continue - if isinstance(mutable, LayerChoice): - choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr] - choice_arr = [i in choice_arr for i in range(len(mutable))] - elif isinstance(mutable, InputChoice): - choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr] - choice_arr = [i in choice_arr for i in range(mutable.n_candidates)] - result_arc[mutable.key] = choice_arr - return result_arc - - def sample_search(self): - """ - Always returns the fixed architecture. - """ - return self._fixed_arc - - def sample_final(self): - """ - Always returns the fixed architecture. - """ - return self._fixed_arc - - def replace_layer_choice(self, module=None, prefix=""): - """ - Replace layer choices with selected candidates. It's done with best effort. - In case of weighted choices or multiple choices. if some of the choices on weighted with zero, delete them. - If single choice, replace the module with a normal module. - - Parameters - ---------- - module : nn.Module - Module to be processed. - prefix : str - Module name under global namespace. - """ - if module is None: - module = self.model - for name, mutable in module.named_children(): - global_name = (prefix + "." 
if prefix else "") + name - if isinstance(mutable, LayerChoice): - chosen = self._fixed_arc[mutable.key] - if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask: - # sum is one, max is one, there has to be an only one - # this is compatible with both integer arrays, boolean arrays and float arrays - if self.verbose: - _logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1)) - setattr(module, name, mutable[chosen.index(1)]) - else: - if mutable.return_mask and self.verbose: - _logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, " \ - "LayerChoice will not be replaced.") - # remove unused parameters - for ch, n in zip(chosen, mutable.names): - if ch == 0 and not isinstance(ch, float): - setattr(mutable, n, None) - else: - self.replace_layer_choice(mutable, global_name) - - -def apply_fixed_architecture(model, fixed_arc, verbose=True): - """ - Load architecture from `fixed_arc` and apply to model. - - Parameters - ---------- - model : torch.nn.Module - Model with mutables. - fixed_arc : str or dict - Path to the JSON that stores the architecture, or dict that stores the exported architecture. - verbose : bool - Print log messages if set to True - - Returns - ------- - FixedArchitecture - Mutator that is responsible for fixes the graph. - """ - - if isinstance(fixed_arc, str): - with open(fixed_arc) as f: - fixed_arc = json.load(f) - architecture = FixedArchitecture(model, fixed_arc, verbose) - architecture.reset() - - # for the convenience of parameters counting - architecture.replace_layer_choice() - return architecture diff --git a/nni/nas/pytorch/mutables.py b/nni/nas/pytorch/mutables.py deleted file mode 100644 index 7fbb655e5..000000000 --- a/nni/nas/pytorch/mutables.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import warnings -from collections import OrderedDict - -import torch.nn as nn - -from nni.nas.pytorch.utils import global_mutable_counting - -logger = logging.getLogger(__name__) - - -class Mutable(nn.Module): - """ - Mutable is designed to function as a normal layer, with all necessary operators' weights. - States and weights of architectures should be included in mutator, instead of the layer itself. - - Mutable has a key, which marks the identity of the mutable. This key can be used by users to share - decisions among different mutables. In mutator's implementation, mutators should use the key to - distinguish different mutables. Mutables that share the same key should be "similar" to each other. - - Currently the default scope for keys is global. By default, the keys uses a global counter from 1 to - produce unique ids. - - Parameters - ---------- - key : str - The key of mutable. - - Notes - ----- - The counter is program level, but mutables are model level. In case multiple models are defined, and - you want to have `counter` starting from 1 in the second model, it's recommended to assign keys manually - instead of using automatic keys. 
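# Sketch of the manual-key recommendation above (legacy nni.nas.pytorch API removed
# by this diff): mutables constructed with the same key share a single decision,
# which is how choice sharing across layers was expressed.
import torch.nn as nn
from nni.nas.pytorch.mutables import LayerChoice

class TwoBlockNet(nn.Module):
    def __init__(self):
        super().__init__()
        candidates = lambda: [nn.Conv2d(16, 16, 3, padding=1), nn.Conv2d(16, 16, 5, padding=2)]
        # identical key => the mutator samples one decision and applies it to both blocks
        self.block1 = LayerChoice(candidates(), key='shared_conv')
        self.block2 = LayerChoice(candidates(), key='shared_conv')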
- """ - - def __init__(self, key=None): - super().__init__() - if key is not None: - if not isinstance(key, str): - key = str(key) - logger.warning("Warning: key \"%s\" is not string, converted to string.", key) - self._key = key - else: - self._key = self.__class__.__name__ + str(global_mutable_counting()) - self.init_hook = self.forward_hook = None - - def __deepcopy__(self, memodict=None): - raise NotImplementedError("Deep copy doesn't work for mutables.") - - def __call__(self, *args, **kwargs): - self._check_built() - return super().__call__(*args, **kwargs) - - def set_mutator(self, mutator): - if "mutator" in self.__dict__: - raise RuntimeError("`set_mutator` is called more than once. Did you parse the search space multiple times? " - "Or did you apply multiple fixed architectures?") - self.__dict__["mutator"] = mutator - - @property - def key(self): - """ - Read-only property of key. - """ - return self._key - - @property - def name(self): - """ - After the search space is parsed, it will be the module name of the mutable. - """ - return self._name if hasattr(self, "_name") else self._key - - @name.setter - def name(self, name): - self._name = name - - def _check_built(self): - if not hasattr(self, "mutator"): - raise ValueError( - "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " - "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " - "so that trainer can locate all your mutables. See NNI docs for more details.".format(self)) - - -class MutableScope(Mutable): - """ - Mutable scope marks a subgraph/submodule to help mutators make better decisions. - - If not annotated with mutable scope, search space will be flattened as a list. However, some mutators might - need to leverage the concept of a "cell". So if a module is defined as a mutable scope, everything in it will - look like "sub-search-space" in the scope. Scopes can be nested. - - There are two ways mutators can use mutable scope. One is to traverse the search space as a tree during initialization - and reset. The other is to implement `enter_mutable_scope` and `exit_mutable_scope`. They are called before and after - the forward method of the class inheriting mutable scope. - - Mutable scopes are also mutables that are listed in the mutator.mutables (search space), but they are not supposed - to appear in the dict of choices. - - Parameters - ---------- - key : str - Key of mutable scope. - """ - def __init__(self, key): - super().__init__(key=key) - - def _check_built(self): - return True # bypass the test because it's deprecated - - def __call__(self, *args, **kwargs): - if not hasattr(self, 'mutator'): - return super().__call__(*args, **kwargs) - warnings.warn("`MutableScope` is deprecated in Retiarii.", DeprecationWarning) - try: - self._check_built() - self.mutator.enter_mutable_scope(self) - return super().__call__(*args, **kwargs) - finally: - self.mutator.exit_mutable_scope(self) - - -class LayerChoice(Mutable): - """ - Layer choice selects one of the ``op_candidates``, then apply it on inputs and return results. - In rare cases, it can also select zero or many. - - Layer choice does not allow itself to be nested. - - Parameters - ---------- - op_candidates : list of nn.Module or OrderedDict - A module list to be selected from. - reduction : str - ``mean``, ``concat``, ``sum`` or ``none``. Policy if multiples are selected. - If ``none``, a list is returned. ``mean`` returns the average. ``sum`` returns the sum. 
- ``concat`` concatenate the list at dimension 1. - return_mask : bool - If ``return_mask``, return output tensor and a mask. Otherwise return tensor only. - key : str - Key of the input choice. - - Attributes - ---------- - length : int - Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended. - names : list of str - Names of candidates. - choices : list of Module - Deprecated. A list of all candidate modules in the layer choice module. - ``list(layer_choice)`` is recommended, which will serve the same purpose. - - Notes - ----- - ``op_candidates`` can be a list of modules or a ordered dict of named modules, for example, - - .. code-block:: python - - self.op_choice = LayerChoice(OrderedDict([ - ("conv3x3", nn.Conv2d(3, 16, 128)), - ("conv5x5", nn.Conv2d(5, 16, 128)), - ("conv7x7", nn.Conv2d(7, 16, 128)) - ])) - - Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or - ``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet. - """ - - def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None): - super().__init__(key=key) - self.names = [] - if isinstance(op_candidates, OrderedDict): - for name, module in op_candidates.items(): - assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ - "Please don't use a reserved name '{}' for your module.".format(name) - self.add_module(name, module) - self.names.append(name) - elif isinstance(op_candidates, list): - for i, module in enumerate(op_candidates): - self.add_module(str(i), module) - self.names.append(str(i)) - else: - raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) - self.reduction = reduction - self.return_mask = return_mask - - def __getitem__(self, idx): - if isinstance(idx, str): - return self._modules[idx] - return list(self)[idx] - - def __setitem__(self, idx, module): - key = idx if isinstance(idx, str) else self.names[idx] - return setattr(self, key, module) - - def __delitem__(self, idx): - if isinstance(idx, slice): - for key in self.names[idx]: - delattr(self, key) - else: - if isinstance(idx, str): - key, idx = idx, self.names.index(idx) - else: - key = self.names[idx] - delattr(self, key) - del self.names[idx] - - @property - def length(self): - warnings.warn("layer_choice.length is deprecated. Use `len(layer_choice)` instead.", DeprecationWarning) - return len(self) - - def __len__(self): - return len(self.names) - - def __iter__(self): - return map(lambda name: self._modules[name], self.names) - - @property - def choices(self): - warnings.warn("layer_choice.choices is deprecated. Use `list(layer_choice)` instead.", DeprecationWarning) - return list(self) - - def forward(self, *args, **kwargs): - """ - Returns - ------- - tuple of tensors - Output and selection mask. If ``return_mask`` is ``False``, only output is returned. - """ - out, mask = self.mutator.on_forward_layer_choice(self, *args, **kwargs) - if self.return_mask: - return out, mask - return out - - -class InputChoice(Mutable): - """ - Input choice selects ``n_chosen`` inputs from ``choose_from`` (contains ``n_candidates`` keys). For beginners, - use ``n_candidates`` instead of ``choose_from`` is a safe option. To get the most power out of it, you might want to - know about ``choose_from``. - - The keys in ``choose_from`` can be keys that appear in past mutables, or ``NO_KEY`` if there are no suitable ones. - The keys are designed to be the keys of the sources. 
To help mutators make better decisions, - mutators might be interested in how the tensors to choose from come into place. For example, the tensor is the - output of some operator, some node, some cell, or some module. If this operator happens to be a mutable (e.g., - ``LayerChoice`` or ``InputChoice``), it has a key naturally that can be used as a source key. If it's a - module/submodule, it needs to be annotated with a key: that's where a :class:`MutableScope` is needed. - - In the example below, ``input_choice`` is a 4-choose-any. The first 3 is semantically output of cell1, output of cell2, - output of cell3 with respectively. Notice that an extra max pooling is followed by cell1, indicating x1 is not - "actually" the direct output of cell1. - - .. code-block:: python - - class Cell(MutableScope): - pass - - class Net(nn.Module): - def __init__(self): - self.cell1 = Cell("cell1") - self.cell2 = Cell("cell2") - self.op = LayerChoice([conv3x3(), conv5x5()], key="op") - self.input_choice = InputChoice(choose_from=["cell1", "cell2", "op", InputChoice.NO_KEY]) - - def forward(self, x): - x1 = max_pooling(self.cell1(x)) - x2 = self.cell2(x) - x3 = self.op(x) - x4 = torch.zeros_like(x) - return self.input_choice([x1, x2, x3, x4]) - - Parameters - ---------- - n_candidates : int - Number of inputs to choose from. - choose_from : list of str - List of source keys to choose from. At least of one of ``choose_from`` and ``n_candidates`` must be fulfilled. - If ``n_candidates`` has a value but ``choose_from`` is None, it will be automatically treated as ``n_candidates`` - number of empty string. - n_chosen : int - Recommended inputs to choose. If None, mutator is instructed to select any. - reduction : str - ``mean``, ``concat``, ``sum`` or ``none``. See :class:`LayerChoice`. - return_mask : bool - If ``return_mask``, return output tensor and a mask. Otherwise return tensor only. - key : str - Key of the input choice. - """ - - NO_KEY = "" - - def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, - reduction="sum", return_mask=False, key=None): - super().__init__(key=key) - # precondition check - assert n_candidates is not None or choose_from is not None, "At least one of `n_candidates` and `choose_from`" \ - "must be not None." - if choose_from is not None and n_candidates is None: - n_candidates = len(choose_from) - elif choose_from is None and n_candidates is not None: - choose_from = [self.NO_KEY] * n_candidates - assert n_candidates == len(choose_from), "Number of candidates must be equal to the length of `choose_from`." - assert n_candidates > 0, "Number of candidates must be greater than 0." - assert n_chosen is None or 0 <= n_chosen <= n_candidates, "Expected selected number must be None or no more " \ - "than number of candidates." - - self.n_candidates = n_candidates - self.choose_from = choose_from.copy() - self.n_chosen = n_chosen - self.reduction = reduction - self.return_mask = return_mask - - def forward(self, optional_inputs): - """ - Forward method of LayerChoice. - - Parameters - ---------- - optional_inputs : list or dict - Recommended to be a dict. As a dict, inputs will be converted to a list that follows the order of - ``choose_from`` in initialization. As a list, inputs must follow the semantic order that is the same as - ``choose_from``. - - Returns - ------- - tuple of tensors - Output and selection mask. If ``return_mask`` is ``False``, only output is returned. 
- """ - optional_input_list = optional_inputs - if isinstance(optional_inputs, dict): - optional_input_list = [optional_inputs[tag] for tag in self.choose_from] - assert isinstance(optional_input_list, list), \ - "Optional input list must be a list, not a {}.".format(type(optional_input_list)) - assert len(optional_inputs) == self.n_candidates, \ - "Length of the input list must be equal to number of candidates." - out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) - if self.return_mask: - return out, mask - return out diff --git a/nni/nas/pytorch/mutator.py b/nni/nas/pytorch/mutator.py deleted file mode 100644 index e1894b524..000000000 --- a/nni/nas/pytorch/mutator.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from collections import defaultdict - -import numpy as np -import torch - -from .base_mutator import BaseMutator -from .mutables import LayerChoice, InputChoice -from .utils import to_list - -logger = logging.getLogger(__name__) - - -class Mutator(BaseMutator): - - def __init__(self, model): - super().__init__(model) - self._cache = dict() - self._connect_all = False - - def sample_search(self): - """ - Override to implement this method to iterate over mutables and make decisions. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - raise NotImplementedError - - def sample_final(self): - """ - Override to implement this method to iterate over mutables and make decisions that is final - for export and retraining. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - raise NotImplementedError - - def reset(self): - """ - Reset the mutator by call the `sample_search` to resample (for search). Stores the result in a local - variable so that `on_forward_layer_choice` and `on_forward_input_choice` can use the decision directly. - """ - self._cache = self.sample_search() - - def export(self): - """ - Resample (for final) and return results. - - Returns - ------- - dict - A mapping from key of mutables to decisions. - """ - sampled = self.sample_final() - result = dict() - for mutable in self.mutables: - if not isinstance(mutable, (LayerChoice, InputChoice)): - # not supported as built-in - continue - result[mutable.key] = self._convert_mutable_decision_to_human_readable(mutable, sampled.pop(mutable.key)) - if sampled: - raise ValueError("Unexpected keys returned from 'sample_final()': %s", list(sampled.keys())) - return result - - def status(self): - """ - Return current selection status of mutator. - - Returns - ------- - dict - A mapping from key of mutables to decisions. All weights (boolean type and float type) - are converted into real number values. Numpy arrays and tensors are converted into list. - """ - data = dict() - for k, v in self._cache.items(): - if torch.is_tensor(v): - v = v.detach().cpu().numpy().tolist() - if isinstance(v, np.ndarray): - v = v.astype(np.float32).tolist() - data[k] = v - return data - - def graph(self, inputs): - """ - Return model supernet graph. - - Parameters - ---------- - inputs: tuple of tensor - Inputs that will be feeded into the network. - - Returns - ------- - dict - Containing ``node``, in Tensorboard GraphDef format. - Additional key ``mutable`` is a map from key to list of modules. - """ - if not torch.__version__.startswith("1.4"): - logger.warning("Graph is only tested with PyTorch 1.4. 
Other versions might not work.") - from nni.common.graph_utils import build_graph - from google.protobuf import json_format - # protobuf should be installed as long as tensorboard is installed - try: - self._connect_all = True - graph_def, _ = build_graph(self.model, inputs, verbose=False) - result = json_format.MessageToDict(graph_def) - finally: - self._connect_all = False - - # `mutable` is to map the keys to a list of corresponding modules. - # A key can be linked to multiple modules, use `dedup=False` to find them all. - result["mutable"] = defaultdict(list) - for mutable in self.mutables.traverse(deduplicate=False): - # A module will be represent in the format of - # [{"type": "Net", "name": ""}, {"type": "Cell", "name": "cell1"}, {"type": "Conv2d": "name": "conv"}] - # which will be concatenated into Net/Cell[cell1]/Conv2d[conv] in frontend. - # This format is aligned with the scope name jit gives. - modules = mutable.name.split(".") - path = [ - {"type": self.model.__class__.__name__, "name": ""} - ] - m = self.model - for module in modules: - m = getattr(m, module) - path.append({ - "type": m.__class__.__name__, - "name": module - }) - result["mutable"][mutable.key].append(path) - return result - - def on_forward_layer_choice(self, mutable, *args, **kwargs): - """ - On default, this method retrieves the decision obtained previously, and select certain operations. - Only operations with non-zero weight will be executed. The results will be added to a list. - Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.LayerChoice - Layer choice module. - args : list of torch.Tensor - Inputs - kwargs : dict - Inputs - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output and mask. - """ - if self._connect_all: - return self._all_connect_tensor_reduction(mutable.reduction, - [op(*args, **kwargs) for op in mutable]), \ - torch.ones(len(mutable)).bool() - - def _map_fn(op, args, kwargs): - return op(*args, **kwargs) - - mask = self._get_decision(mutable) - assert len(mask) == len(mutable), \ - "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable)) - out, mask = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def on_forward_input_choice(self, mutable, tensor_list): - """ - On default, this method retrieves the decision obtained previously, and select certain tensors. - Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.InputChoice - Input choice module. - tensor_list : list of torch.Tensor - Tensor list to apply the decision on. - - Returns - ------- - tuple of torch.Tensor and torch.Tensor - Output and mask. - """ - if self._connect_all: - return self._all_connect_tensor_reduction(mutable.reduction, tensor_list), \ - torch.ones(mutable.n_candidates).bool() - mask = self._get_decision(mutable) - assert len(mask) == mutable.n_candidates, \ - "Invalid mask, expected {} to be of length {}.".format(mask, mutable.n_candidates) - out, mask = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def _select_with_mask(self, map_fn, candidates, mask): - """ - Select masked tensors and return a list of tensors. 
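For example (a small sketch; ``a``, ``b``, ``c`` are hypothetical tensors of identical shape and ``mutator`` is an instance of a concrete subclass):

        .. code-block:: python

            # boolean mask: keep the selected candidates unchanged
            out, mask = mutator._select_with_mask(lambda t: t, [(a,), (b,), (c,)],
                                                  torch.tensor([True, False, True]))  # out == [a, c]
            # float mask: zero entries are dropped, the rest are scaled by their weight
            out, mask = mutator._select_with_mask(lambda t: t, [(a,), (b,)],
                                                  torch.tensor([0.3, 0.7]))           # out == [0.3 * a, 0.7 * b]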
- - Parameters - ---------- - map_fn : function - Convert candidates to target candidates. Can be simply identity. - candidates : list of torch.Tensor - Tensor list to apply the decision on. - mask : list-like object - Can be a list, an numpy array or a tensor (recommended). Needs to - have the same length as ``candidates``. - - Returns - ------- - tuple of list of torch.Tensor and torch.Tensor - Output and mask. - """ - if (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], bool)) or \ - (isinstance(mask, np.ndarray) and mask.dtype == np.bool) or \ - "BoolTensor" in mask.type(): - out = [map_fn(*cand) for cand, m in zip(candidates, mask) if m] - elif (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], (float, int))) or \ - (isinstance(mask, np.ndarray) and mask.dtype in (np.float32, np.float64, np.int32, np.int64)) or \ - "FloatTensor" in mask.type(): - out = [map_fn(*cand) * m for cand, m in zip(candidates, mask) if m] - else: - raise ValueError("Unrecognized mask '%s'" % mask) - if not torch.is_tensor(mask): - mask = torch.tensor(mask) # pylint: disable=not-callable - return out, mask - - def _tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == "none": - return tensor_list - if not tensor_list: - return None # empty. return None for now - if len(tensor_list) == 1: - return tensor_list[0] - if reduction_type == "sum": - return sum(tensor_list) - if reduction_type == "mean": - return sum(tensor_list) / len(tensor_list) - if reduction_type == "concat": - return torch.cat(tensor_list, dim=1) - raise ValueError("Unrecognized reduction policy: \"{}\"".format(reduction_type)) - - def _all_connect_tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == "none": - return tensor_list - if reduction_type == "concat": - return torch.cat(tensor_list, dim=1) - return torch.stack(tensor_list).sum(0) - - def _get_decision(self, mutable): - """ - By default, this method checks whether `mutable.key` is already in the decision cache, - and returns the result without double-check. - - Parameters - ---------- - mutable : Mutable - - Returns - ------- - object - """ - if mutable.key not in self._cache: - raise ValueError("\"{}\" not found in decision cache.".format(mutable.key)) - result = self._cache[mutable.key] - logger.debug("Decision %s: %s", mutable.key, result) - return result - - def _convert_mutable_decision_to_human_readable(self, mutable, sampled): - # Assert the existence of mutable.key in returned architecture. - # Also check if there is anything extra. - multihot_list = to_list(sampled) - converted = None - # If it's a boolean array, we can do optimization. - if all([t == 0 or t == 1 for t in multihot_list]): - if isinstance(mutable, LayerChoice): - assert len(multihot_list) == len(mutable), \ - "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ - % (mutable.key, multihot_list) - # check if all modules have different names and they indeed have names - if len(set(mutable.names)) == len(mutable) and not all(d.isdigit() for d in mutable.names): - converted = [name for i, name in enumerate(mutable.names) if multihot_list[i]] - else: - converted = [i for i in range(len(multihot_list)) if multihot_list[i]] - if isinstance(mutable, InputChoice): - assert len(multihot_list) == mutable.n_candidates, \ - "Results returned from 'sample_final()' (%s: %s) either too short or too long." 
\ - % (mutable.key, multihot_list) - # check if all input candidates have different names - if len(set(mutable.choose_from)) == mutable.n_candidates: - converted = [name for i, name in enumerate(mutable.choose_from) if multihot_list[i]] - else: - converted = [i for i in range(len(multihot_list)) if multihot_list[i]] - if converted is not None: - # if only one element, then remove the bracket - if len(converted) == 1: - converted = converted[0] - else: - # do nothing - converted = multihot_list - return converted diff --git a/nni/nas/pytorch/nasbench201/__init__.py b/nni/nas/pytorch/nasbench201/__init__.py deleted file mode 100644 index df3f68e0b..000000000 --- a/nni/nas/pytorch/nasbench201/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .nasbench201 import NASBench201Cell diff --git a/nni/nas/pytorch/nasbench201/nasbench201.py b/nni/nas/pytorch/nasbench201/nasbench201.py deleted file mode 100644 index 753d9b3db..000000000 --- a/nni/nas/pytorch/nasbench201/nasbench201.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from collections import OrderedDict -import torch.nn as nn -from nni.nas.pytorch.mutables import LayerChoice - -from .nasbench201_ops import Pooling, ReLUConvBN, Zero, FactorizedReduce - - -class NASBench201Cell(nn.Module): - """ - Builtin cell structure of NAS Bench 201. One cell contains four nodes. The First node serves as an input node - accepting the output of the previous cell. And other nodes connect to all previous nodes with an edge that - represents an operation chosen from a set to transform the tensor from the source node to the target node. - Every node accepts all its inputs and adds them as its output. - - Parameters - --- - cell_id: str - the name of this cell - C_in: int - the number of input channels of the cell - C_out: int - the number of output channels of the cell - stride: int - stride of all convolution operations in the cell - bn_affine: bool - If set to ``True``, all ``torch.nn.BatchNorm2d`` in this cell will have learnable affine parameters. Default: True - bn_momentum: float - the value used for the running_mean and running_var computation. Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, all ``torch.nn.BatchNorm2d`` in this cell tracks the running mean and variance. 
Default: True - """ - - def __init__(self, cell_id, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(NASBench201Cell, self).__init__() - - self.NUM_NODES = 4 - self.layers = nn.ModuleList() - - OPS = lambda layer_idx: OrderedDict([ - ("none", Zero(C_in, C_out, stride)), - ("avg_pool_3x3", Pooling(C_in, C_out, stride if layer_idx == 0 else 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("conv_3x3", ReLUConvBN(C_in, C_out, 3, stride if layer_idx == 0 else 1, 1, 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("conv_1x1", ReLUConvBN(C_in, C_out, 1, stride if layer_idx == 0 else 1, 0, 1, bn_affine, bn_momentum, - bn_track_running_stats)), - ("skip_connect", nn.Identity() if stride == 1 and C_in == C_out - else FactorizedReduce(C_in, C_out, stride if layer_idx == 0 else 1, bn_affine, bn_momentum, - bn_track_running_stats)) - ]) - - for i in range(self.NUM_NODES): - node_ops = nn.ModuleList() - for j in range(0, i): - node_ops.append(LayerChoice(OPS(j), key="%d_%d" % (j, i), reduction="mean")) - self.layers.append(node_ops) - self.in_dim = C_in - self.out_dim = C_out - self.cell_id = cell_id - - def forward(self, input): # pylint: disable=W0622 - """ - Parameters - --- - input: torch.tensor - the output of the previous layer - """ - nodes = [input] - for i in range(1, self.NUM_NODES): - node_feature = sum(self.layers[i][k](nodes[k]) for k in range(i)) - nodes.append(node_feature) - return nodes[-1] diff --git a/nni/nas/pytorch/nasbench201/nasbench201_ops.py b/nni/nas/pytorch/nasbench201/nasbench201_ops.py deleted file mode 100644 index 633be3220..000000000 --- a/nni/nas/pytorch/nasbench201/nasbench201_ops.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class ReLUConvBN(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements - bn_affine: bool - If set to ``True``, ``torch.nn.BatchNorm2d`` will have learnable affine parameters. Default: True - bn_momentun: float - the value used for the running_mean and running_var computation. Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, ``torch.nn.BatchNorm2d`` tracks the running mean and variance. Default: True - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, - bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(ReLUConvBN, self).__init__() - self.op = nn.Sequential( - nn.ReLU(inplace=False), - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, - padding=padding, dilation=dilation, bias=False), - nn.BatchNorm2d(C_out, affine=bn_affine, momentum=bn_momentum, - track_running_stats=bn_track_running_stats) - ) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - return self.op(x) - - -class Pooling(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - bn_affine: bool - If set to ``True``, ``torch.nn.BatchNorm2d`` will have learnable affine parameters. Default: True - bn_momentun: float - the value used for the running_mean and running_var computation. 
Default: 0.1 - bn_track_running_stats: bool - When set to ``True``, ``torch.nn.BatchNorm2d`` tracks the running mean and variance. Default: True - """ - def __init__(self, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, bn_track_running_stats=True): - super(Pooling, self).__init__() - if C_in == C_out: - self.preprocess = None - else: - self.preprocess = ReLUConvBN(C_in, C_out, 1, 1, 0, 0, - bn_affine, bn_momentum, bn_track_running_stats) - self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - if self.preprocess: - x = self.preprocess(x) - return self.op(x) - - -class Zero(nn.Module): - """ - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - stride: int - stride of the convolution - """ - def __init__(self, C_in, C_out, stride): - super(Zero, self).__init__() - self.C_in = C_in - self.C_out = C_out - self.stride = stride - self.is_zero = True - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - input tensor - """ - if self.C_in == self.C_out: - if self.stride == 1: - return x.mul(0.) - else: - return x[:, :, ::self.stride, ::self.stride].mul(0.) - else: - shape = list(x.shape) - shape[1] = self.C_out - zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device) - return zeros - - -class FactorizedReduce(nn.Module): - def __init__(self, C_in, C_out, stride, bn_affine=True, bn_momentum=0.1, - bn_track_running_stats=True): - super(FactorizedReduce, self).__init__() - self.stride = stride - self.C_in = C_in - self.C_out = C_out - self.relu = nn.ReLU(inplace=False) - if stride == 2: - C_outs = [C_out // 2, C_out - C_out // 2] - self.convs = nn.ModuleList() - for i in range(2): - self.convs.append(nn.Conv2d(C_in, C_outs[i], 1, stride=stride, padding=0, bias=False)) - self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0) - else: - raise ValueError("Invalid stride : {:}".format(stride)) - self.bn = nn.BatchNorm2d(C_out, affine=bn_affine, momentum=bn_momentum, - track_running_stats=bn_track_running_stats) - - def forward(self, x): - x = self.relu(x) - y = self.pad(x) - out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/nni/nas/pytorch/search_space_zoo/__init__.py b/nni/nas/pytorch/search_space_zoo/__init__.py deleted file mode 100644 index b31d4199a..000000000 --- a/nni/nas/pytorch/search_space_zoo/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .darts_cell import DartsCell -from .enas_cell import ENASMicroLayer -from .enas_cell import ENASMacroLayer -from .enas_cell import ENASMacroGeneralModel diff --git a/nni/nas/pytorch/search_space_zoo/darts_cell.py b/nni/nas/pytorch/search_space_zoo/darts_cell.py deleted file mode 100644 index 53fca5940..000000000 --- a/nni/nas/pytorch/search_space_zoo/darts_cell.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
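As a quick recap of the NAS-Bench-201 cell above, a hypothetical construction sketch (not part of the original file; the import path is the legacy one shown in this diff):

.. code-block:: python

    import torch
    from nni.nas.pytorch.nasbench201 import NASBench201Cell

    # one cell: 4 nodes, every edge j->i is a 5-way LayerChoice keyed "j_i", reduced by "mean"
    cell = NASBench201Cell("cell_0", C_in=16, C_out=16, stride=1)
    x = torch.randn(2, 16, 32, 32)
    # a mutator (or a fixed architecture) must pick one op per edge before forwarding:
    # out = cell(x)   # -> shape (2, 16, 32, 32)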
- -from collections import OrderedDict - -import torch -import torch.nn as nn -from nni.nas.pytorch import mutables - -from .darts_ops import PoolBN, SepConv, DilConv, FactorizedReduce, DropPath, StdConv - - -class Node(nn.Module): - def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect): - """ - builtin Darts Node structure - - Parameters - --- - node_id: str - num_prev_nodes: int - the number of previous nodes in this cell - channels: int - output channels - num_downsample_connect: int - downsample the input node if this cell is reduction cell - """ - super().__init__() - self.ops = nn.ModuleList() - choice_keys = [] - for i in range(num_prev_nodes): - stride = 2 if i < num_downsample_connect else 1 - choice_keys.append("{}_p{}".format(node_id, i)) - self.ops.append( - mutables.LayerChoice(OrderedDict([ - ("maxpool", PoolBN('max', channels, 3, stride, 1, affine=False)), - ("avgpool", PoolBN('avg', channels, 3, stride, 1, affine=False)), - ("skipconnect", - nn.Identity() if stride == 1 else FactorizedReduce(channels, channels, affine=False)), - ("sepconv3x3", SepConv(channels, channels, 3, stride, 1, affine=False)), - ("sepconv5x5", SepConv(channels, channels, 5, stride, 2, affine=False)), - ("dilconv3x3", DilConv(channels, channels, 3, stride, 2, 2, affine=False)), - ("dilconv5x5", DilConv(channels, channels, 5, stride, 4, 2, affine=False)) - ]), key=choice_keys[-1])) - self.drop_path = DropPath() - self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id)) - - def forward(self, prev_nodes): - assert len(self.ops) == len(prev_nodes) - out = [op(node) for op, node in zip(self.ops, prev_nodes)] - out = [self.drop_path(o) if o is not None else None for o in out] - return self.input_switch(out) - - -class DartsCell(nn.Module): - """ - Builtin Darts Cell structure. There are ``n_nodes`` nodes in one cell, in which the first two nodes' values are - fixed to the results of previous previous cell and previous cell respectively. One node will connect all - the nodes after with predefined operations in a mutable way. The last node accepts five inputs from nodes - before and it concats all inputs in channels as the output of the current cell, and the number of output - channels is ``n_nodes`` times ``channels``. - - Parameters - --- - n_nodes: int - the number of nodes contained in this cell - channels_pp: int - the number of previous previous cell's output channels - channels_p: int - the number of previous cell's output channels - channels: int - the number of output channels for each node - reduction_p: bool - Is previous cell a reduction cell - reduction: bool - is current cell a reduction cell - """ - def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction): - super().__init__() - self.reduction = reduction - self.n_nodes = n_nodes - - # If previous cell is reduction cell, current input size does not match with - # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. 
- if reduction_p: - self.preproc0 = FactorizedReduce(channels_pp, channels, affine=False) - else: - self.preproc0 = StdConv(channels_pp, channels, 1, 1, 0, affine=False) - self.preproc1 = StdConv(channels_p, channels, 1, 1, 0, affine=False) - - # generate dag - self.mutable_ops = nn.ModuleList() - for depth in range(2, self.n_nodes + 2): - self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), - depth, channels, 2 if reduction else 0)) - - def forward(self, pprev, prev): - """ - Parameters - --- - pprev: torch.Tensor - the output of the previous previous layer - prev: torch.Tensor - the output of the previous layer - """ - tensors = [self.preproc0(pprev), self.preproc1(prev)] - for node in self.mutable_ops: - cur_tensor = node(tensors) - tensors.append(cur_tensor) - - output = torch.cat(tensors[2:], dim=1) - return output diff --git a/nni/nas/pytorch/search_space_zoo/darts_ops.py b/nni/nas/pytorch/search_space_zoo/darts_ops.py deleted file mode 100644 index ce5410cfb..000000000 --- a/nni/nas/pytorch/search_space_zoo/darts_ops.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class DropPath(nn.Module): - def __init__(self, p=0.): - """ - Drop path with probability. - - Parameters - ---------- - p : float - Probability of an path to be zeroed. - """ - super().__init__() - self.p = p - - def forward(self, x): - if self.training and self.p > 0.: - keep_prob = 1. - self.p - # per data point mask - mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob) - return x / keep_prob * mask - - return x - - -class PoolBN(nn.Module): - """ - AvgPool or MaxPool with BN. ``pool_type`` must be ``max`` or ``avg``. 
- - Parameters - --- - pool_type: str - choose operation - C: int - number of channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - self.bn = nn.BatchNorm2d(C, affine=affine) - - def forward(self, x): - out = self.pool(x) - out = self.bn(out) - return out - - -class StdConv(nn.Sequential): - """ - Standard conv: ReLU - Conv - BN - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernel_size: int - size of the convolution kernel - padding: - zero-padding added to both sides of the input - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential - for idx, ops in enumerate((nn.ReLU(), nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine))): - self.add_module(str(idx), ops) - - -class FacConv(nn.Module): - """ - Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN - """ - - def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), - nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class DilConv(nn.Module): - """ - (Dilated) depthwise separable conv. - ReLU - (Dilated) depthwise separable - Pointwise - BN. - If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: - size of the convolving kernel - padding: - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements. - affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, - bias=False), - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class SepConv(nn.Module): - """ - Depthwise separable conv. - DilConv(dilation=1) * 2. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: - size of the convolving kernel - padding: - zero-padding added to both sides of the input - dilation: int - spacing between kernel elements. 
- affine: bool - is using affine in BatchNorm - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), - DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class FactorizedReduce(nn.Module): - """ - Reduce feature map size by factorized pointwise (stride=2). - """ - - def __init__(self, C_in, C_out, affine=True): - super().__init__() - self.relu = nn.ReLU() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - x = self.relu(x) - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/nni/nas/pytorch/search_space_zoo/enas_cell.py b/nni/nas/pytorch/search_space_zoo/enas_cell.py deleted file mode 100644 index de57d55e2..000000000 --- a/nni/nas/pytorch/search_space_zoo/enas_cell.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nni.nas.pytorch import mutables -from .enas_ops import FactorizedReduce, StdConv, SepConvBN, Pool, ConvBranch, PoolBranch - - -class Cell(nn.Module): - def __init__(self, cell_name, prev_labels, channels): - super().__init__() - self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True, - key=cell_name + "_input") - self.op_choice = mutables.LayerChoice([ - SepConvBN(channels, channels, 3, 1), - SepConvBN(channels, channels, 5, 2), - Pool("avg", 3, 1, 1), - Pool("max", 3, 1, 1), - nn.Identity() - ], key=cell_name + "_op") - - def forward(self, prev_layers): - chosen_input, chosen_mask = self.input_choice(prev_layers) - cell_out = self.op_choice(chosen_input) - return cell_out, chosen_mask - - -class Node(mutables.MutableScope): - def __init__(self, node_name, prev_node_names, channels): - super().__init__(node_name) - self.cell_x = Cell(node_name + "_x", prev_node_names, channels) - self.cell_y = Cell(node_name + "_y", prev_node_names, channels) - - def forward(self, prev_layers): - out_x, mask_x = self.cell_x(prev_layers) - out_y, mask_y = self.cell_y(prev_layers) - return out_x + out_y, mask_x | mask_y - - -class Calibration(nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - self.process = None - if in_channels != out_channels: - self.process = StdConv(in_channels, out_channels) - - def forward(self, x): - if self.process is None: - return x - return self.process(x) - - -class ENASMicroLayer(nn.Module): - """ - Builtin EnasMicroLayer. Micro search designs only one building block whose architecture is repeated - throughout the final architecture. A cell has ``num_nodes`` nodes and searches the topology and - operations among them in RL way. The first two nodes in a layer stand for the outputs from previous - previous layer and previous layer respectively. For the following nodes, the controller chooses - two previous nodes and applies two operations respectively for each node. Nodes that are not served - as input for any other node are viewed as the output of the layer. If there are multiple output nodes, - the model will calculate the average of these nodes as the layer output. 
Every node's output has ``out_channels`` - channels so the result of the layer has the same number of channels as each node. - - Parameters - --- - num_nodes: int - the number of nodes contained in this layer - in_channles_pp: int - the number of previous previous layer's output channels - in_channels_p: int - the number of previous layer's output channels - out_channels: int - output channels of this layer - reduction: bool - is reduction operation empolyed before this layer - """ - def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction): - super().__init__() - self.reduction = reduction - if self.reduction: - self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False) - self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False) - in_channels_pp = in_channels_p = out_channels - self.preproc0 = Calibration(in_channels_pp, out_channels) - self.preproc1 = Calibration(in_channels_p, out_channels) - - self.num_nodes = num_nodes - name_prefix = "reduce" if reduction else "normal" - self.nodes = nn.ModuleList() - node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY] - for i in range(num_nodes): - node_labels.append("{}_node_{}".format(name_prefix, i)) - self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels)) - self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1), - requires_grad=True) - self.bn = nn.BatchNorm2d(out_channels, affine=False) - self.reset_parameters() - - def reset_parameters(self): - nn.init.kaiming_normal_(self.final_conv_w) - - def forward(self, pprev, prev): - """ - Parameters - --- - pprev: torch.Tensor - the output of the previous previous layer - prev: torch.Tensor - the output of the previous layer - """ - if self.reduction: - pprev, prev = self.reduce0(pprev), self.reduce1(prev) - pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev) - - prev_nodes_out = [pprev_, prev_] - nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device) - for i in range(self.num_nodes): - node_out, mask = self.nodes[i](prev_nodes_out) - nodes_used_mask[:mask.size(0)] |= mask.to(node_out.device) - prev_nodes_out.append(node_out) - - unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1) - unused_nodes = F.relu(unused_nodes) - conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :] - conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1) - out = F.conv2d(unused_nodes, conv_weight) - return prev, self.bn(out) - - -class ENASMacroLayer(mutables.MutableScope): - """ - Builtin ENAS Marco Layer. With search space changing to layer level, the controller decides - what operation is employed and the previous layer to connect to for skip connections. The model - is made up of the same layers but the choice of each layer may be different. 
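For instance, a single layer could be declared as follows (hypothetical keys; the actual wiring is done by ``ENASMacroGeneralModel`` further below):

    .. code-block:: python

        # the third layer may draw skip connections from the first two layers
        layer = ENASMacroLayer("layer_2", ["layer_0", "layer_1"], in_filters=24, out_filters=24)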
- - Parameters - --- - key: str - the name of this layer - prev_labels: str - names of all previous layers - in_filters: int - the number of input channels - out_filters: - the number of output channels - """ - def __init__(self, key, prev_labels, in_filters, out_filters): - super().__init__(key) - self.in_filters = in_filters - self.out_filters = out_filters - self.mutable = mutables.LayerChoice([ - ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False), - ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True), - ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False), - ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True), - PoolBranch('avg', in_filters, out_filters, 3, 1, 1), - PoolBranch('max', in_filters, out_filters, 3, 1, 1) - ]) - if prev_labels: - self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None) - else: - self.skipconnect = None - self.batch_norm = nn.BatchNorm2d(out_filters, affine=False) - - def forward(self, prev_list): - """ - Parameters - --- - prev_list: list - The cell selects the last element of the list as input and applies an operation on it. - The cell chooses none/one/multiple tensor(s) as SkipConnect(s) from the list excluding - the last element. - """ - out = self.mutable(prev_list[-1]) - if self.skipconnect is not None: - connection = self.skipconnect(prev_list[:-1]) - if connection is not None: - out += connection - return self.batch_norm(out) - - -class ENASMacroGeneralModel(nn.Module): - """ - The network is made up by stacking ENASMacroLayer. The Macro search space contains these layers. - Each layer chooses an operation from predefined ones and SkipConnect then forms a network. - - Parameters - --- - num_layers: int - The number of layers contained in the network. - out_filters: int - The number of each layer's output channels. - in_channel: int - The number of input's channels. - num_classes: int - The number of classes for classification. - dropout_rate: float - Dropout layer's dropout rate before the final dense layer. 
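A minimal usage sketch (CIFAR-10-like shapes; the search itself is normally driven by an ENAS trainer, which attaches and resets a mutator first):

    .. code-block:: python

        model = ENASMacroGeneralModel(num_layers=12, out_filters=24, in_channels=3, num_classes=10)
        # with a mutator attached and reset:
        # logits = model(torch.randn(8, 3, 32, 32))   # -> shape (8, 10)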
- """ - def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10, - dropout_rate=0.0): - super().__init__() - self.num_layers = num_layers - self.num_classes = num_classes - self.out_filters = out_filters - - self.stem = nn.Sequential( - nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False), - nn.BatchNorm2d(out_filters) - ) - - pool_distance = self.num_layers // 3 - self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1] - self.dropout_rate = dropout_rate - self.dropout = nn.Dropout(self.dropout_rate) - - self.layers = nn.ModuleList() - self.pool_layers = nn.ModuleList() - labels = [] - for layer_id in range(self.num_layers): - labels.append("layer_{}".format(layer_id)) - if layer_id in self.pool_layers_idx: - self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters)) - self.layers.append(ENASMacroLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters)) - - self.gap = nn.AdaptiveAvgPool2d(1) - self.dense = nn.Linear(self.out_filters, self.num_classes) - - def forward(self, x): - """ - Parameters - --- - x: torch.Tensor - the input of the network - """ - bs = x.size(0) - cur = self.stem(x) - - layers = [cur] - - for layer_id in range(self.num_layers): - cur = self.layers[layer_id](layers) - layers.append(cur) - if layer_id in self.pool_layers_idx: - for i, layer in enumerate(layers): - layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer) - cur = layers[-1] - - cur = self.gap(cur).view(bs, -1) - cur = self.dropout(cur) - logits = self.dense(cur) - return logits diff --git a/nni/nas/pytorch/search_space_zoo/enas_ops.py b/nni/nas/pytorch/search_space_zoo/enas_ops.py deleted file mode 100644 index 21ecc2da7..000000000 --- a/nni/nas/pytorch/search_space_zoo/enas_ops.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import torch -import torch.nn as nn - - -class StdConv(nn.Module): - def __init__(self, C_in, C_out): - super(StdConv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=False), - nn.ReLU() - ) - - def forward(self, x): - return self.conv(x) - - -class PoolBranch(nn.Module): - """ - Pooling structure for Macro search. First pass through a 1x1 Conv, then pooling operation followed by BatchNorm2d. 
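For example (a standalone sketch, assuming ``import torch``):

    .. code-block:: python

        branch = PoolBranch("max", C_in=24, C_out=36, kernel_size=3, stride=1, padding=1)
        y = branch(torch.randn(4, 24, 16, 16))  # 1x1 conv to 36 channels, 3x3 max-pool, BN -> (4, 36, 16, 16)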
- - Parameters - --- - pool_type: str - only accept ``max`` for MaxPool and ``avg`` for AvgPool - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False): - super().__init__() - self.preproc = StdConv(C_in, C_out) - self.pool = Pool(pool_type, kernel_size, stride, padding) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - out = self.preproc(x) - out = self.pool(out) - out = self.bn(out) - return out - - -class SeparableConv(nn.Module): - def __init__(self, C_in, C_out, kernel_size, stride, padding): - super(SeparableConv, self).__init__() - self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride, - groups=C_in, bias=False) - self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False) - - def forward(self, x): - out = self.depthwise(x) - out = self.pointwise(out) - return out - - -class ConvBranch(nn.Module): - """ - Conv structure for Macro search. First pass through a 1x1 Conv, - then Conv operation with kernal_size equals 3 or 5 followed by BatchNorm and ReLU. - - Parameters - --- - C_in: int - the number of input channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - separable: True - is separable Conv is used - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, separable): - super(ConvBranch, self).__init__() - self.preproc = StdConv(C_in, C_out) - if separable: - self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding) - else: - self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding) - self.postproc = nn.Sequential( - nn.BatchNorm2d(C_out, affine=False), - nn.ReLU() - ) - - def forward(self, x): - out = self.preproc(x) - out = self.conv(out) - out = self.postproc(out) - return out - - -class FactorizedReduce(nn.Module): - def __init__(self, C_in, C_out, affine=False): - super().__init__() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out - - -class Pool(nn.Module): - """ - Pooling structure - - Parameters - --- - pool_type: str - only accept ``max`` for MaxPool and ``avg`` for AvgPool - kernal_size: int - size of the convolving kernel - stride: int - stride of the convolution - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, pool_type, kernel_size, stride, padding): - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - def forward(self, x): - return self.pool(x) - - -class SepConvBN(nn.Module): - """ - Implement SepConv followed by BatchNorm. The structure is ReLU ==> SepConv ==> BN. 
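For example (stride is fixed to 1 internally, so spatial size is preserved when ``padding == kernel_size // 2``; assumes ``import torch``):

    .. code-block:: python

        op = SepConvBN(C_in=20, C_out=20, kernel_size=3, padding=1)
        y = op(torch.randn(2, 20, 8, 8))  # -> (2, 20, 8, 8)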
- - Parameters - --- - C_in: int - the number of imput channels - C_out: int - the number of output channels - kernal_size: int - size of the convolving kernel - padding: int - zero-padding added to both sides of the input - """ - def __init__(self, C_in, C_out, kernel_size, padding): - super().__init__() - self.relu = nn.ReLU() - self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding) - self.bn = nn.BatchNorm2d(C_out, affine=True) - - def forward(self, x): - x = self.relu(x) - x = self.conv(x) - x = self.bn(x) - return x diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py deleted file mode 100644 index 6a3881177..000000000 --- a/nni/nas/pytorch/trainer.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import logging -import os -import time -from abc import abstractmethod - -import torch - -from .base_trainer import BaseTrainer - -_logger = logging.getLogger(__name__) - - -class TorchTensorEncoder(json.JSONEncoder): - def default(self, o): # pylint: disable=method-hidden - if isinstance(o, torch.Tensor): - olist = o.tolist() - if "bool" not in o.type().lower() and all(map(lambda d: d == 0 or d == 1, olist)): - _logger.warning("Every element in %s is either 0 or 1. " - "You might consider convert it into bool.", olist) - return olist - return super().default(o) - - -class Trainer(BaseTrainer): - """ - A trainer with some helper functions implemented. To implement a new trainer, - users need to implement :meth:`train_one_epoch`, :meth:`validate_one_epoch` and :meth:`checkpoint`. - - Parameters - ---------- - model : nn.Module - Model with mutables. - mutator : BaseMutator - A mutator object that has been initialized with the model. - loss : callable - Called with logits and targets. Returns a loss tensor. - See `PyTorch loss functions`_ for examples. - metrics : callable - Called with logits and targets. Returns a dict that maps metrics keys to metrics data. For example, - - .. code-block:: python - - def metrics_fn(output, target): - return {"acc1": accuracy(output, target, topk=1), "acc5": accuracy(output, target, topk=5)} - - optimizer : Optimizer - Optimizer that optimizes the model. - num_epochs : int - Number of epochs of training. - dataset_train : torch.utils.data.Dataset - Dataset of training. If not otherwise specified, ``dataset_train`` and ``dataset_valid`` should be standard - PyTorch Dataset. See `torch.utils.data`_ for examples. - dataset_valid : torch.utils.data.Dataset - Dataset of validation/testing. - batch_size : int - Batch size. - workers : int - Number of workers used in data preprocessing. - device : torch.device - Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will - automatic detects GPU and selects GPU first. - log_frequency : int - Number of mini-batches to log metrics. - callbacks : list of Callback - Callbacks to plug into the trainer. See Callbacks. - - - .. _`PyTorch loss functions`: https://pytorch.org/docs/stable/nn.html#loss-functions - .. 
_`torch.utils.data`: https://pytorch.org/docs/stable/data.html - """ - def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs, - dataset_train, dataset_valid, batch_size, workers, device, log_frequency, callbacks): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device - self.model = model - self.mutator = mutator - self.loss = loss - - self.metrics = metrics - self.optimizer = optimizer - - self.model.to(self.device) - self.mutator.to(self.device) - self.loss.to(self.device) - - self.num_epochs = num_epochs - self.dataset_train = dataset_train - self.dataset_valid = dataset_valid - self.batch_size = batch_size - self.workers = workers - self.log_frequency = log_frequency - self.log_dir = os.path.join("logs", str(time.time())) - os.makedirs(self.log_dir, exist_ok=True) - self.status_writer = open(os.path.join(self.log_dir, "log"), "w") - self.callbacks = callbacks if callbacks is not None else [] - for callback in self.callbacks: - callback.build(self.model, self.mutator, self) - - @abstractmethod - def train_one_epoch(self, epoch): - """ - Train one epoch. - - Parameters - ---------- - epoch : int - Epoch number starting from 0. - """ - pass - - @abstractmethod - def validate_one_epoch(self, epoch): - """ - Validate one epoch. - - Parameters - ---------- - epoch : int - Epoch number starting from 0. - """ - pass - - def train(self, validate=True): - """ - Train ``num_epochs``. - Trigger callbacks at the start and the end of each epoch. - - Parameters - ---------- - validate : bool - If ``true``, will do validation every epoch. - """ - for epoch in range(self.num_epochs): - for callback in self.callbacks: - callback.on_epoch_begin(epoch) - - # training - _logger.info("Epoch %d Training", epoch + 1) - self.train_one_epoch(epoch) - - if validate: - # validation - _logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) - - for callback in self.callbacks: - callback.on_epoch_end(epoch) - - def validate(self): - """ - Do one validation. - """ - self.validate_one_epoch(-1) - - def export(self, file): - """ - Call ``mutator.export()`` and dump the architecture to ``file``. - - Parameters - ---------- - file : str - A file path. Expected to be a JSON. - """ - mutator_export = self.mutator.export() - with open(file, "w") as f: - json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) - - def checkpoint(self): - """ - Return trainer checkpoint. - """ - raise NotImplementedError("Not implemented yet") - - def enable_visualization(self): - """ - Enable visualization. Write graph and training log to folder ``logs/``. - """ - sample = None - for x, _ in self.train_loader: - sample = x.to(self.device)[:2] - break - if sample is None: - _logger.warning("Sample is %s.", sample) - _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir) - with open(os.path.join(self.log_dir, "graph.json"), "w") as f: - json.dump(self.mutator.graph(sample), f) - self.visualization_enabled = True - - def _write_graph_status(self): - if hasattr(self, "visualization_enabled") and self.visualization_enabled: - print(json.dumps(self.mutator.status()), file=self.status_writer, flush=True) diff --git a/nni/nas/pytorch/utils.py b/nni/nas/pytorch/utils.py deleted file mode 100644 index a3f5aabfb..000000000 --- a/nni/nas/pytorch/utils.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
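To make the trainer contract above concrete, here is a minimal, hypothetical subclass sketch of ``Trainer`` (data loading kept deliberately simple; metrics and logging elided):

.. code-block:: python

    from torch.utils.data import DataLoader

    class MyTrainer(Trainer):
        def train_one_epoch(self, epoch):
            loader = DataLoader(self.dataset_train, batch_size=self.batch_size,
                                num_workers=self.workers, shuffle=True)
            self.model.train()
            for x, y in loader:
                x, y = x.to(self.device), y.to(self.device)
                self.mutator.reset()               # resample an architecture for this step
                loss = self.loss(self.model(x), y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        def validate_one_epoch(self, epoch):
            pass  # validation elided in this sketch

        def checkpoint(self):
            return {"model": self.model.state_dict()}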
- -import logging -from collections import OrderedDict - -import numpy as np -import torch - -_counter = 0 - -_logger = logging.getLogger(__name__) - - -def global_mutable_counting(): - """ - A program level counter starting from 1. - """ - global _counter - _counter += 1 - return _counter - - -def _reset_global_mutable_counting(): - """ - Reset the global mutable counting to count from 1. Useful when defining multiple models with default keys. - """ - global _counter - _counter = 0 - - -def to_device(obj, device): - """ - Move a tensor, tuple, list, or dict onto device. - """ - if torch.is_tensor(obj): - return obj.to(device) - if isinstance(obj, tuple): - return tuple(to_device(t, device) for t in obj) - if isinstance(obj, list): - return [to_device(t, device) for t in obj] - if isinstance(obj, dict): - return {k: to_device(v, device) for k, v in obj.items()} - if isinstance(obj, (int, float, str)): - return obj - raise ValueError("'%s' has unsupported type '%s'" % (obj, type(obj))) - - -def to_list(arr): - if torch.is_tensor(arr): - return arr.cpu().numpy().tolist() - if isinstance(arr, np.ndarray): - return arr.tolist() - if isinstance(arr, (list, tuple)): - return list(arr) - return arr - - -class AverageMeterGroup: - """ - Average meter group for multiple average meters. - """ - - def __init__(self): - self.meters = OrderedDict() - - def update(self, data): - """ - Update the meter group with a dict of metrics. - Non-exist average meters will be automatically created. - """ - for k, v in data.items(): - if k not in self.meters: - self.meters[k] = AverageMeter(k, ":4f") - self.meters[k].update(v) - - def __getattr__(self, item): - return self.meters[item] - - def __getitem__(self, item): - return self.meters[item] - - def __str__(self): - return " ".join(str(v) for v in self.meters.values()) - - def summary(self): - """ - Return a summary string of group data. - """ - return " ".join(v.summary() for v in self.meters.values()) - - -class AverageMeter: - """ - Computes and stores the average and current value. - - Parameters - ---------- - name : str - Name to display. - fmt : str - Format string to print the values. - """ - - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - """ - Reset the meter. - """ - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - """ - Update with value and weight. - - Parameters - ---------- - val : float or int - The new value to be accounted in. - n : int - The weight of the new value. - """ - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def summary(self): - fmtstr = '{name}: {avg' + self.fmt + '}' - return fmtstr.format(**self.__dict__) - - -class StructuredMutableTreeNode: - """ - A structured representation of a search space. - A search space comes with a root (with `None` stored in its `mutable`), and a bunch of children in its `children`. - This tree can be seen as a "flattened" version of the module tree. Since nested mutable entity is not supported yet, - the following must be true: each subtree corresponds to a ``MutableScope`` and each leaf corresponds to a - ``Mutable`` (other than ``MutableScope``). - - Parameters - ---------- - mutable : nni.nas.pytorch.mutables.Mutable - The mutable that current node is linked with. 
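A small sketch of how the tree is built and walked (``lc1``/``lc2`` stand for hypothetical mutables, e.g. layer choices):

    .. code-block:: python

        root = StructuredMutableTreeNode(None)    # the root carries no mutable
        node = root.add_child(lc1)                # add_child returns the newly created node
        node.add_child(lc2)                       # nested under lc1 (e.g. lc1 is a MutableScope)
        keys = [m.key for m in root.traverse()]   # pre-order, deduplicated -> [lc1.key, lc2.key]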
- """ - - def __init__(self, mutable): - self.mutable = mutable - self.children = [] - - def add_child(self, mutable): - """ - Add a tree node to the children list of current node. - """ - self.children.append(StructuredMutableTreeNode(mutable)) - return self.children[-1] - - def type(self): - """ - Return the ``type`` of mutable content. - """ - return type(self.mutable) - - def __iter__(self): - return self.traverse() - - def traverse(self, order="pre", deduplicate=True, memo=None): - """ - Return a generator that generates a list of mutables in this tree. - - Parameters - ---------- - order : str - pre or post. If pre, current mutable is yield before children. Otherwise after. - deduplicate : bool - If true, mutables with the same key will not appear after the first appearance. - memo : dict - An auxiliary dict that memorize keys seen before, so that deduplication is possible. - - Returns - ------- - generator of Mutable - """ - if memo is None: - memo = set() - assert order in ["pre", "post"] - if order == "pre": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - for child in self.children: - for m in child.traverse(order=order, deduplicate=deduplicate, memo=memo): - yield m - if order == "post": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable diff --git a/nni/retiarii/strategy/base.py b/nni/nas/strategy/base.py similarity index 100% rename from nni/retiarii/strategy/base.py rename to nni/nas/strategy/base.py diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/nas/strategy/bruteforce.py similarity index 100% rename from nni/retiarii/strategy/bruteforce.py rename to nni/nas/strategy/bruteforce.py diff --git a/nni/retiarii/strategy/local_debug_strategy.py b/nni/nas/strategy/debug.py similarity index 100% rename from nni/retiarii/strategy/local_debug_strategy.py rename to nni/nas/strategy/debug.py diff --git a/nni/retiarii/strategy/evolution.py b/nni/nas/strategy/evolution.py similarity index 100% rename from nni/retiarii/strategy/evolution.py rename to nni/nas/strategy/evolution.py diff --git a/nni/retiarii/strategy/tpe_strategy.py b/nni/nas/strategy/hpo.py similarity index 100% rename from nni/retiarii/strategy/tpe_strategy.py rename to nni/nas/strategy/hpo.py diff --git a/nni/retiarii/strategy/oneshot.py b/nni/nas/strategy/oneshot.py similarity index 100% rename from nni/retiarii/strategy/oneshot.py rename to nni/nas/strategy/oneshot.py diff --git a/nni/retiarii/strategy/rl.py b/nni/nas/strategy/rl.py similarity index 100% rename from nni/retiarii/strategy/rl.py rename to nni/nas/strategy/rl.py diff --git a/nni/retiarii/strategy/utils.py b/nni/nas/strategy/utils.py similarity index 100% rename from nni/retiarii/strategy/utils.py rename to nni/nas/strategy/utils.py diff --git a/nni/nas/tensorflow/__init__.py b/nni/nas/tensorflow/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/nni/nas/tensorflow/base_mutator.py b/nni/nas/tensorflow/base_mutator.py deleted file mode 100644 index 860680f19..000000000 --- a/nni/nas/tensorflow/base_mutator.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from tensorflow.keras import Model - -from .mutables import Mutable, MutableScope, InputChoice -from .utils import StructuredMutableTreeNode - - -class BaseMutator(Model): - def __init__(self, model): - super().__init__() - self.__dict__['model'] = model - self._structured_mutables = self._parse_search_space(self.model) - - def _parse_search_space(self, module, root=None, prefix='', memo=None, nested_detection=None): - if memo is None: - memo = set() - if root is None: - root = StructuredMutableTreeNode(None) - if module not in memo: - memo.add(module) - if isinstance(module, Mutable): - if nested_detection is not None: - raise RuntimeError('Cannot have nested search space. Error at {} in {}' - .format(module, nested_detection)) - module.name = prefix - module.set_mutator(self) - root = root.add_child(module) - if not isinstance(module, MutableScope): - nested_detection = module - if isinstance(module, InputChoice): - for k in module.choose_from: - if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: - raise RuntimeError('"{}" required by "{}" not found in keys that appeared before, and is not NO_KEY.' - .format(k, module.key)) - for submodule in module.layers: - if not isinstance(submodule, Model): - continue - submodule_prefix = prefix + ('.' if prefix else '') + submodule.name - self._parse_search_space(submodule, root, submodule_prefix, memo=memo, nested_detection=nested_detection) - return root - - @property - def mutables(self): - return self._structured_mutables - - def undedup_mutables(self): - return self._structured_mutables.traverse(deduplicate=False) - - def call(self, *inputs): - raise RuntimeError('Call is undefined for mutators.') - - def __setattr__(self, name, value): - if name == 'model': - raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " - "include your network, as it will include all parameters in model into the mutator.") - return super().__setattr__(name, value) - - def enter_mutable_scope(self, mutable_scope): - pass - - def exit_mutable_scope(self, mutable_scope): - pass - - def on_forward_layer_choice(self, mutable, *inputs): - raise NotImplementedError - - def on_forward_input_choice(self, mutable, tensor_list): - raise NotImplementedError - - def export(self): - raise NotImplementedError diff --git a/nni/nas/tensorflow/mutables.py b/nni/nas/tensorflow/mutables.py deleted file mode 100644 index 06183a34c..000000000 --- a/nni/nas/tensorflow/mutables.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -from collections import OrderedDict - -from tensorflow.keras import Model - -from .utils import global_mutable_counting - - -_logger = logging.getLogger(__name__) - - -class Mutable(Model): - def __init__(self, key=None): - super().__init__() - if key is None: - self._key = '{}_{}'.format(type(self).__name__, global_mutable_counting()) - elif isinstance(key, str): - self._key = key - else: - self._key = str(key) - _logger.warning('Key "%s" is not string, converted to string.', key) - self.init_hook = None - self.forward_hook = None - - def __deepcopy__(self, memodict=None): - raise NotImplementedError("Deep copy doesn't work for mutables.") - - def set_mutator(self, mutator): - if hasattr(self, 'mutator'): - raise RuntimeError('`set_mutator is called more than once. ' - 'Did you parse the search space multiple times? 
' - 'Or did you apply multiple fixed architectures?') - self.mutator = mutator - - def call(self, *inputs): - raise NotImplementedError('Method `call` of Mutable must be overridden') - - def build(self, input_shape): - self._check_built() - - @property - def key(self): - return self._key - - @property - def name(self): - return self._name if hasattr(self, '_name') else self._key - - @name.setter - def name(self, name): - self._name = name - - def _check_built(self): - if not hasattr(self, 'mutator'): - raise ValueError( - "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " - "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " - "so that trainer can locate all your mutables. See NNI docs for more details.".format(self)) - - def __repr__(self): - return '{} ({})'.format(self.name, self.key) - - -class MutableScope(Mutable): - def __call__(self, *args, **kwargs): - try: - self.mutator.enter_mutable_scope(self) - return super().__call__(*args, **kwargs) - finally: - self.mutator.exit_mutable_scope(self) - - -class LayerChoice(Mutable): - def __init__(self, op_candidates, reduction='sum', return_mask=False, key=None): - super().__init__(key=key) - self.names = [] - if isinstance(op_candidates, OrderedDict): - for name in op_candidates: - assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ - "Please don't use a reserved name '{}' for your module.".format(name) - self.names.append(name) - elif isinstance(op_candidates, list): - for i, _ in enumerate(op_candidates): - self.names.append(str(i)) - else: - raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) - - self.length = len(op_candidates) - self.choices = op_candidates - self.reduction = reduction - self.return_mask = return_mask - - def call(self, *inputs): - out, mask = self.mutator.on_forward_layer_choice(self, *inputs) - if self.return_mask: - return out, mask - return out - - def build(self, input_shape): - self._check_built() - for op in self.choices: - op.build(input_shape) - - def __len__(self): - return len(self.choices) - - -class InputChoice(Mutable): - NO_KEY = '' - - def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, reduction='sum', return_mask=False, key=None): - super().__init__(key=key) - assert n_candidates is not None or choose_from is not None, \ - 'At least one of `n_candidates` and `choose_from` must be not None.' - if choose_from is not None and n_candidates is None: - n_candidates = len(choose_from) - elif choose_from is None and n_candidates is not None: - choose_from = [self.NO_KEY] * n_candidates - assert n_candidates == len(choose_from), 'Number of candidates must be equal to the length of `choose_from`.' - assert n_candidates > 0, 'Number of candidates must be greater than 0.' - assert n_chosen is None or 0 <= n_chosen <= n_candidates, \ - 'Expected selected number must be None or no more than number of candidates.' 
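Because the TensorFlow LayerChoice and InputChoice above are being removed wholesale, a short usage sketch may help when reading old models written against them. It assumes a pre-removal NNI installation where nni.nas.tensorflow is still importable; the layer sizes and keys are illustrative, and a concrete mutator still has to parse the model and drive the decisions before the cell can actually run.

from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, SeparableConv2D

from nni.nas.tensorflow.mutables import InputChoice, LayerChoice


class SearchCell(Model):
    def __init__(self):
        super().__init__()
        # Candidate ops for one searchable position; an OrderedDict of
        # name -> op is also accepted if the candidates should carry names.
        # Candidates are expected to produce same-shaped outputs so that
        # reductions such as 'sum' stay well defined.
        self.op = LayerChoice([
            Conv2D(16, 3, padding='same'),
            SeparableConv2D(16, 3, padding='same'),
            MaxPool2D(3, strides=1, padding='same'),
        ], key='cell_op')
        # Pick exactly one of two candidate inputs and also return the mask.
        self.skip = InputChoice(n_candidates=2, n_chosen=1,
                                return_mask=True, key='cell_skip')

    def call(self, x, prev):
        out = self.op(x)                        # dispatched to mutator.on_forward_layer_choice
        chosen, mask = self.skip([out, prev])   # dispatched to mutator.on_forward_input_choice
        return out if chosen is None else out + chosen

The key passed to each mutable is what a mutator's decision cache and export dictionary refer to; when omitted, Mutable.__init__ falls back to an auto-generated '<ClassName>_<counter>' key via global_mutable_counting().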
- - self.n_candidates = n_candidates - self.choose_from = choose_from.copy() - self.n_chosen = n_chosen - self.reduction = reduction - self.return_mask = return_mask - - def call(self, optional_inputs): - optional_input_list = optional_inputs - if isinstance(optional_inputs, dict): - optional_input_list = [optional_inputs[tag] for tag in self.choose_from] - assert isinstance(optional_input_list, list), \ - 'Optional input list must be a list, not a {}.'.format(type(optional_input_list)) - assert len(optional_inputs) == self.n_candidates, \ - 'Length of the input list must be equal to number of candidates.' - out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) - if self.return_mask: - return out, mask - return out diff --git a/nni/nas/tensorflow/mutator.py b/nni/nas/tensorflow/mutator.py deleted file mode 100644 index b0d2aed68..000000000 --- a/nni/nas/tensorflow/mutator.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging - -import tensorflow as tf - -from .base_mutator import BaseMutator - - -_logger = logging.getLogger(__name__) - - -class Mutator(BaseMutator): - def __init__(self, model): - super().__init__(model) - self._cache = {} - - def sample_search(self): - raise NotImplementedError('Method `sample_search` must be overridden') - - def sample_final(self): - raise NotImplementedError('Method `sample_final` must be overriden for exporting') - - def reset(self): - self._cache = self.sample_search() - - def export(self): - return self.sample_final() - - # TODO: status - # TODO: graph - - def on_forward_layer_choice(self, mutable, *inputs): - mask = self._get_decision(mutable) - assert len(mask) == len(mutable), \ - 'Invalid mask, expected {} to be of length {}.'.format(mask, len(mutable)) - out = self._select_with_mask(lambda choice: choice(*inputs), mutable.choices, mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def on_forward_input_choice(self, mutable, tensor_list): - mask = self._get_decision(mutable) - assert len(mask) == mutable.n_candidates, \ - 'Invalid mask, expected {} to be of length {}.'.format(mask, mutable.n_candidates) - out = self._select_with_mask(lambda tensor: tensor, tensor_list, mask) - return self._tensor_reduction(mutable.reduction, out), mask - - def _select_with_mask(self, map_fn, candidates, mask): - if mask.dtype.is_bool: - out = [map_fn(cand) for cand, m in zip(candidates, mask) if m] - elif mask.dtype.is_floating: - out = [map_fn(cand) * m for cand, m in zip(candidates, mask) if m] - else: - raise ValueError('Unrecognized mask, dtype is {}'.format(mask.dtype.name)) - return out - - def _tensor_reduction(self, reduction_type, tensor_list): - if reduction_type == 'none': - return tensor_list - if not tensor_list: - return None - if len(tensor_list) == 1: - return tensor_list[0] - if reduction_type == 'sum': - return sum(tensor_list) - if reduction_type == 'mean': - return sum(tensor_list) / len(tensor_list) - if reduction_type == 'concat': - image_data_format = tf.keras.backend.image_data_format() - if image_data_format == "channels_first": - axis = 0 - else: - axis = -1 - return tf.concat(tensor_list, axis=axis) # pylint: disable=E1120,E1123 - # pylint issue #3613 - raise ValueError('Unrecognized reduction policy: "{}'.format(reduction_type)) - - def _get_decision(self, mutable): - if mutable.key not in self._cache: - raise ValueError('"{}" not found in decision cache.'.format(mutable.key)) - result = self._cache[mutable.key] - 
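The _select_with_mask / _tensor_reduction pair above defines the contract between a decision mask and the candidate outputs: a boolean mask keeps the selected candidates verbatim (hard selection), a floating-point mask scales every kept candidate by its weight (soft selection), and the reduction collapses whatever survived. A plain-TensorFlow sketch of that contract with made-up values, no NNI imports required:

import tensorflow as tf

# Three stand-in candidate outputs of identical shape.
candidates = [tf.fill((1, 4), 1.0), tf.fill((1, 4), 2.0), tf.fill((1, 4), 3.0)]

# Boolean mask: hard selection, the kept candidate passes through unchanged.
bool_mask = tf.constant([False, True, False])
selected = [c for c, m in zip(candidates, bool_mask) if m]
print(sum(selected))                         # reduction='sum' over the single survivor

# Floating mask: soft selection, every kept candidate is scaled by its weight
# (zero-weighted candidates are skipped entirely, as in the deleted code).
float_mask = tf.constant([0.2, 0.5, 0.3])
weighted = [c * m for c, m in zip(candidates, float_mask) if m]
print(sum(weighted))                         # reduction='sum'
print(sum(weighted) / len(weighted))         # reduction='mean'
print(tf.concat(weighted, axis=-1))          # reduction='concat' with channels_last
# reduction='none' would simply hand the list back to the caller.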
_logger.debug('Decision %s: %s', mutable.key, result) - return result diff --git a/nni/nas/tensorflow/utils.py b/nni/nas/tensorflow/utils.py deleted file mode 100644 index 0cfc6e815..000000000 --- a/nni/nas/tensorflow/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import tensorflow as tf - -_counter = 0 - -def global_mutable_counting(): - global _counter - _counter += 1 - return _counter - - -class AverageMeter: - def __init__(self, name): - self.name = name - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val): - self.val = val - self.sum += val - self.count += 1 - self.avg = self.sum / self.count - - def __str__(self): - return '{name} {val:4f} ({avg:4f})'.format(**self.__dict__) - - def summary(self): - return '{name}: {avg:4f}'.format(**self.__dict__) - - -class AverageMeterGroup: - def __init__(self): - self.meters = {} - - def update(self, data): - for k, v in data.items(): - if k not in self.meters: - self.meters[k] = AverageMeter(k) - self.meters[k].update(v) - - def __str__(self): - return ' '.join(str(v) for v in self.meters.values()) - - def summary(self): - return ' '.join(v.summary() for v in self.meters.values()) - - -class StructuredMutableTreeNode: - def __init__(self, mutable): - self.mutable = mutable - self.children = [] - - def add_child(self, mutable): - self.children.append(StructuredMutableTreeNode(mutable)) - return self.children[-1] - - def type(self): - return type(self.mutable) - - def __iter__(self): - return self.traverse() - - def traverse(self, order="pre", deduplicate=True, memo=None): - if memo is None: - memo = set() - assert order in ["pre", "post"] - if order == "pre": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - for child in self.children: - for m in child.traverse(order=order, deduplicate=deduplicate, memo=memo): - yield m - if order == "post": - if self.mutable is not None: - if not deduplicate or self.mutable.key not in memo: - memo.add(self.mutable.key) - yield self.mutable - - -def fill_zero_grads(grads, weights): - ret = [] - for grad, weight in zip(grads, weights): - if grad is not None: - ret.append(grad) - else: - ret.append(tf.zeros_like(weight)) - return ret diff --git a/nni/retiarii/utils.py b/nni/nas/utils/misc.py similarity index 100% rename from nni/retiarii/utils.py rename to nni/nas/utils/misc.py diff --git a/nni/retiarii/serializer.py b/nni/nas/utils/serializer.py similarity index 100% rename from nni/retiarii/serializer.py rename to nni/nas/utils/serializer.py diff --git a/nni/retiarii/oneshot/pytorch/enas.py b/nni/retiarii/oneshot/pytorch/enas.py index 15382589f..75362161c 100644 --- a/nni/retiarii/oneshot/pytorch/enas.py +++ b/nni/retiarii/oneshot/pytorch/enas.py @@ -18,148 +18,6 @@ from .utils import AverageMeterGroup, replace_layer_choice, replace_input_choice _logger = logging.getLogger(__name__) -class StackedLSTMCell(nn.Module): - def __init__(self, layers, size, bias): - super().__init__() - self.lstm_num_layers = layers - self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) - for _ in range(self.lstm_num_layers)]) - - def forward(self, inputs, hidden): - prev_h, prev_c = hidden - next_h, next_c = [], [] - for i, m in enumerate(self.lstm_modules): - curr_h, curr_c = m(inputs, (prev_h[i], prev_c[i])) - next_c.append(curr_c) - next_h.append(curr_h) - # current implementation only supports batch size equals 1, - # but 
the algorithm does not necessarily have this limitation - inputs = curr_h[-1].view(1, -1) - return next_h, next_c - - -class ReinforceField: - """ - A field with ``name``, with ``total`` choices. ``choose_one`` is true if one and only one is meant to be - selected. Otherwise, any number of choices can be chosen. - """ - - def __init__(self, name, total, choose_one): - self.name = name - self.total = total - self.choose_one = choose_one - - def __repr__(self): - return f'ReinforceField(name={self.name}, total={self.total}, choose_one={self.choose_one})' - - -class ReinforceController(nn.Module): - """ - A controller that mutates the graph with RL. - - Parameters - ---------- - fields : list of ReinforceField - List of fields to choose. - lstm_size : int - Controller LSTM hidden units. - lstm_num_layers : int - Number of layers for stacked LSTM. - tanh_constant : float - Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. - skip_target : float - Target probability that skipconnect (chosen by InputChoice) will appear. - If the chosen number of inputs is away from the ``skip_connect``, there will be - a sample skip penalty which is a KL divergence added. - temperature : float - Temperature constant that divides the logits. - entropy_reduction : str - Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. - """ - - def __init__(self, fields, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, - skip_target=0.4, temperature=None, entropy_reduction='sum'): - super(ReinforceController, self).__init__() - self.fields = fields - self.lstm_size = lstm_size - self.lstm_num_layers = lstm_num_layers - self.tanh_constant = tanh_constant - self.temperature = temperature - self.skip_target = skip_target - - self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) - self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) - self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) - self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) - self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), # pylint: disable=not-callable - requires_grad=False) - assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' 
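Since ReinforceField and ReinforceController are being dropped from this module, a usage sketch may be useful for readers tracking where the RL controller went. It targets the pre-change nni.retiarii.oneshot.pytorch.enas module; the field names, reward value, and learning rate below are purely illustrative.

import torch
from nni.retiarii.oneshot.pytorch.enas import ReinforceController, ReinforceField

fields = [
    ReinforceField('op_1', total=4, choose_one=True),     # a LayerChoice with 4 candidates
    ReinforceField('skip_1', total=2, choose_one=False),  # an InputChoice over 2 inputs
]
controller = ReinforceController(fields, lstm_size=64, lstm_num_layers=1)
ctrl_optim = torch.optim.Adam(controller.parameters(), lr=3.5e-4)

sample = controller.resample()        # e.g. {'op_1': 2, 'skip_1': [0, 1]}
reward = torch.tensor(0.73)           # stand-in for the sampled architecture's validation accuracy

# Plain REINFORCE step: maximize the reward-weighted log-probability of the sample.
loss = -(reward * controller.sample_log_prob)
ctrl_optim.zero_grad()
loss.backward()
ctrl_optim.step()

A fuller update would also fold in the entropy bonus (sample_entropy) and the skip penalty (sample_skip_penalty) that the controller accumulates while sampling.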
- self.entropy_reduction = torch.sum if entropy_reduction == 'sum' else torch.mean - self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') - self.soft = nn.ModuleDict({ - field.name: nn.Linear(self.lstm_size, field.total, bias=False) for field in fields - }) - self.embedding = nn.ModuleDict({ - field.name: nn.Embedding(field.total, self.lstm_size) for field in fields - }) - - def resample(self): - self._initialize() - result = dict() - for field in self.fields: - result[field.name] = self._sample_single(field) - return result - - def _initialize(self): - self._inputs = self.g_emb.data - self._c = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self._h = [torch.zeros((1, self.lstm_size), - dtype=self._inputs.dtype, - device=self._inputs.device) for _ in range(self.lstm_num_layers)] - self.sample_log_prob: torch.Tensor = cast(torch.Tensor, 0) - self.sample_entropy: torch.Tensor = cast(torch.Tensor, 0) - self.sample_skip_penalty: torch.Tensor = cast(torch.Tensor, 0) - - def _lstm_next_step(self): - self._h, self._c = self.lstm(self._inputs, (self._h, self._c)) - - def _sample_single(self, field): - self._lstm_next_step() - logit = self.soft[field.name](self._h[-1]) - if self.temperature is not None: - logit /= self.temperature - if self.tanh_constant is not None: - logit = self.tanh_constant * torch.tanh(logit) - if field.choose_one: - sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - log_prob = self.cross_entropy_loss(logit, sampled) - self._inputs = self.embedding[field.name](sampled) - else: - logit = logit.view(-1, 1) - logit = torch.cat([-logit, logit], 1) # pylint: disable=invalid-unary-operand-type - sampled = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) - skip_prob = torch.sigmoid(logit) - kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) - self.sample_skip_penalty += kl - log_prob = self.cross_entropy_loss(logit, sampled) - sampled = sampled.nonzero().view(-1) - if sampled.sum().item(): - self._inputs = (torch.sum(self.embedding[field.name](sampled.view(-1)), 0) / (1. + torch.sum(sampled))).unsqueeze(0) - else: - self._inputs = torch.zeros(1, self.lstm_size, device=self.embedding[field.name].weight.device) # type: ignore - - sampled = sampled.detach().cpu().numpy().tolist() - self.sample_log_prob += self.entropy_reduction(log_prob) - entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type - self.sample_entropy += self.entropy_reduction(entropy) - if len(sampled) == 1: - sampled = sampled[0] - return sampled - - class EnasTrainer(BaseOneShotTrainer): """ ENAS trainer.