diff --git a/.vscode/launch.json b/.vscode/launch.json
index d1dff8a0..1b805858 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -209,7 +209,7 @@
             "request": "launch",
             "program": "${cwd}/scripts/supergraph/main.py",
             "console": "integratedTerminal",
-            "args": ["--no-search", "--algos", "manual", "--datasets", "imagenet"]
+            "args": ["--no-search", "--algos", "manual"]
         },
         {
             "name": "Resnet-Full",
diff --git a/archai/common/apex_utils.py b/archai/common/apex_utils.py
index 5a5e79a7..b3a1dd8c 100644
--- a/archai/common/apex_utils.py
+++ b/archai/common/apex_utils.py
@@ -172,7 +172,7 @@ class ApexUtils:
     def is_mixed(self)->bool:
         return self._enabled and self._mixed_prec_enabled
     def is_dist(self)->bool:
-        return self._enabled and self._distributed_enabled
+        return self._enabled and self._distributed_enabled and self.world_size > 1
     def is_master(self)->bool:
         return self.global_rank == 0
     def is_ray(self)->bool:
diff --git a/archai/supergraph/datasets/providers/imagenet_provider.py b/archai/supergraph/datasets/providers/imagenet_provider.py
index 64fb66eb..1bb3b034 100644
--- a/archai/supergraph/datasets/providers/imagenet_provider.py
+++ b/archai/supergraph/datasets/providers/imagenet_provider.py
@@ -4,7 +4,6 @@
 import os
 
 from overrides import overrides
-from PIL import Image
 from torchvision import datasets
 from torchvision.transforms import transforms
 
@@ -59,7 +58,7 @@ class ImagenetProvider(DatasetProvider):
         transform_train = transforms.Compose([
             transforms.RandomResizedCrop(224, scale=(0.08, 1.0),
                 # TODO: these two params are normally not specified
-                interpolation=Image.BICUBIC),
+                interpolation=transforms.InterpolationMode.BICUBIC),
             transforms.RandomHorizontalFlip(),
             transforms.ColorJitter(
                 brightness=0.4,
diff --git a/archai/supergraph/utils/trainer.py b/archai/supergraph/utils/trainer.py
index d7ff8be8..d7797533 100644
--- a/archai/supergraph/utils/trainer.py
+++ b/archai/supergraph/utils/trainer.py
@@ -294,7 +294,8 @@ class Trainer(EnforceOverrides):
                 loss_sum += loss_c.item() * len(logits_c)
                 loss_count += len(logits_c)
 
-                logits_chunks.append(logits_c.detach().cpu()) # pyright: ignore[reportGeneralTypeIssues]
+                # TODO: cannot place on CPU if it was half precision but should we somehow?
+                logits_chunks.append(logits_c.detach()) # pyright: ignore[reportGeneralTypeIssues]
 
             # TODO: original darts clips alphas as well but pt.darts doesn't
             self._apex.clip_grad(self._grad_clip, self.model, self._multi_optim)
@@ -304,7 +305,8 @@ class Trainer(EnforceOverrides):
             # TODO: we possibly need to sync so all replicas are upto date
             self._apex.sync_devices()
 
-            self.post_step(x, y,
+            # TODO: we need to put y on GPU because logits are on GPU. Is this good idea from GPU mem perspective?
+            self.post_step(x, y.to(self.get_device(), non_blocking=True),
                            ml_utils.join_chunks(logits_chunks),
                            torch.tensor(loss_sum/loss_count),
                            steps)
diff --git a/scripts/create_dataroot_symlink.bat b/scripts/create_dataroot_symlink.bat
new file mode 100644
index 00000000..c351d340
--- /dev/null
+++ b/scripts/create_dataroot_symlink.bat
@@ -0,0 +1,2 @@
+REM Creates symbolic link to datasets folder, so that Archai can find the datasets
+mklink /j %USERPROFILE%\dataroot E:\datasets
\ No newline at end of file
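
Note on the apex_utils.py change: the added world_size > 1 guard means a run that has distributed mode configured but only one worker is no longer reported as distributed. A minimal standalone sketch of just that boolean logic (not Archai's actual class; attribute names mirror the diff but the class itself is illustrative):

class DistFlagsSketch:
    """Illustrative only: mirrors the updated is_dist() guard."""
    def __init__(self, enabled: bool, distributed_enabled: bool, world_size: int):
        self._enabled = enabled
        self._distributed_enabled = distributed_enabled
        self.world_size = world_size

    def is_dist(self) -> bool:
        # A single-process run (world_size == 1) behaves like a non-distributed
        # run even when distributed mode is switched on.
        return self._enabled and self._distributed_enabled and self.world_size > 1

assert not DistFlagsSketch(True, True, 1).is_dist()   # one worker: not distributed
assert DistFlagsSketch(True, True, 4).is_dist()       # multiple workers: distributed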
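
Note on the imagenet_provider.py change: it swaps the PIL integer constant for torchvision's own InterpolationMode enum, which newer torchvision versions expect for the interpolation argument. A quick sketch, assuming torchvision >= 0.9 (where the enum exists); the ToTensor() and the dummy image are only there to make the snippet runnable and are not part of Archai's full pipeline:

from PIL import Image
from torchvision import transforms

# Same crop/flip/jitter settings as in the diff; InterpolationMode.BICUBIC
# replaces the old PIL constant.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0),
                                 interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    transforms.ToTensor(),
])

img = Image.new("RGB", (256, 256))   # dummy stand-in for an ImageNet sample
out = transform_train(img)
print(out.shape)                      # torch.Size([3, 224, 224])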
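
Note on the trainer.py change: the detached logit chunks now stay on their compute device (per the TODO about half precision), so y is moved to the same device before post_step to keep downstream metric computation on one device. A hedged sketch of that trade-off; device handling and the torch.cat standing in for ml_utils.join_chunks are illustrative, not Archai's API:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretend these come from chunked forward passes; as in the diff, the detached
# logits stay on their compute device instead of being copied to the CPU.
logits_chunks = [torch.randn(8, 10, device=device).detach() for _ in range(2)]

logits = torch.cat(logits_chunks, dim=0)      # stand-in for ml_utils.join_chunks
y = torch.randint(0, 10, (16,))               # targets typically start on the CPU
y = y.to(device, non_blocking=True)           # mirrors y.to(self.get_device(), non_blocking=True)

accuracy = (logits.argmax(dim=1) == y).float().mean()
print(accuracy.item())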
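
Usage note on the new batch script: mklink /j creates a directory junction rather than a true symbolic link, so it can be run from a regular (non-administrator) command prompt; the E:\datasets target is assumed to exist and should be adjusted to wherever the datasets actually live.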