From 3c490f9cf45e6bf7841932cba3d8f3623a04c461 Mon Sep 17 00:00:00 2001 From: andyG <135115931+Andy666G@users.noreply.github.com> Date: Fri, 2 Aug 2024 04:28:55 +0800 Subject: [PATCH] Use accelerator to replace cuda in setup and runner (#5769) Use the accelerator APIs to select the device in setup.py and to set the visible-devices environment variable in runner.py --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/launcher/runner.py | 13 +++++++------ setup.py | 7 ++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 12dd629b1..f140d73ee 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -403,18 +403,19 @@ def main(args=None): resource_pool = fetch_hostfile(args.hostfile) - # respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters - cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") - if not resource_pool and len(cuda_visible_devices): - detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}" + # respect VISIBLE_DEVICES for a single node and no explicit resource filters + visible_devices_env = get_accelerator().visible_devices_envs()[0] + visible_devices = os.environ.get(visible_devices_env, "") + if not resource_pool and len(visible_devices): + detected_str = f"Detected VISIBLE_DEVICES={visible_devices}" if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: print( f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed." 
) else: - args.include = f"localhost:{cuda_visible_devices}" + args.include = f"localhost:{visible_devices}" print(f"{detected_str}: setting --include={args.include}") - del os.environ["CUDA_VISIBLE_DEVICES"] + del os.environ[visible_devices_env] if args.num_nodes >= 0 or args.num_gpus >= 0: if args.include != "" or args.exclude != "": diff --git a/setup.py b/setup.py index 183d42907..2b7555361 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,8 @@ from op_builder import get_default_compute_capabilities, OpBuilder from op_builder.all_ops import ALL_OPS, accelerator_name from op_builder.builder import installed_cuda_version +from accelerator import get_accelerator + # Fetch rocm state. is_rocm_pytorch = OpBuilder.is_rocm_pytorch() rocm_version = OpBuilder.installed_rocm_version() @@ -91,7 +93,7 @@ extras_require = { } # Add specific cupy version to both onebit extension variants. -if torch_available and torch.cuda.is_available(): +if torch_available and get_accelerator().device_name() == 'cuda': cupy = None if is_rocm_pytorch: rocm_major, rocm_minor = rocm_version @@ -120,7 +122,6 @@ cmdclass = {} # For any pre-installed ops force disable ninja. if torch_available: - from accelerator import get_accelerator use_ninja = is_env_set("DS_ENABLE_NINJA") cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja) @@ -131,7 +132,7 @@ else: TORCH_MAJOR = "0" TORCH_MINOR = "0" -if torch_available and not torch.cuda.is_available(): +if torch_available and not get_accelerator().device_name() == 'cuda': # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486. print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "