зеркало из https://github.com/microsoft/DeepSpeed.git
Use accelerator to replace cuda in setup and runner (#5769)
Use accelerator apis to select device in setup.py and set visible devices env in runner.py --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
This commit is contained in:
Родитель
249c1db2fb
Коммит
3c490f9cf4
|
@@ -403,18 +403,19 @@ def main(args=None):
|
|||
|
||||
resource_pool = fetch_hostfile(args.hostfile)
|
||||
|
||||
# respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters
|
||||
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
||||
if not resource_pool and len(cuda_visible_devices):
|
||||
detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}"
|
||||
# respect VISIBLE_DEVICES for a single node and no explicit resource filters
|
||||
visible_devices_env = get_accelerator().visible_devices_envs()[0]
|
||||
visible_devices = os.environ.get(visible_devices_env, "")
|
||||
if not resource_pool and len(visible_devices):
|
||||
detected_str = f"Detected VISIBLE_DEVICES={visible_devices}"
|
||||
if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0:
|
||||
print(
|
||||
f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed."
|
||||
)
|
||||
else:
|
||||
args.include = f"localhost:{cuda_visible_devices}"
|
||||
args.include = f"localhost:{visible_devices}"
|
||||
print(f"{detected_str}: setting --include={args.include}")
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
del os.environ[visible_devices_env]
|
||||
|
||||
if args.num_nodes >= 0 or args.num_gpus >= 0:
|
||||
if args.include != "" or args.exclude != "":
|
||||
|
|
7
setup.py
7
setup.py
|
@@ -40,6 +40,8 @@ from op_builder import get_default_compute_capabilities, OpBuilder
|
|||
from op_builder.all_ops import ALL_OPS, accelerator_name
|
||||
from op_builder.builder import installed_cuda_version
|
||||
|
||||
from accelerator import get_accelerator
|
||||
|
||||
# Fetch rocm state.
|
||||
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
|
||||
rocm_version = OpBuilder.installed_rocm_version()
|
||||
|
@@ -91,7 +93,7 @@ extras_require = {
|
|||
}
|
||||
|
||||
# Add specific cupy version to both onebit extension variants.
|
||||
if torch_available and torch.cuda.is_available():
|
||||
if torch_available and get_accelerator().device_name() == 'cuda':
|
||||
cupy = None
|
||||
if is_rocm_pytorch:
|
||||
rocm_major, rocm_minor = rocm_version
|
||||
|
@@ -120,7 +122,6 @@ cmdclass = {}
|
|||
|
||||
# For any pre-installed ops force disable ninja.
|
||||
if torch_available:
|
||||
from accelerator import get_accelerator
|
||||
use_ninja = is_env_set("DS_ENABLE_NINJA")
|
||||
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja)
|
||||
|
||||
|
@@ -131,7 +132,7 @@ else:
|
|||
TORCH_MAJOR = "0"
|
||||
TORCH_MINOR = "0"
|
||||
|
||||
if torch_available and not torch.cuda.is_available():
|
||||
if torch_available and not get_accelerator().device_name() == 'cuda':
|
||||
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
|
||||
print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
|
||||
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
|
||||
|
|
Загрузка…
Ссылка в новой задаче