
775 строки
30 KiB
Исходник Обычный вид История

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import os
Quantization + inference release (#1091) Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Elton Zheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: eltonzheng <> Co-authored-by: Arash Ashari <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Reza Yazdani <> Co-authored-by: niumanar <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <> Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Shaden Smith <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <>
2021-05-24 11:10:39 +03:00
import sys
import time
import importlib
from pathlib import Path
import subprocess
import shlex
import shutil
import tempfile
import distutils.ccompiler
import distutils.log
import distutils.sysconfig
from distutils.errors import CompileError, LinkError
from abc import ABC, abstractmethod
from typing import List
YELLOW = '\033[93m'
END = '\033[0m'
DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
# torch is optional at import time: warn when it is missing and only parse its
# version when the import succeeded (otherwise `torch` would be unbound here).
try:
    import torch
except ImportError:
    print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
else:
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
class MissingCUDAException(Exception):
    """Raised when no CUDA toolkit location (``CUDA_HOME``) is available."""


class CUDAMismatchException(Exception):
    """Raised when the installed CUDA version does not match the one torch was compiled with."""
def installed_cuda_version(name=""):
    """Return the ``(major, minor)`` version reported by the nvcc under torch's ``CUDA_HOME``.

    Args:
        name: accepted for call-site compatibility; not used by this function.

    Returns:
        A tuple of two ints: the CUDA major and minor version.

    Raises:
        MissingCUDAException: if ``torch.utils.cpp_extension.CUDA_HOME`` is None.
    """
    import torch.utils.cpp_extension

    toolkit_root = torch.utils.cpp_extension.CUDA_HOME
    if toolkit_root is None:
        raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")

    # Ask nvcc for its version banner; the token right after "release" carries
    # the dotted version (a trailing comma is stripped before splitting).
    banner_tokens = subprocess.check_output([toolkit_root + "/bin/nvcc", "-V"], universal_newlines=True).split()
    version_token = banner_tokens[banner_tokens.index("release") + 1]
    # Only major and minor are kept; any patch component is ignored.
    major, minor = version_token.replace(',', '').split(".")[:2]
    return int(major), int(minor)
def get_default_compute_capabilities():
import torch.utils.cpp_extension
if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
2021-01-07 05:12:39 +03:00
if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
# Special treatment of CUDA 11.0 because compute_86 is not supported.
compute_caps += ";8.0"
compute_caps += ";8.0;8.6"
return compute_caps
# List compatible minor CUDA versions, keyed by major version - so that, for example, pytorch built
# with cuda-11.0 can be used to build deepspeed against a system-wide installed cuda 11.2.
cuda_minor_mismatch_ok = {
    10: ["10.0", "10.1", "10.2"],
    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
    12: ["12.0", "12.1", "12.2", "12.3"],
}
def assert_no_cuda_mismatch(name=""):
    """Verify that the installed CUDA toolkit is usable with the CUDA version torch was compiled with.

    Args:
        name: forwarded to ``installed_cuda_version``.

    Returns:
        True when the ``major.minor`` versions are equal, when both versions are listed for the
        installed major release in ``cuda_minor_mismatch_ok``, or when the environment variable
        ``DS_SKIP_CUDA_CHECK`` is ``"1"``.

    Raises:
        CUDAMismatchException: for any other mismatch.
    """
    cuda_major, cuda_minor = installed_cuda_version(name)
    sys_cuda_version = f'{cuda_major}.{cuda_minor}'
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if sys_cuda_version != torch_cuda_version:
        if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
                and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
            print(f"Installed CUDA version {sys_cuda_version} does not match the "
                  f"version torch was compiled with {torch.version.cuda} "
                  "but since the APIs are compatible, accepting this combination")
            return True
        elif os.getenv("DS_SKIP_CUDA_CHECK", "0") == "1":
            # The override is honoured, but the user must actually see the warning.
            print(
                f"{WARNING} DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
                f"version torch was compiled with {torch.version.cuda}. "
                "Detected `DS_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior."
            )
            return True
        raise CUDAMismatchException(
            f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
            f"version torch was compiled with {torch.version.cuda}, unable to compile "
            "cuda/cpp extensions without a matching cuda version.")
    return True
class OpBuilder(ABC):
_rocm_version = None
_is_rocm_pytorch = None
_is_sycl_enabled = None
_loaded_ops = {}
def __init__(self, name): = name
self.jit_mode = False
self.build_for_cpu = False
self.enable_bf16 = False
self.error_log = None
def absolute_name(self):
Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
will be installed as something like: deepspeed/ops/adam/
def sources(self):
Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
def hipify_extension(self):
def sycl_extension(self):
def validate_torch_version(torch_info):
    """Fail if the torch ``major.minor`` recorded at install time differs from the running torch.

    Args:
        torch_info: mapping whose ``'version'`` entry holds the install-time torch version string.

    Raises:
        RuntimeError: when the recorded version and the runtime version differ.
    """
    # NOTE(review): takes no `self`; presumably declared as a static method, but no
    # decorator line is visible in this copy of the file - confirm.
    installed = torch_info['version']
    runtime = ".".join(torch.__version__.split('.')[:2])
    if installed == runtime:
        return
    raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
                       "with a different version than what is being used at runtime. "
                       f"Please re-install DeepSpeed or switch torch versions. "
                       f"Install torch version={installed}, "
                       f"Runtime torch version={runtime}")
def validate_torch_op_version(torch_info):
    """Fail if the CUDA (or, on ROCm builds, HIP) version recorded at install time differs from the runtime one.

    Args:
        torch_info: mapping holding the install-time ``'cuda_version'`` (read on non-ROCm
            builds) or ``'hip_version'`` (read on ROCm builds).

    Raises:
        RuntimeError: when the recorded and runtime ``major.minor`` versions differ.
    """
    # NOTE(review): takes no `self`; presumably declared as a static method, but no
    # decorator line is visible in this copy of the file - confirm.
    if not OpBuilder.is_rocm_pytorch():
        current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
        install_cuda_version = torch_info['cuda_version']
        if install_cuda_version != current_cuda_version:
            raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
                               "with a different version than what is being used at runtime. "
                               f"Please re-install DeepSpeed or switch torch versions. "
                               f"Install CUDA version={install_cuda_version}, "
                               f"Runtime CUDA version={current_cuda_version}")
    else:
        # The HIP check applies to ROCm builds only; running it after the CUDA
        # check would dereference a HIP version on a CUDA build.
        current_hip_version = ".".join(torch.version.hip.split('.')[:2])
        install_hip_version = torch_info['hip_version']
        if install_hip_version != current_hip_version:
            raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
                               "with a different version than what is being used at runtime. "
                               f"Please re-install DeepSpeed or switch torch versions. "
                               f"Install HIP version={install_hip_version}, "
                               f"Runtime HIP version={current_hip_version}")
def is_rocm_pytorch():
if OpBuilder._is_rocm_pytorch is not None:
return OpBuilder._is_rocm_pytorch
_is_rocm_pytorch = False
import torch
except ImportError:
if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
_is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
if _is_rocm_pytorch:
from torch.utils.cpp_extension import ROCM_HOME
_is_rocm_pytorch = ROCM_HOME is not None
OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
return OpBuilder._is_rocm_pytorch
def is_sycl_enabled():
if OpBuilder._is_sycl_enabled is not None:
return OpBuilder._is_sycl_enabled
_is_sycl_enabled = False
result =["c2s", "--version"], capture_output=True)
_is_sycl_enabled = True
OpBuilder._is_sycl_enabled = _is_sycl_enabled
return OpBuilder._is_sycl_enabled
def installed_rocm_version():
if OpBuilder._rocm_version:
return OpBuilder._rocm_version
if OpBuilder.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version-dev")
if rocm_ver_file.is_file():
with open(rocm_ver_file, 'r') as file:
elif "rocm" in torch.__version__:
ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1]
assert False, "Could not detect ROCm version"
assert ROCM_VERSION_DEV_RAW != "", "Could not detect ROCm version"
OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
return OpBuilder._rocm_version
def include_paths(self):
    """
    Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)

    This base implementation returns an empty list.
    """
    return []
def nvcc_args(self):
    """
    Returns optional list of compiler flags to forward to nvcc when building CUDA sources

    This base implementation returns an empty list.
    """
    return []
def cxx_args(self):
    """
    Returns optional list of compiler flags to forward to the build

    This base implementation returns an empty list.
    """
    return []
def is_compatible(self, verbose=True):
    """
    Check if all non-python dependencies are satisfied to build this op

    This base implementation ignores ``verbose`` and always returns True.
    """
    return True
2021-03-08 23:54:54 +03:00
def extra_ldflags(self):
    """Return extra flags for the link step; this base implementation supplies none."""
    no_flags = []
    return no_flags
def has_function(self, funcname, libraries, verbose=False):
Test for existence of a function within a tuple of libraries.
This is used as a smoke test to check whether a certain library is available.
As a test, this creates a simple C program that calls the specified function,
and then distutils is used to compile that program and link it with the specified libraries.
Returns True if both the compile and link are successful, False otherwise.
tempdir = None # we create a temporary directory to hold various files
filestderr = None # handle to open file to which we redirect stderr
oldstderr = None # file descriptor for stderr
# Echo compile and link commands that are used.
if verbose:
# Create a compiler object.
compiler = distutils.ccompiler.new_compiler(verbose=verbose)
# Configure compiler and linker to build according to Python install.
# Create a temporary directory to hold test files.
tempdir = tempfile.mkdtemp()
# Define a simple C program that calls the function in question
prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)
# Write the test program to a file.
filename = os.path.join(tempdir, 'test.c')
with open(filename, 'w') as f:
# Redirect stderr file descriptor to a file to silence compile/link warnings.
if not verbose:
filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w')
oldstderr = os.dup(sys.stderr.fileno())
os.dup2(filestderr.fileno(), sys.stderr.fileno())
# Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames()
# Otherwise, a local directory will be used instead of tempdir
drive, driveless_filename = os.path.splitdrive(filename)
root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else ''
output_dir = os.path.join(drive, root_dir)
# Attempt to compile the C program into an object file.
cflags = shlex.split(os.environ.get('CFLAGS', ""))
objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))
# Attempt to link the object file into an executable.
# Be sure to tack on any libraries that have been specified.
ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
os.path.join(tempdir, 'a.out'),
# Compile and link succeeded
return True
except CompileError:
return False
except LinkError:
return False
return False
# Restore stderr file descriptor and close the stderr redirect file.
if oldstderr is not None:
os.dup2(oldstderr, sys.stderr.fileno())
if filestderr is not None:
# Delete the temporary directory holding the test program and stderr files.
if tempdir is not None:
def strip_empty_entries(self, args):
    """
    Drop any empty strings from the list of compile and link flags

    Returns a new list; ``args`` itself is not modified and the order of the
    remaining entries is preserved.
    """
    return [x for x in args if len(x) > 0]
def cpu_arch(self):
    """Return the compiler flag that targets the host CPU architecture.

    CPU details are taken from ``py-cpuinfo`` when it can be imported and queried,
    otherwise from ``self._backup_cpuinfo()``.

    Returns:
        ``'-mcpu=native'`` when the reported architecture starts with ``'PPC_'``,
        otherwise ``'-march=native'`` (also when no CPU information is available).
    """
    try:
        from cpuinfo import get_cpu_info
    except ImportError:
        # py-cpuinfo is optional; go straight to the fallback without calling it.
        get_cpu_info = None

    cpu_info = None
    if get_cpu_info is not None:
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
                         "falling back to `lscpu` to get this information.")
    if cpu_info is None:
        cpu_info = self._backup_cpuinfo()
    if cpu_info is None:
        return "-march=native"

    # The fallback leaves 'arch' as None for CPUs it does not recognise.
    arch = cpu_info.get('arch') or ''
    if arch.startswith('PPC_'):
        # gcc does not provide -march on PowerPC, use -mcpu instead
        return '-mcpu=native'
    return '-march=native'
def is_cuda_enable(self):
return '-D__ENABLE_CUDA__'
except MissingCUDAException:
print(f"{WARNING} {} cuda is missing or is incompatible with installed torch, "
"only cpu ops can be compiled!")
return '-D__DISABLE_CUDA__'
return '-D__DISABLE_CUDA__'
def _backup_cpuinfo(self):
    """Build a minimal ``cpu_info`` dict from ``lscpu`` output, similar to what py-cpuinfo provides.

    Returns:
        None when ``lscpu`` is not available (a warning is issued first). Otherwise a dict
        with ``'arch'`` set to ``'X86_64'``, ``'PPC_'`` or None, and ``'flags'`` set to a
        string containing ``'avx512,'`` and/or ``'avx2'`` when those appear in the output.
    """
    if not self.command_exists('lscpu'):
        self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
                     "to detect the CPU architecture. 'lscpu' does not appear to exist on "
                     "your system, will fall back to use -march=native and non-vectorized execution.")
        return None
    # A fixed command needs no shell; the output is matched case-insensitively below.
    result = subprocess.check_output(['lscpu'])
    result = result.decode('utf-8').strip().lower()

    cpu_info = {'arch': None, 'flags': ""}
    if 'genuineintel' in result or 'authenticamd' in result:
        cpu_info['arch'] = 'X86_64'
        # 'avx512' is a substring of 'avx512f', so this single test covers both spellings.
        if 'avx512' in result:
            cpu_info['flags'] += 'avx512,'
        if 'avx2' in result:
            cpu_info['flags'] += 'avx2'
    elif 'ppc64le' in result:
        cpu_info['arch'] = "PPC_"

    return cpu_info
2021-03-08 23:54:54 +03:00
def simd_width(self):
    """Return the preprocessor define that selects the SIMD code path for the host CPU.

    CPU details are taken from ``py-cpuinfo`` when it can be imported and queried,
    otherwise from ``self._backup_cpuinfo()``.

    Returns:
        ``'-D__AVX512__'`` or ``'-D__AVX256__'`` for an ``X86_64`` CPU reporting the
        matching flag, otherwise ``'-D__SCALAR__'`` (also when no CPU information is
        available).
    """
    try:
        from cpuinfo import get_cpu_info
    except ImportError:
        # py-cpuinfo is optional; go straight to the fallback without calling it.
        get_cpu_info = None

    cpu_info = None
    if get_cpu_info is not None:
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
                         "falling back to `lscpu` to get this information.")
    if cpu_info is None:
        cpu_info = self._backup_cpuinfo()
    if cpu_info is None:
        return '-D__SCALAR__'

    if cpu_info.get('arch') == 'X86_64':
        flags = cpu_info.get('flags') or ''
        # Both spellings are tested on purpose: the lscpu fallback stores a string
        # (substring match), while py-cpuinfo presumably returns a list of flag
        # names (whole-element match) - TODO confirm.
        if 'avx512' in flags or 'avx512f' in flags:
            return '-D__AVX512__'
        elif 'avx2' in flags:
            return '-D__AVX256__'
    return '-D__SCALAR__'
2021-03-08 23:54:54 +03:00
def command_exists(self, cmd):
    """Report whether a command is available to the shell.

    Args:
        cmd: a command name, or several alternatives separated by ``|``; a single
            available alternative is enough.

    Returns:
        True if the shell's ``type`` lookup succeeds for at least one candidate.
        Otherwise False, after printing a warning that names the missing command(s).
    """
    # Splitting a string without '|' yields a one-element list, so one split covers both forms.
    cmds = cmd.split("|")
    # subprocess.call waits for every probe it starts, and DEVNULL leaves no unread pipe behind.
    valid = any(
        subprocess.call(f'type {candidate}', stdout=subprocess.DEVNULL, shell=True) == 0 for candidate in cmds)

    if not valid and len(cmds) > 1:
        print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
    elif not valid and len(cmds) == 1:
        print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!")
    return valid
def warning(self, msg):
    """Store ``msg`` (formatted as text) in ``self.error_log`` and print it after the warning prefix."""
    rendered = f"{msg}"
    self.error_log = rendered
    print(f"{WARNING} {msg}")
def deepspeed_src_path(self, code_path):
    """Return ``code_path`` unchanged if absolute, else joined onto the directory two levels above this file."""
    if not os.path.isabs(code_path):
        base_dir = Path(__file__).parent.parent.absolute()
        return os.path.join(base_dir, code_path)
    return code_path
def builder(self):
from torch.utils.cpp_extension import CppExtension
include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
return CppExtension(name=self.absolute_name(),
extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
def load(self, verbose=True):
if in __class__._loaded_ops:
return __class__._loaded_ops[]
from deepspeed.git_version_info import installed_ops, torch_info
[CPU] Support Intel CPU inference (#3041) * add fallback path for kernels used in megatron * temporary numactl WA for SPR 56core * adapt core allocation according to number of ranks * add switch to turn on numactl * detect number of cores on the system * allow select a subset of the cores on the system to bind * remove unneeded changes * add ccl backend * change nccl to ccl * remove unused code * add comm/ccl to ops * initial ccl comm support * first broadcast case passed * add CCL_Backend to DeepSpeed * support comm timer for CPU * support barrier for comm backend * support specify master address from deepspeed command line * support pytorch 2.0 * remove 'block' from api * Tweak for debug Signed-off-by: Cao, Zhong Z <> * Remove unecessary directory Signed-off-by: Cao, Zhong Z <> * Add bf16 kernel support for inference * Add temporary torch implement for cpu inference * Add softmax ops cpu fallback for inference * bind cores to numa domain as well * merge latest change in gma/numactl * initial bf16 kernel support with fallback path * initial fallback path for bloom kernel injection * fix softmax attn mask * check KMP_AFFINITY to avoid conflict with numactl * New CCLBackend which utilize TorchBackend for initialization * rollback last change because there is result error * fix bloom injection policy TP could not work issue. 
injection_policy={BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")} * Use TorchBackend to initialize CCLBackend, make behavior consistent * remove comm under deepspeed/ops * add license header * code clean up * fix format issue * remove magic number in main address * add caching support but not turn on by default * change name of inference_cuda_module to inference_module * Check for is_synchronized_device in accelerator before get Event * fix typo * Fix fallback path of softmax kernel on CUDA device for BF16 data type, because CUDA tril does not support BF16 datatype, enforce fp32 data type * add cpu backend files * change CPU_Accelerator op_builder_dir * remove cpu_kernel_path * using CPU_Accelerator on non-cuda device * fix deepspeed.op_builder => deepspeed.ops.op_builder * add alias for num_gpus: num_accelerators * allow loading cpu_builder in build stage * Assume cuda available if torch not installed * add oneccl_binding_pt to requirements * move oneccl-binding-pt to seperate requiremetns-cpu.txt * add missing file * use dependency_links in setuptools.setup() call for additional dependency links * install oneccl_bind_pt in workflows * change oneccl_bind_pt's version from 1.13 to 2.0 * use intel_exention_for_pytorch as indicator that CPU_Accelerator should be used * Add indicator for Accelerator used * change foo.c to foo.cpp * exclude 'cpu' directory in CUDA op builder reflection * add a cpu-inference workflow * run cpu-inference workflow on self-hosted instance * change cpu runs-on node to v100 node * print out python version in workflow * add verbose in pip command to understand oneccl_bind_pt install issue * update cpu-inference workflow * add a stage to detect instance instruction sets * add back bf16 support for CPU inference * enable autoTP for bloom Signed-off-by: Wang, Yi A <> * update workflow to detect cpu instruction sets * temporary WA for Intel Extension for PyTorch AVX2 instructioon set detection * change cpu-inference workflow machine to 
ubuntu-20.04 * add sharded checkpoint loading for AutoTP path to reduce the peak memory in initialization stage Signed-off-by: Wang, Yi A <> * enable policy for llama * use a special build ipex to test avx2 detection fix * fix format * fix test fail issue Signed-off-by: Wang, Yi A <> * fix gptj sharded checkpoint loading problem Signed-off-by: Wang, Yi A <> * return a not implemented build in get_op_builder in cpu_backend * support cpu device in tests * use cpuinfo to extract number of CPUs * use ~/tmp as transfomer cache rather than /blob/ * Add support for mpich launcher with prefer_deepspeed_comm * add missing modification in accelerator * enable IMPI launcher * remove unused file and fix formatting * clean up ccl.cpp * Less confusing error message when certin op builder are not implemented * Fix license header * Add license header * add license headers * add license header * fix cuda specific code in test * update CPU workflow * use numactl to bind to core * allow bind_cores_to_rank in multi-node impi runner * fix format error * Remove InferenceBuilder * fix format error in * check whether op is in installed ops in * allow override accelerator with DS_ACCELERATOR='cuda','cpu' or 'xpu' * lazy init class_dict in CUDA_Accelerator to avoid cyclic initialization of CUDA_Accelerator * put short path in the beginning in * device_count return number of NUMA nodes * fix typo * install numactl in cpu workflow * Follow comments * Better implementation of device_count() and current_device() * remove dependency_link for Intel Extension for DeepSpeed * use check is_synchronized_device in timer only once * remove env mapping WA in cpu_accelerator * fix duplicate definition * fix format error * refine ccl backend selection * move comments to the right place * remove prefer_deepspeed_comm, use CCLBackend by default * refractor fallback path * Fix execution failure in kernel injection path * do not refractory kernel injection fallback path in residual_add because it contains 
function call with side-effect * guard residual_add fallback path with environ DS_KI_FALLBACK=True * fix format error * add test for allreduce on CPU workflow * fix format error * Fallback to TorchBackend if CCLBackend kernel are not implemented * Update Intel Extension for Pytorch installation link * Don't specify version number of Intel Extension for PyTorch * install oneCCL for CCLBackend * fix link path for CPU comm kernels * fix source oneCCL environment * source oneCCL env before run UT * Give more specific instruction when CCL_ROOT not defined --------- Signed-off-by: Cao, Zhong Z <> Signed-off-by: Wang, Yi A <> Co-authored-by: sdp <> Co-authored-by: Cao, Zhong Z <> Co-authored-by: Zhenhuan Chen <> Co-authored-by: baodii <> Co-authored-by: Wang, Yi A <> Co-authored-by: jianan-gu <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Logan Adams <>
2023-05-16 18:59:22 +03:00
if installed_ops.get(, False):
# Ensure the op we're about to load was compiled with the same
# torch/cuda versions we are currently using at runtime.
if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
op_module = importlib.import_module(self.absolute_name())
__class__._loaded_ops[] = op_module
return op_module
return self.jit_load(verbose)
def jit_load(self, verbose=True):
if not self.is_compatible(verbose):
raise RuntimeError(
f"Unable to JIT load the {} op due to it not being compatible due to hardware/software issue. {self.error_log}"
import ninja # noqa: F401 # type: ignore
except ImportError:
raise RuntimeError(f"Unable to JIT load the {} op due to ninja not being installed.")
if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
self.build_for_cpu = not torch.cuda.is_available()
self.jit_mode = True
from torch.utils.cpp_extension import load
start_build = time.time()
Resolve any '..' in the file paths using os.path.abspath() (#4709) This PR is to resolve any '..' in the file paths like below using os.path.abspath() ``` sources: ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/core_ops.cpp', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/bias_activations/', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cpp', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_layer_norm/', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cpp', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_rms_norm/', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/gated_activations/'] extra_include_paths: ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/includes', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/bias_activations', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/blas_kernels', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_layer_norm', '/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/cuda_rms_norm', 
'/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed/ops/../inference/v2/kernels/core_ops/gated_activations'] ``` It fixes the hipify errors that occur during JIT build of 'inference_core_ops' extension due to ".." prefix in the paths, cc @jithunnair-amd Co-authored-by: Logan Adams <> Co-authored-by: Olatunji Ruwase <>
2023-12-05 21:42:34 +03:00
sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()]
extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()]
# Torch will try and apply whatever CCs are in the arch list at compile time,
# we have already set the intended targets ourselves we know that will be
# needed at runtime. This prevents CC collisions such as multiple __half
# implementations. Stash arch list to reset after build.
torch_arch_list = None
if "TORCH_CUDA_ARCH_LIST" in os.environ:
torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
nvcc_args = self.strip_empty_entries(self.nvcc_args())
cxx_args = self.strip_empty_entries(self.cxx_args())
if isinstance(self, CUDAOpBuilder):
if not self.build_for_cpu and self.enable_bf16:
2023-10-26 20:37:13 +03:00
if self.is_rocm_pytorch():
op_module = load(,
build_duration = time.time() - start_build
if verbose:
print(f"Time to load {} op: {build_duration} seconds")
# Reset arch list so we are not silently removing it for other possible use cases
if torch_arch_list:
os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list
__class__._loaded_ops[] = op_module
return op_module
class CUDAOpBuilder(OpBuilder):
def compute_capability_args(self, cross_compile_archs=None):
Returns nvcc compute capability compile flags.
1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
2. If neither is set default compute capabilities will be used
3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX
- `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:
TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...
- `cross_compile_archs` uses ; separator.
ccs = []
if self.jit_mode:
# Compile for underlying architectures since we know those at runtime
for i in range(torch.cuda.device_count()):
CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i)
cc = f"{CC_MAJOR}.{CC_MINOR}"
if cc not in ccs:
ccs = sorted(ccs)
ccs[-1] += '+PTX'
# Cross-compile mode, compile for various architectures
# env override takes priority
cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
if cross_compile_archs_env is not None:
if cross_compile_archs is not None:
f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
if cross_compile_archs is None:
cross_compile_archs = get_default_compute_capabilities()
ccs = cross_compile_archs.split(';')
ccs = self.filter_ccs(ccs)
if len(ccs) == 0:
raise RuntimeError(
f"Unable to load {} op due to no compute capabilities remaining after filtering")
args = []
self.enable_bf16 = True
for cc in ccs:
num = cc[0] + cc[2]
if cc.endswith('+PTX'):
if int(cc[0]) <= 7:
self.enable_bf16 = False
return args
def filter_ccs(self, ccs: List[str]):
    """
    Prune any compute capabilities that are not compatible with the builder. Should log
    which CCs have been pruned.

    This implementation prunes nothing and returns ``ccs`` unchanged.
    """
    return ccs
def version_dependent_macros(self):
    """Return the ``-DVERSION_GE_1_x`` defines (for 1.1, 1.3 and 1.5) that the running torch version satisfies."""
    # Fix from apex that might be relevant for us as well.
    running = (TORCH_MAJOR, TORCH_MINOR)
    thresholds = (
        ((1, 1), '-DVERSION_GE_1_1'),
        ((1, 3), '-DVERSION_GE_1_3'),
        ((1, 5), '-DVERSION_GE_1_5'),
    )
    # Tuple comparison is lexicographic: major first, then minor.
    return [define for minimum, define in thresholds if running >= minimum]
def is_compatible(self, verbose=True):
# Defers entirely to the parent class's compatibility check, forwarding `verbose` unchanged.
return super().is_compatible(verbose)
def builder(self):
if not self.is_rocm_pytorch():
self.build_for_cpu = False
except MissingCUDAException:
self.build_for_cpu = True
if self.build_for_cpu:
from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
{'cxx': self.strip_empty_entries(self.cxx_args()), \
'nvcc': self.strip_empty_entries(self.nvcc_args())}
if not self.build_for_cpu and self.enable_bf16:
if self.is_rocm_pytorch():
cuda_ext = ExtensionBuilder(name=self.absolute_name(),
if self.is_rocm_pytorch():
# hip converts paths to absolute, this converts back to relative
sources = cuda_ext.sources
curr_file = Path(__file__).parent.parent # ds root
for i in range(len(sources)):
src = Path(sources[i])
if src.is_absolute():
sources[i] = str(src.relative_to(curr_file))
sources[i] = str(src)
cuda_ext.sources = sources
return cuda_ext
Quantization + inference release (#1091) Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Elton Zheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: eltonzheng <> Co-authored-by: Arash Ashari <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Reza Yazdani <> Co-authored-by: niumanar <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <> Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Shaden Smith <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <>
2021-05-24 11:10:39 +03:00
def hipify_extension(self):
if self.is_rocm_pytorch():
from torch.utils.hipify import hipify_python
includes=[os.path.join(os.getcwd(), '*')],
extra_files=[os.path.abspath(s) for s in self.sources()],
Quantization + inference release (#1091) Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Elton Zheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: eltonzheng <> Co-authored-by: Arash Ashari <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Reza Yazdani <> Co-authored-by: niumanar <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <> Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Shaden Smith <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <>
2021-05-24 11:10:39 +03:00
def cxx_args(self):
    """Return the C++ compiler flags: only ``-O2`` on Windows, otherwise ``-O3`` with C++17, debug info and ``-Wno-reorder``."""
    on_windows = sys.platform == "win32"
    return ['-O2'] if on_windows else ['-O3', '-std=c++17', '-g', '-Wno-reorder']
Quantization + inference release (#1091) Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Elton Zheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: eltonzheng <> Co-authored-by: Arash Ashari <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Reza Yazdani <> Co-authored-by: niumanar <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <> Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Shaden Smith <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <>
2021-05-24 11:10:39 +03:00
def nvcc_args(self):
if self.build_for_cpu:
return []
args = ['-O3']
if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
args += [
nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
if nvcc_threads <= 0:
raise ValueError("")
except ValueError:
nvcc_threads = min(os.cpu_count(), 8)
cuda_major, _ = installed_cuda_version()
args += [
'-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
'-std=c++17' if cuda_major > 10 else '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', f'--threads={nvcc_threads}'
if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
args += self.compute_capability_args()
return args
Quantization + inference release (#1091) Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Elton Zheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: eltonzheng <> Co-authored-by: Arash Ashari <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: Reza Yazdani <> Co-authored-by: niumanar <> Co-authored-by: eltonzheng <> Co-authored-by: Reza Yazdani <> Co-authored-by: Shaden Smith <> Co-authored-by: Reza Yazdani <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <> Co-authored-by: Jeff Rasley <> Co-authored-by: eltonzheng <> Co-authored-by: Shaden Smith <> Co-authored-by: Arash Ashari <> Co-authored-by: Olatunji Ruwase <> Co-authored-by: niumanar <>
2021-05-24 11:10:39 +03:00
def libraries_args(self):
    """Return the library names to link against.

    Returns:
        ``['cublas', 'curand']`` for a non-CPU build on Windows; an empty list for a
        CPU build or on any other platform.
    """
    if self.build_for_cpu:
        return []

    if sys.platform == "win32":
        return ['cublas', 'curand']
    return []
class TorchCPUOpBuilder(CUDAOpBuilder):
def extra_ldflags(self):
    """Return extra link flags: ``-fopenmp`` for a CPU build, ``-lcurand`` for a non-ROCm build, else none."""
    if self.build_for_cpu:
        flags = ['-fopenmp']
    elif self.is_rocm_pytorch():
        flags = []
    else:
        flags = ['-lcurand']
    return flags
def cxx_args(self):
import torch
args = []
if not self.build_for_cpu:
if not self.is_rocm_pytorch():
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
if not os.path.exists(CUDA_LIB64):
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib")
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
args += super().cxx_args()
args += [
CPU_ARCH = self.cpu_arch()
SIMD_WIDTH = self.simd_width()
CUDA_ENABLE = self.is_cuda_enable()
args += [
return args