"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import os
import sys
import time
import importlib
from pathlib import Path
import subprocess
import shlex
import shutil
import tempfile
import distutils.ccompiler
import distutils.log
import distutils.sysconfig
from distutils.errors import CompileError, LinkError
from abc import ABC, abstractmethod

YELLOW = '\033[93m'
END = '\033[0m'
WARNING = f"{YELLOW} [WARNING] {END}"

DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"

try:
    import torch
except ImportError:
    print(
        f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
    )


def installed_cuda_version():
    import torch.utils.cpp_extension
    cuda_home = torch.utils.cpp_extension.CUDA_HOME
    assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
    # Ensure there is not a cuda version mismatch between torch and nvcc compiler
    output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
                                     universal_newlines=True)
    output_split = output.split()
    release_idx = output_split.index("release")
    release = output_split[release_idx + 1].replace(',', '').split(".")
    # Ignore patch versions, only look at major + minor
    cuda_major, cuda_minor = release[:2]
    return int(cuda_major), int(cuda_minor)
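
# Illustrative parse (assuming typical `nvcc -V` output): the last line looks like
# "Cuda compilation tools, release 11.1, V11.1.105", so the token after "release"
# is "11.1," and installed_cuda_version() returns (11, 1).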


def get_default_compute_capabilities():
    compute_caps = DEFAULT_COMPUTE_CAPABILITIES
    import torch.utils.cpp_extension
    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
        if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
            # Special treatment of CUDA 11.0 because compute_86 is not supported.
            compute_caps += ";8.0"
        else:
            compute_caps += ";8.0;8.6"
    return compute_caps
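
# Illustrative results: with no CUDA toolkit visible this returns "6.0;6.1;7.0";
# with CUDA 11.0 it returns "6.0;6.1;7.0;8.0"; with CUDA 11.1+, "6.0;6.1;7.0;8.0;8.6".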


# list compatible minor CUDA versions - so that for example pytorch built with
# cuda-11.0 can be used to build deepspeed and system-wide installed cuda 11.2
cuda_minor_mismatch_ok = {
    10: ["10.0", "10.1", "10.2"],
    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5"],
}
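
# Example of the policy this table encodes: a system-wide CUDA 11.2 paired with a
# torch built against 11.1 is accepted (both appear under major 11), whereas a
# 10.2 vs 11.1 pairing still raises in assert_no_cuda_mismatch() below.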


def assert_no_cuda_mismatch():
    cuda_major, cuda_minor = installed_cuda_version()
    sys_cuda_version = f'{cuda_major}.{cuda_minor}'
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if sys_cuda_version != torch_cuda_version:
        if (cuda_major in cuda_minor_mismatch_ok
                and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
                and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
            print(f"Installed CUDA version {sys_cuda_version} does not match the "
                  f"version torch was compiled with {torch.version.cuda} "
                  "but since the APIs are compatible, accepting this combination")
            return
        raise Exception(
            f"Installed CUDA version {sys_cuda_version} does not match the "
            f"version torch was compiled with {torch.version.cuda}, unable to compile "
            "cuda/cpp extensions without a matching cuda version.")


def assert_torch_info(torch_info):
    install_torch_version = torch_info['version']
    install_cuda_version = torch_info['cuda_version']

    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    current_torch_version = ".".join(torch.__version__.split('.')[:2])

    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
        raise RuntimeError(
            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
            "with a different version than what is being used at runtime. Please re-install "
            "DeepSpeed or switch torch versions. DeepSpeed install versions: "
            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions: "
            f"torch={current_torch_version}, cuda={current_cuda_version}")


class OpBuilder(ABC):
    def __init__(self, name):
        self.name = name
        self.jit_mode = False

    @abstractmethod
    def absolute_name(self):
        '''
        Returns the absolute import name for cases where the op is pre-installed,
        e.g., deepspeed.ops.adam.cpu_adam will be installed as something like:
        deepspeed/ops/adam/cpu_adam.so
        '''
        pass

    @abstractmethod
    def sources(self):
        '''
        Returns the list of source files for your op, relative to the root of the
        deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        pass

    def include_paths(self):
        '''
        Returns the list of include paths, relative to the root of the deepspeed
        package (i.e., DeepSpeed/deepspeed)
        '''
        return []

    def nvcc_args(self):
        '''
        Returns an optional list of compiler flags to forward to nvcc when
        building CUDA sources
        '''
        return []

    def cxx_args(self):
        '''
        Returns an optional list of compiler flags to forward to the build
        '''
        return []

    def is_compatible(self):
        '''
        Check if all non-python dependencies are satisfied to build this op
        '''
        return True

    def extra_ldflags(self):
        return []
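
    # A minimal sketch of a concrete subclass (the names here are hypothetical;
    # real builders live alongside this file and ship their own C++ sources):
    #
    #   class ExampleOpBuilder(OpBuilder):
    #       def absolute_name(self):
    #           return 'deepspeed.ops.example.example_op'
    #
    #       def sources(self):
    #           return ['csrc/example/example_op.cpp']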

    def libraries_installed(self, libraries):
        valid = False
        for lib in libraries:
            result = subprocess.Popen(f'dpkg -l {lib}',
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      shell=True)
            valid = valid or result.wait() == 0
        return valid

    def has_function(self, funcname, libraries, verbose=False):
        '''
        Test for existence of a function within a tuple of libraries.

        This is used as a smoke test to check whether a certain library is available.
        As a test, this creates a simple C program that calls the specified function,
        and then distutils is used to compile that program and link it with the
        specified libraries. Returns True if both the compile and link are
        successful, False otherwise.
        '''
        tempdir = None  # we create a temporary directory to hold various files
        filestderr = None  # handle to open file to which we redirect stderr
        oldstderr = None  # file descriptor for stderr
        try:
            # Echo compile and link commands that are used.
            if verbose:
                distutils.log.set_verbosity(1)

            # Create a compiler object.
            compiler = distutils.ccompiler.new_compiler(verbose=verbose)

            # Configure compiler and linker to build according to Python install.
            distutils.sysconfig.customize_compiler(compiler)

            # Create a temporary directory to hold test files.
            tempdir = tempfile.mkdtemp()

            # Define a simple C program that calls the function in question.
            prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (
                funcname,
                funcname)

            # Write the test program to a file.
            filename = os.path.join(tempdir, 'test.c')
            with open(filename, 'w') as f:
                f.write(prog)

            # Redirect stderr file descriptor to a file to silence compile/link warnings.
            if not verbose:
                filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w')
                oldstderr = os.dup(sys.stderr.fileno())
                os.dup2(filestderr.fileno(), sys.stderr.fileno())

            # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames().
            # Otherwise, a local directory will be used instead of tempdir.
            drive, driveless_filename = os.path.splitdrive(filename)
            root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else ''
            output_dir = os.path.join(drive, root_dir)

            # Attempt to compile the C program into an object file.
            cflags = shlex.split(os.environ.get('CFLAGS', ""))
            objs = compiler.compile([filename],
                                    output_dir=output_dir,
                                    extra_preargs=self.strip_empty_entries(cflags))

            # Attempt to link the object file into an executable.
            # Be sure to tack on any libraries that have been specified.
            ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
            compiler.link_executable(objs,
                                     os.path.join(tempdir, 'a.out'),
                                     extra_preargs=self.strip_empty_entries(ldflags),
                                     libraries=libraries)

            # Compile and link succeeded
            return True

        except CompileError:
            return False

        except LinkError:
            return False

        except:
            return False

        finally:
            # Restore stderr file descriptor and close the stderr redirect file.
            if oldstderr is not None:
                os.dup2(oldstderr, sys.stderr.fileno())
            if filestderr is not None:
                filestderr.close()

            # Delete the temporary directory holding the test program and stderr files.
            if tempdir is not None:
                shutil.rmtree(tempdir)
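
    # Illustrative use (hypothetical function/library names): a builder could gate
    # an optional dependency with
    #   self.has_function('cblas_sgemm', ('openblas',))
    # and skip the relevant link flags when it returns False.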

    def strip_empty_entries(self, args):
        '''
        Drop any empty strings from the list of compile and link flags
        '''
        return [x for x in args if len(x) > 0]

    def cpu_arch(self):
        if not self.command_exists('lscpu'):
            self.warning(
                f"{self.name} attempted to query 'lscpu' to detect the CPU architecture. "
                "However, 'lscpu' does not appear to exist on "
                "your system; falling back to -march=native.")
            return '-march=native'

        result = subprocess.check_output('lscpu', shell=True)
        result = result.decode('utf-8').strip().lower()
        if 'ppc64le' in result:
            # gcc does not provide -march on PowerPC, use -mcpu instead
            return '-mcpu=native'
        return '-march=native'

    def simd_width(self):
        if not self.command_exists('lscpu'):
            self.warning(
                f"{self.name} attempted to query 'lscpu' to detect the existence "
                "of AVX instructions. However, 'lscpu' does not appear to exist on "
                "your system; falling back to non-vectorized execution.")
            return '-D__SCALAR__'

        try:
            result = subprocess.check_output('lscpu', shell=True)
            result = result.decode('utf-8').strip().lower()
        except Exception as e:
            print(
                f"{WARNING} {self.name} SIMD_WIDTH cannot be recognized due to {str(e)}!"
            )
            return '-D__SCALAR__'

        if 'genuineintel' in result:
            if 'avx512' in result:
                return '-D__AVX512__'
            elif 'avx2' in result:
                return '-D__AVX256__'
        return '-D__SCALAR__'
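
    # The returned define is consumed by the C++ sources, which are expected to
    # branch on __AVX512__ / __AVX256__ / __SCALAR__ to select a SIMD code path.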

    def python_requirements(self):
        '''
        Override if op wants to define special dependencies, otherwise will
        take self.name and load requirements-<op-name>.txt if it exists.
        '''
        path = f'requirements/requirements-{self.name}.txt'
        requirements = []
        if os.path.isfile(path):
            with open(path, 'r') as fd:
                requirements = [r.strip() for r in fd.readlines()]
        return requirements
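
    # For example (illustrative op name), an op named 'example_op' would read
    # requirements/requirements-example_op.txt relative to the current working
    # directory, one requirement per line.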

    def command_exists(self, cmd):
        if '|' in cmd:
            cmds = cmd.split("|")
        else:
            cmds = [cmd]
        valid = False
        for cmd in cmds:
            result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
            valid = valid or result.wait() == 0

        if not valid and len(cmds) > 1:
            print(
                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but none of them exist!"
            )
        elif not valid and len(cmds) == 1:
            print(
                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
            )
        return valid
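
    # Alternatives are separated by '|': e.g. command_exists('llvm-config|llvm-config-9')
    # (illustrative command names) succeeds if either command is on the PATH.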

    def warning(self, msg):
        print(f"{WARNING} {msg}")

    def deepspeed_src_path(self, code_path):
        if os.path.isabs(code_path):
            return code_path
        else:
            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)

    def builder(self):
        from torch.utils.cpp_extension import CppExtension
        return CppExtension(
            name=self.absolute_name(),
            sources=self.strip_empty_entries(self.sources()),
            include_dirs=self.strip_empty_entries(self.include_paths()),
            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

    def load(self, verbose=True):
        from ...git_version_info import installed_ops, torch_info
        if installed_ops[self.name]:
            # Ensure the op we're about to load was compiled with the same
            # torch/cuda versions we are currently using at runtime.
            if isinstance(self, CUDAOpBuilder):
                assert_torch_info(torch_info)

            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)

    def jit_load(self, verbose=True):
        if not self.is_compatible():
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to a hardware/software incompatibility."
            )
        try:
            import ninja
        except ImportError:
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to ninja not being installed."
            )

        if isinstance(self, CUDAOpBuilder):
            assert_no_cuda_mismatch()

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        # Ensure directory exists to prevent race condition in some cases
        ext_path = os.path.join(
            os.environ.get('TORCH_EXTENSIONS_DIR', DEFAULT_TORCH_EXTENSION_PATH),
            self.name)
        os.makedirs(ext_path, exist_ok=True)

        start_build = time.time()
        sources = [self.deepspeed_src_path(path) for path in self.sources()]
        extra_include_paths = [
            self.deepspeed_src_path(path) for path in self.include_paths()
        ]
        op_module = load(
            name=self.name,
            sources=self.strip_empty_entries(sources),
            extra_include_paths=self.strip_empty_entries(extra_include_paths),
            extra_cflags=self.strip_empty_entries(self.cxx_args()),
            extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
            extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
            verbose=verbose)
        build_duration = time.time() - start_build
        if verbose:
            print(f"Time to load {self.name} op: {build_duration} seconds")
        return op_module
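
    # Typical call site (illustrative, reusing the hypothetical subclass above):
    #   op_module = ExampleOpBuilder().load()
    # imports the pre-compiled extension when installed, else JIT-compiles it here.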


class CUDAOpBuilder(OpBuilder):
    def compute_capability_args(self, cross_compile_archs=None):
        """
        Returns nvcc compute capability compile flags.

        1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
        2. If neither is set, the default compute capabilities will be used.
        3. Under `jit_mode`, the compute capabilities of all visible cards will be used, plus PTX.

        Format:

        - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:

          TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
          TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...

        - `cross_compile_archs` uses ; separator.
        """

        ccs = []
        if self.jit_mode:
            # Compile for underlying architectures since we know those at runtime
            for i in range(torch.cuda.device_count()):
                CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i)
                cc = f"{CC_MAJOR}.{CC_MINOR}"
                if cc not in ccs:
                    ccs.append(cc)
            ccs = sorted(ccs)
            ccs[-1] += '+PTX'
        else:
            # Cross-compile mode, compile for various architectures
            # env override takes priority
            cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
            if cross_compile_archs_env is not None:
                if cross_compile_archs is not None:
                    print(
                        f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
                    )
                cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
            else:
                if cross_compile_archs is None:
                    cross_compile_archs = get_default_compute_capabilities()
            ccs = cross_compile_archs.split(';')

        args = []
        for cc in ccs:
            num = cc[0] + cc[2]
            args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
            if cc.endswith('+PTX'):
                args.append(f'-gencode=arch=compute_{num},code=compute_{num}')

        return args
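
    # Illustrative expansion: a capability string "8.6+PTX" yields
    #   -gencode=arch=compute_86,code=sm_86
    #   -gencode=arch=compute_86,code=compute_86
    # (cc[0] and cc[2] pick the major/minor digits out of "8.6").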

    def version_dependent_macros(self):
        # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
        TORCH_MAJOR = int(torch.__version__.split('.')[0])
        TORCH_MINOR = int(torch.__version__.split('.')[1])
        version_ge_1_1 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
            version_ge_1_1 = ['-DVERSION_GE_1_1']
        version_ge_1_3 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
            version_ge_1_3 = ['-DVERSION_GE_1_3']
        version_ge_1_5 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
            version_ge_1_5 = ['-DVERSION_GE_1_5']
        return version_ge_1_1 + version_ge_1_3 + version_ge_1_5

    def is_compatible(self):
        return super().is_compatible()

    def builder(self):
        from torch.utils.cpp_extension import CUDAExtension
        assert_no_cuda_mismatch()
        return CUDAExtension(name=self.absolute_name(),
                             sources=self.strip_empty_entries(self.sources()),
                             include_dirs=self.strip_empty_entries(self.include_paths()),
                             libraries=self.strip_empty_entries(self.libraries_args()),
                             extra_compile_args={
                                 'cxx': self.strip_empty_entries(self.cxx_args()),
                                 'nvcc': self.strip_empty_entries(self.nvcc_args())
                             })

    def cxx_args(self):
        if sys.platform == "win32":
            return ['-O2']
        else:
            return ['-O3', '-std=c++14', '-g', '-Wno-reorder']

    def nvcc_args(self):
        args = [
            '-O3',
            '--use_fast_math',
            '-std=c++17' if sys.platform == "win32" else '-std=c++14',
            '-U__CUDA_NO_HALF_OPERATORS__',
            '-U__CUDA_NO_HALF_CONVERSIONS__',
            '-U__CUDA_NO_HALF2_OPERATORS__'
        ]

        return args + self.compute_capability_args()

    def libraries_args(self):
        if sys.platform == "win32":
            return ['cublas', 'curand']
        else:
            return []