Mirror of https://github.com/microsoft/DeepSpeed.git — 101 lines, 3.5 KiB, Python.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
try:
    from packaging import version as pkg_version
except ImportError:
    # `packaging` is an optional dependency; when absent, version checks
    # fall back to plain string comparison (see FPQuantizerBuilder.is_compatible).
    pkg_version = None

from .builder import CUDAOpBuilder, installed_cuda_version


class FPQuantizerBuilder(CUDAOpBuilder):
    """Op builder for the DeepSpeed FP (floating-point) quantizer CUDA kernels.

    Compatibility requires (per the checks in :meth:`is_compatible`):
    an NVIDIA GPU of compute capability >= 8 (Ampere or newer), CUDA 11+
    (both the system toolkit and the one torch was built with), and a
    triton install whose major.minor is 2.3 or 3.0.
    """

    # Environment variable used to force-enable/disable building this op.
    BUILD_VAR = "DS_BUILD_FP_QUANTIZER"
    NAME = "fp_quantizer"

    def __init__(self, name=None):
        name = self.NAME if name is None else name
        super().__init__(name=name)

    def absolute_name(self):
        """Return the fully qualified module name of the compiled op."""
        return f'deepspeed.ops.fp_quantizer.{self.NAME}_op'

    def is_compatible(self, verbose=False):
        """Return True if this system can build and run the FP quantizer kernels.

        Checks, in order: torch availability, CUDA version / device compute
        capability (skipped on ROCm or when no CUDA device is visible), and
        triton presence/version. Emits warnings for each failure when
        *verbose* is True.
        """
        try:
            import torch
        except ImportError:
            if verbose:
                self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
        if not self.is_rocm_pytorch() and torch.cuda.is_available():  #ignore-cuda
            sys_cuda_major, _ = installed_cuda_version()
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major  #ignore-cuda
            if cuda_capability < 8:
                if verbose:
                    self.warning("NVIDIA Inference is only supported on Ampere and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
                    if verbose:
                        self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False

        try:
            import triton
        except ImportError:
            if verbose:
                # Fixed: this message had a pointless f-string prefix (no placeholders).
                self.warning("please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels")
            return False

        # triton 2.3.{0,1} and 3.0.0 are ok: only major.minor is compared.
        allowed_versions = ("2.3", "3.0")
        if pkg_version:
            allowed = (pkg_version.parse(v) for v in allowed_versions)
            installed_triton = pkg_version.parse(triton.__version__)
            triton_mismatch = all(installed_triton.major != a.major or installed_triton.minor != a.minor
                                  for a in allowed)
        else:
            # Fallback without `packaging`: compare major/minor as strings.
            installed_triton = triton.__version__
            # BUGFIX: the original `major, minor, _ = installed_triton.split(".")`
            # raised ValueError for versions without exactly three dot-separated
            # components (e.g. "2.1.0.post1"); take the first two components instead.
            version_parts = installed_triton.split(".")
            major, minor = version_parts[0], version_parts[1] if len(version_parts) > 1 else "0"
            allowed = (v.split(".") for v in allowed_versions)
            triton_mismatch = all(major != v[0] or minor != v[1] for v in allowed)

        if triton_mismatch:
            if verbose:
                # BUGFIX: the original message was an f-string containing the
                # literal "{0,1}", which Python evaluates as the tuple (0, 1),
                # rendering the warning as "2.3.(0, 1)".
                self.warning(
                    f"FP Quantizer is using an untested triton version ({installed_triton}), only 2.3.0, 2.3.1 and 3.0.0 are known to be compatible with these kernels"
                )
            return False

        return super().is_compatible(verbose) and cuda_okay

    def filter_ccs(self, ccs):
        """Keep only compute capabilities >= 8 (Ampere+); warn about pruned ones."""
        ccs_retained = []
        ccs_pruned = []
        for cc in ccs:
            # cc is a version-like string/sequence; cc[0] is the major capability.
            if int(cc[0]) >= 8:
                ccs_retained.append(cc)
            else:
                ccs_pruned.append(cc)
        if len(ccs_pruned) > 0:
            self.warning(f"Filtered compute capabilities {ccs_pruned}")
        return ccs_retained

    def sources(self):
        """CUDA/C++ source files compiled into this op."""
        return [
            "csrc/fp_quantizer/fp_quantize.cu",
            "csrc/fp_quantizer/fp_quantize.cpp",
        ]

    def extra_ldflags(self):
        """Extra linker flags: the kernels use cuRAND."""
        return ['-lcurand']

    def include_paths(self):
        """Include directories passed to the compiler."""
        return ['csrc/fp_quantizer/includes', 'csrc/includes']