Mirror of https://github.com/microsoft/DeepSpeed.git
[CPU] Skip CPU support unimplemented error (#3633)
* skip cpu support unimplemented error and update cpu inference workflow
* add torch.bfloat16 to cuda_accelerator
* remove UtilsBuilder skip
* fused adam can build
* use cpu adam to implement fused adam
* enable zero stage 1 and 2 for synchronized accelerator (a.k.a. CPU)
* remove unused parameters
* remove skip FusedAdamBuilder; add suported_dtypes
* fix format
* Revert "fix format"
  Revert "remove skip FusedAdamBuilder; add suported_dtypes"
  Revert "remove unused parameters"
  Revert "enable zero stage 1 and 2 for synchronized accelerator (a.k.a. CPU)"
  Revert "use cpu adam to implement fused adam"
  Revert "fused adam can build"

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Ma, Guokai <guokai.ma@intel.com>
This commit is contained in:
Parent: c79a104cf5
Commit: 7290aace9b
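The gist of the change: rather than raising an unimplemented error when an op is missing on the current accelerator (e.g. the CPU backend), the affected test modules skip themselves at collection time. Two gates recur throughout the diff below: an op-compatibility check against deepspeed.ops.__compatible_ops__ and a dtype check against get_accelerator().supported_dtypes(). A condensed sketch of both gates, taken from the pattern repeated in the test files (which op builder a given module checks is illustrative here):

    import pytest
    import torch
    import deepspeed
    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder import FusedAdamBuilder

    # Skip the whole test module if the op cannot be built on this system.
    if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
        pytest.skip("This op had not been implemented on this system.", allow_module_level=True)

    # Skip fp16-only tests on accelerators that do not support torch.half (e.g. CPU).
    if torch.half not in get_accelerator().supported_dtypes():
        pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)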
@@ -76,5 +76,4 @@ jobs:
         source oneCCL/build/_install/env/setvars.sh
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
         cd tests
-        TRANSFORMERS_CACHE=~/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'inference' unit/inference/test_inference_config.py
-        TRANSFORMERS_CACHE=~/tmp/transformers_cache/ pytest $PYTEST_OPTS -k TestDistAllReduce unit/comm/test_dist.py
+        TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/

@@ -156,6 +156,10 @@ class DeepSpeedAccelerator(ABC):
     def is_fp16_supported(self):
         ...
 
+    @abc.abstractmethod
+    def supported_dtypes(self):
+        ...
+
     # Misc
     @abc.abstractmethod
     def amp(self):

@@ -183,7 +183,10 @@ class CPU_Accelerator(DeepSpeedAccelerator):
         return True
 
     def is_fp16_supported(self):
-        return True
+        return False
+
+    def supported_dtypes(self):
+        return [torch.float, torch.bfloat16]
 
     # Tensor operations

@@ -147,6 +147,9 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         else:
             return False
 
+    def supported_dtypes(self):
+        return [torch.float, torch.half, torch.bfloat16]
+
     # Misc
     def amp(self):
         if hasattr(torch.cuda, 'amp'):

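With supported_dtypes() available on every accelerator, calling code can choose a compute dtype without hard-coding CUDA capabilities. A hypothetical helper (not part of this commit) sketching that use:

    import torch
    from deepspeed.accelerator import get_accelerator

    def pick_compute_dtype():
        # Hypothetical helper: prefer fp16, then bf16, then fall back to fp32,
        # based on what the active accelerator reports.
        supported = get_accelerator().supported_dtypes()
        for dtype in (torch.half, torch.bfloat16, torch.float):
            if dtype in supported:
                return dtype
        return torch.float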
setup.py
@@ -155,6 +155,7 @@ install_ops = dict.fromkeys(ALL_OPS.keys(), False)
 for op_name, builder in ALL_OPS.items():
     op_compatible = builder.is_compatible()
     compatible_ops[op_name] = op_compatible
+    compatible_ops["deepspeed_not_implemented"] = False
 
     # If op is requested but not available, throw an error.
     if op_enabled(op_name) and not op_compatible:

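The compatible_ops dict built here is what the installed package exposes as deepspeed.ops.__compatible_ops__, which the test modules below consult; the extra "deepspeed_not_implemented" key appears to act as a sentinel entry that always reads False. A quick sketch of reading the table (assuming an installed DeepSpeed build):

    import deepspeed
    from deepspeed.ops.op_builder import FusedAdamBuilder, InferenceBuilder

    # True/False per op, depending on what could be built on this system.
    fused_adam_ok = deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]
    inference_ok = deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]
    print(fused_adam_ok, inference_ok)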
@@ -5,10 +5,15 @@
 
 import deepspeed
 
+import pytest
 from unit.common import DistributedTest
 from unit.simple_model import *
 
 from unit.checkpoint.common import checkpoint_correctness_verification
+from deepspeed.ops.op_builder import FusedAdamBuilder
+
+if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 
 class TestLatestCheckpoint(DistributedTest):

@@ -13,6 +13,10 @@ from unit.simple_model import SimpleModel
 from deepspeed.accelerator import get_accelerator
 
 import pytest
+from deepspeed.ops.op_builder import FusedAdamBuilder
+
+if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 
 class TestInit(DistributedTest):

@@ -9,6 +9,10 @@ from unit.common import DistributedTest
 from deepspeed.git_version_info import version as ds_version
 import os
 from unit.simple_model import SimpleModel
+from deepspeed.ops.op_builder import FusedAdamBuilder
+
+if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 
 @pytest.fixture

@@ -9,6 +9,7 @@ import pytest
 import deepspeed
 from deepspeed.ops.op_builder import OpBuilder
 from unit.common import DistributedTest
+from deepspeed.accelerator import get_accelerator
 
 from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
 

@@ -28,7 +29,7 @@ class TestHybridEngineTextGen(DistributedTest):
         tokens = tokenizer.batch_encode_plus(prompt, return_tensors="pt", padding=True)
         for t in tokens:
             if torch.is_tensor(tokens[t]):
-                tokens[t] = tokens[t].to(f'cuda:{local_rank}')
+                tokens[t] = tokens[t].to(f'{get_accelerator().device_name()}:{local_rank}')
         output = model.generate(**tokens, do_sample=False, max_length=100)
         outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
         return outputs

@@ -39,7 +40,7 @@ class TestHybridEngineTextGen(DistributedTest):
         model_config.dropout = 0.0
         model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config)
         model = model.half()
-        model = model.to(f'cuda:{local_rank}')
+        model = model.to(f'{get_accelerator().device_name()}:{local_rank}')
         return model
 
     def get_tokenizer(self, model_name):

@@ -9,6 +9,7 @@ import pytest
 import deepspeed
 from deepspeed.ops.op_builder import OpBuilder
 from unit.common import DistributedTest
+from deepspeed.accelerator import get_accelerator
 
 from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
 

@@ -28,7 +29,7 @@ class TestHybridEngineLlama(DistributedTest):
         tokens = tokenizer.batch_encode_plus(prompt, return_tensors="pt", padding=True)
         for t in tokens:
             if torch.is_tensor(tokens[t]):
-                tokens[t] = tokens[t].to(f'cuda:{local_rank}')
+                tokens[t] = tokens[t].to(f'{get_accelerator().device_name()}:{local_rank}')
         #output = model.generate(**tokens, do_sample=False, max_length=100)
         output = model.generate(tokens.input_ids, do_sample=False, max_length=100)
         outputs = tokenizer.batch_decode(output, skip_special_tokens=True)

@@ -42,7 +43,7 @@ class TestHybridEngineLlama(DistributedTest):
         # Make the model smaller so we can run it on a single GPU in CI
         _ = [model.model.layers.pop(-1) for _ in range(8)]
         model = model.half()
-        model = model.to(f'cuda:{local_rank}')
+        model = model.to(f'{get_accelerator().device_name()}:{local_rank}')
         return model
 
     def get_tokenizer(self, model_name):

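The f'{get_accelerator().device_name()}:{local_rank}' substitution is the device-agnostic replacement for the hard-coded cuda: prefix; it resolves to a device string for whichever accelerator is active. In isolation (a minimal sketch; the tensor and rank are placeholders):

    import torch
    from deepspeed.accelerator import get_accelerator

    local_rank = 0  # placeholder; normally taken from the distributed launcher
    device = f'{get_accelerator().device_name()}:{local_rank}'  # e.g. 'cuda:0' on a CUDA build, 'cpu:0' on the CPU backend
    x = torch.ones(4).to(device)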
@@ -13,6 +13,10 @@ from transformers import AutoConfig, AutoModelForCausalLM
 import deepspeed.comm as dist
 from huggingface_hub import snapshot_download
 from transformers.utils import is_offline_mode
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 
 def check_dtype(model, expected_dtype):

@@ -20,6 +20,10 @@ from huggingface_hub import HfApi
 from deepspeed.model_implementations import DeepSpeedTransformerInference
 from torch import nn
 from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 rocm_version = OpBuilder.installed_rocm_version()
 if rocm_version != (0, 0):

@@ -11,6 +11,10 @@ import deepspeed
 from transformers import pipeline
 from unit.common import DistributedTest
 from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
 
 @pytest.mark.inference

@@ -20,6 +20,8 @@ from unit.common import DistributedTest, is_rocm_pytorch
 #pytest.skip(
 #    "transformer kernels are temporarily disabled because of unexplained failures",
 #    allow_module_level=True)
+if torch.half not in get_accelerator().supported_dtypes():
+    pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
 
 
 def check_equal(first, second, atol=1e-2, verbose=False):

@@ -15,6 +15,9 @@ from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
 from deepspeed.accelerator import get_accelerator
 from unit.common import DistributedTest
 
+if torch.half not in get_accelerator().supported_dtypes():
+    pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
+
 
 def check_equal(first, second, atol=1e-2, verbose=False):
     if verbose:

@@ -11,7 +11,10 @@ from deepspeed.ops.adam import FusedAdam
 from deepspeed.ops.adam import DeepSpeedCPUAdam
 from unit.common import DistributedTest
 from unit.simple_model import SimpleModel
+from deepspeed.accelerator import get_accelerator
 
+if torch.half not in get_accelerator().supported_dtypes():
+    pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
 # yapf: disable
 #'optimizer, zero_offload, torch_adam, adam_w_mode, resulting_optimizer
 adam_configs = [["AdamW", False, False, False, (FusedAdam, True)],

@@ -5,8 +5,12 @@
 
 import torch
 import pytest
+import deepspeed
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops import op_builder
+from deepspeed.ops.op_builder import QuantizerBuilder
+
+if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]:
+    pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
 quantizer_cuda_module = None
 

@@ -36,7 +40,7 @@ def run_quant_dequant(inputs, groups, bits):
     global quantizer_cuda_module
 
     if quantizer_cuda_module is None:
-        quantizer_cuda_module = op_builder.QuantizerBuilder().load()
+        quantizer_cuda_module = QuantizerBuilder().load()
     return quantizer_cuda_module.ds_quantize_fp16(inputs, groups, bits)
 

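The op builders follow a lazy-load idiom: .load() JIT-compiles the extension (or loads a pre-built copy) the first time it is needed, and the resulting module is cached in a module-level global so later calls reuse it. The importable-class form used after this change, condensed from run_quant_dequant above and assuming the quantizer op is compatible on the system:

    from deepspeed.ops.op_builder import QuantizerBuilder

    quantizer_cuda_module = None  # built/loaded once, then reused

    def run_quant_dequant(inputs, groups, bits):
        global quantizer_cuda_module
        if quantizer_cuda_module is None:
            # First call pays the JIT-build (or load) cost; subsequent calls are cheap.
            quantizer_cuda_module = QuantizerBuilder().load()
        return quantizer_cuda_module.ds_quantize_fp16(inputs, groups, bits)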
@@ -5,16 +5,20 @@
 
 import pytest
 import torch
-from deepspeed.ops import op_builder
+import deepspeed
+from deepspeed.ops.op_builder import QuantizerBuilder
 from deepspeed.accelerator import get_accelerator
 
+if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]:
+    pytest.skip("Inference ops are not available on this system", allow_module_level=True)
+
 inference_module = None
 
 
 def run_quantize_ds(activations, num_groups, q_bits, is_symmetric_quant):
     global inference_module
     if inference_module is None:
-        inference_module = op_builder.QuantizerBuilder().load()
+        inference_module = QuantizerBuilder().load()
 
     return inference_module.quantize(activations, num_groups, q_bits,
                                      inference_module.Symmetric if is_symmetric_quant else inference_module.Asymmetric)

@@ -23,7 +27,7 @@ def run_quantize_ds(activations, num_groups, q_bits, is_symmetric_quant):
 def run_dequantize_ds(activations, params, num_groups, q_bits, is_symmetric_quant):
     global inference_module
     if inference_module is None:
-        inference_module = op_builder.QuantizerBuilder().load()
+        inference_module = QuantizerBuilder().load()
     return inference_module.dequantize(
         activations,
         params,

@@ -5,9 +5,14 @@
 
 import pytest
 import torch
+import deepspeed
+from deepspeed.ops.op_builder import SpatialInferenceBuilder
 from deepspeed.ops.transformer.inference.bias_add import nhwc_bias_add
 from deepspeed.accelerator import get_accelerator
 
+if not deepspeed.ops.__compatible_ops__[SpatialInferenceBuilder.NAME]:
+    pytest.skip("Inference ops are not available on this system", allow_module_level=True)
+
 
 def allclose(x, y):
     assert x.dtype == y.dtype

@@ -10,6 +10,10 @@ from deepspeed.profiling.flops_profiler import get_model_profile
 from unit.simple_model import SimpleModel, random_dataloader
 from unit.common import DistributedTest
 from unit.util import required_minimum_torch_version
+from deepspeed.accelerator import get_accelerator
+
+if torch.half not in get_accelerator().supported_dtypes():
+    pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
 
 pytestmark = pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=3),
                                 reason='requires Pytorch version 1.3 or above')