Mirror of https://github.com/microsoft/DeepSpeed.git
[Bug Fix] Support threads_per_head < 64 for wavefront size of 64 (#6622)
When launching the apply_rotary_pos_half kernel, only a threads_per_head of 64 is supported for a wavefront size of 64. This change adds support for threads_per_head < 64, such as 4, 8, and 16. Fixes the issue introduced in https://github.com/microsoft/DeepSpeed/pull/5402

---------

Signed-off-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Logan Adams <loadams@microsoft.com>
This commit is contained in:
Parent: 6c08b7f932
Commit: 2b41d6212c
apply_rotary_pos_emb.cu
@@ -101,7 +101,15 @@ __global__ void apply_rotary_pos_half(T* mixed_query,
 #if defined(__HIP_PLATFORM_AMD__) and ROCM_WAVEFRONT_SIZE == 64
 #define LAUNCH_FOR_ALIGNMENT(ALIGNMENT)              \
-    if (threads_per_head == 64) {                    \
+    if (threads_per_head == 4) {                     \
+        LAUNCH_ROT_POS_EMB_HALF(4, ALIGNMENT);       \
+    } else if (threads_per_head == 8) {              \
+        LAUNCH_ROT_POS_EMB_HALF(8, ALIGNMENT);       \
+    } else if (threads_per_head == 16) {             \
+        LAUNCH_ROT_POS_EMB_HALF(16, ALIGNMENT);      \
+    } else if (threads_per_head == 32) {             \
+        LAUNCH_ROT_POS_EMB_HALF(32, ALIGNMENT);      \
+    } else if (threads_per_head == 64) {             \
         LAUNCH_ROT_POS_EMB_HALF(64, ALIGNMENT);      \
     } else {                                         \
         assert(false);                               \
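The macro change follows a common pattern for mapping a runtime value onto a compile-time constant. The following is a minimal, self-contained C++ sketch of that pattern, not DeepSpeed's actual launcher: launch_rot_pos_emb_half and launch_for_alignment are hypothetical stand-ins for LAUNCH_ROT_POS_EMB_HALF and LAUNCH_FOR_ALIGNMENT, and it assumes the macro's first argument ultimately becomes a compile-time constant (e.g. a kernel template argument).

#include <cassert>
#include <cstdio>

// Hypothetical templated launcher: threadsPerHead must be a compile-time
// constant, so a runtime value has to be mapped onto it through explicit
// branches, as LAUNCH_FOR_ALIGNMENT does in the diff above.
template <int threadsPerHead, int alignment>
void launch_rot_pos_emb_half()
{
    std::printf("launch with threads_per_head=%d, alignment=%d\n", threadsPerHead, alignment);
}

// Illustrative equivalent of the fixed macro: every supported
// threads_per_head value (4, 8, 16, 32, 64) gets its own instantiation;
// anything else is rejected, as before.
template <int alignment>
void launch_for_alignment(int threads_per_head)
{
    if (threads_per_head == 4) {
        launch_rot_pos_emb_half<4, alignment>();
    } else if (threads_per_head == 8) {
        launch_rot_pos_emb_half<8, alignment>();
    } else if (threads_per_head == 16) {
        launch_rot_pos_emb_half<16, alignment>();
    } else if (threads_per_head == 32) {
        launch_rot_pos_emb_half<32, alignment>();
    } else if (threads_per_head == 64) {
        launch_rot_pos_emb_half<64, alignment>();
    } else {
        assert(false);  // unsupported threads_per_head
    }
}

int main()
{
    // Before the fix, only 64 was accepted on the ROCm wavefront-64 path;
    // smaller values such as 8 tripped the assert.
    launch_for_alignment<16>(8);
    launch_for_alignment<16>(64);
}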
@@ -0,0 +1,38 @@ (new test file)
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import torch
import deepspeed
from deepspeed.ops.op_builder import InferenceBuilder
from deepspeed.accelerator import get_accelerator

if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
    pytest.skip("Inference ops are not available on this system", allow_module_level=True)


@pytest.mark.inference_ops
@pytest.mark.parametrize("num_heads", [64, 32, 16, 8])
def test_rope_warp_size_alignment(num_heads):
    if get_accelerator().device_name() != "cuda":
        pytest.skip("This test runs only on GPU")

    batch = 1
    head = 8
    seq_len = 1024
    head_dim = 32
    rotary_dim = 32
    offset = 8
    rotate_half = False
    rope_theta = 2

    cuda0 = torch.device('cuda:0')
    query = torch.randn(batch, head, seq_len, head_dim, device=cuda0)
    key = torch.randn(batch, head, seq_len, head_dim, device=cuda0)

    inference = InferenceBuilder().load()
    # For num_heads values of 64, 32, 16, 8,
    # the corresponding threads_per_head values (defined in apply_rotary_pos_emb.cu) are 4, 8, 16, 32.
    inference.apply_rotary_pos_emb(query, key, rotary_dim, offset, num_heads, rotate_half, rope_theta)
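Per the comment in the test, the parametrized num_heads values of 64, 32, 16, and 8 map to threads_per_head values of 4, 8, 16, and 32, all of which exercise the new branches in LAUNCH_FOR_ALIGNMENT; before this fix those launches hit the assert on the ROCm wavefront-64 path. Assuming the repository's standard pytest setup, the test can be selected with `pytest -m inference_ops -k test_rope_warp_size_alignment`.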