Benchmarks: micro benchmarks - add int8 support for cublaslt function (#574)
**Description:** Add int8 support for the cublasLt GEMM function.
This commit is contained in:
Parent
c7800bb8e0
Commit
f53d941a22
|
@ -23,7 +23,7 @@ class CublasLtBenchmark(MicroBenchmarkWithInvoke):
|
|||
super().__init__(name, parameters)
|
||||
|
||||
self._bin_name = 'cublaslt_gemm'
|
||||
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
|
||||
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8']
|
||||
|
||||
def mrange(self, start, stop=-1, multiplication_factor=2):
|
||||
"""Range constructor with multiplication factor.
|
||||
|
|
|
@ -16,6 +16,7 @@ using fp16 = half;
|
|||
using bf16 = nv_bfloat16;
|
||||
using fp8e4m3 = __nv_fp8_e4m3;
|
||||
using fp8e5m2 = __nv_fp8_e5m2;
|
||||
using int8 = int8_t;
|
||||
|
||||
struct Args {
|
||||
int m = 16;
|
||||
|
@ -84,6 +85,8 @@ template <typename T> cudaDataType_t get_datatype() {
|
|||
return CUDA_R_8F_E4M3;
|
||||
if (std::is_same<T, fp8e5m2>::value)
|
||||
return CUDA_R_8F_E5M2;
|
||||
if (std::is_same<T, int8>::value)
|
||||
return CUDA_R_8I;
|
||||
throw std::invalid_argument("Unknown type");
|
||||
}
|
||||
|
||||
|
@ -162,6 +165,8 @@ int main(int argc, char **argv) {
|
|||
run<fp8e4m3, fp8e4m3, fp16>(&args);
|
||||
else if (args.in_type == "fp8e5m2")
|
||||
run<fp8e5m2, fp8e4m3, fp16>(&args);
|
||||
else if (args.in_type == "int8")
|
||||
run<int8>(&args);
|
||||
else
|
||||
throw std::invalid_argument("Unknown type " + args.in_type);
|
||||
|
||||
|
|
|
@ -62,6 +62,8 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
|
|||
gemm_compute_type = CUBLAS_COMPUTE_32F;
|
||||
if (a_type == CUDA_R_64F || b_type == CUDA_R_64F)
|
||||
gemm_compute_type = CUBLAS_COMPUTE_64F;
|
||||
if (a_type == CUDA_R_8I)
|
||||
gemm_compute_type = CUBLAS_COMPUTE_32I;
|
||||
|
||||
cublasLtMatmulDesc_t op_desc = nullptr;
|
||||
CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
|
||||
|
|
|
@ -63,15 +63,15 @@ class CublasLtBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
|
|||
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
|
||||
benchmark = benchmark_cls(
|
||||
self.benchmark_name,
|
||||
parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
|
||||
parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64 int8',
|
||||
)
|
||||
self.assertTrue(benchmark._preprocess())
|
||||
self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))
|
||||
self.assertEqual(4 * (2 * 2 * 3 + 2) * len(benchmark._args.in_types), len(benchmark._commands))
|
||||
|
||||
def cmd(t, b, m, n, k):
|
||||
return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'
|
||||
|
||||
for _t in ['fp16', 'fp32', 'fp64']:
|
||||
for _t in ['fp16', 'fp32', 'fp64', 'int8']:
|
||||
for _b in [2, 4, 8, 16]:
|
||||
for _m in [2, 4]:
|
||||
for _n in [4, 8]:
|
||||
|
|
Loading…
Link in new issue