From e304cf15728480ce9986e50fe30a7ca25ee40a3d Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Fri, 26 Jul 2024 10:42:17 +0800
Subject: [PATCH] Benchmarks: Micro benchmarks - add support for NVIDIA
 L4/L40/L40s GPUs in gemm-flops (#634)

**Description**
Add GPU arch 8.9 support for NVIDIA L4/L40/L40s GPUs in gemm-flops.
---
 superbench/benchmarks/micro_benchmarks/cuda_common.cmake        | 2 +-
 .../benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py  | 2 ++
 third_party/Makefile                                            | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
index d5cadba9..a2bf7ee8 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
+++ b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
@@ -33,6 +33,6 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
       list(APPEND NVCC_ARCHS_SUPPORTED 86)
     endif()
     if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.8)
-      list(APPEND NVCC_ARCHS_SUPPORTED 90)
+      list(APPEND NVCC_ARCHS_SUPPORTED 89 90)
     endif()
 endif()
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
index 8982905c..fac19a8b 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
@@ -47,6 +47,8 @@ class CudaGemmFlopsBenchmark(GemmFlopsBenchmark):
         # Skip FP64 for RTX Turing/Ampere and Tesla T4/GA10x due to very limited FP64 TFLOP rate
         self.__kernel_map[7.5] = {k: self.__kernel_map[7.0][k] for k in self.__kernel_map[7.0] if 'fp64' not in k}
         self.__kernel_map[8.6] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'fp64' not in k}
+        # Skip FP64 for Ada Lovelace L4/L40/L40s due to very limited FP64 TFLOP rate
+        self.__kernel_map[8.9] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'fp64' not in k}
         # Skip INT4 for Hopper due to no native CUDA/Tensor Cores
         self.__kernel_map[9.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
         self.__parse_logline = [
diff --git a/third_party/Makefile b/third_party/Makefile
index 69623af8..0a47bd45 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -33,7 +33,7 @@ sb_micro_path:
 # Build cutlass.
 cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
-	$(eval ARCHS := "70;75;80;86;90")
+	$(eval ARCHS := "70;75;80;86;89;90")
 else
 	$(eval ARCHS := "70;75;80;86")
 endif