Benchmarks: Add Benchmark - Add the source code of the CUDA kernel launch overhead benchmark. (#71)
* Add the CUDA kernel launch overhead benchmark (source part).
* Allow customizing NVCC_ARCHS_SUPPORTED.
* Set SB_MICRO_PATH for Azure pipeline tests.
This commit is contained in:
Parent: 94d3765b49
Commit: 7cfe7c16cf
@@ -16,13 +16,13 @@ steps:
       python3 -m pip install .[test,torch]
     displayName: Install dependencies
   - script: |
-      make cppbuild
+      SB_MICRO_PATH=$PWD/bin make cppbuild
     displayName: Build benchmarks
   - script: |
       python3 setup.py lint
     displayName: Run code lint
   - script: |
-      python3 setup.py test
+      SB_MICRO_PATH=$PWD/bin python3 setup.py test
     displayName: Run unit tests
     timeoutInMinutes: 10
   - script: |
@@ -2,3 +2,4 @@
 BasedOnStyle: LLVM
 ColumnLimit: 120
+IndentWidth: 4
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
+    # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
+    set(NVCC_ARCHS_SUPPORTED "")
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5)
+        list(APPEND NVCC_ARCHS_SUPPORTED 53)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 60 61)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 70)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
+        list(APPEND NVCC_ARCHS_SUPPORTED 72)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 75)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 80)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1)
+        list(APPEND NVCC_ARCHS_SUPPORTED 86)
+    endif()
+endif()
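The list built above controls which compute capabilities nvcc embeds in the benchmark binary; at run time the GPU's own compute capability determines which of them is used. As a side note, the capability of a device can be queried with the standard CUDA runtime API. The standalone sketch below is not part of this commit; it only illustrates how a capability such as 8.0 maps to the "80" entry in NVCC_ARCHS_SUPPORTED.

    // Sketch (not part of this commit): print the compute capability of device 0.
    #include <stdio.h>
    #include "cuda_runtime.h"

    int main() {
        cudaDeviceProp prop;
        cudaError_t err = cudaGetDeviceProperties(&prop, 0);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
            return 1;
        }
        // e.g. "compute capability 8.0" corresponds to arch 80 in the CMake list above.
        printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
        return 0;
    }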
@@ -0,0 +1,11 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+project(kernel_launch_overhead LANGUAGES CUDA CXX)
+
+include(../cuda_common.cmake)
+
+add_executable(kernel_launch_overhead cuda_kernel_launch.cu)
+set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .)
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
+// event mode: using cuda event to record the elapsed time of kernel launch on device.
+// wall mode: using host timer to record the elapsed time of kernel launch on both host and device.
+
+#include <algorithm>
+#include <chrono>
+#include <stdio.h>
+#include <string>
+#include <sys/time.h>
+#include <thread>
+
+#include "cuda_runtime.h"
+
+__global__ void EmptyKernel() {}
+
+double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
+    float time = 0.f;
+    double total_time = 0.0;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    for (int i = 0; i < num_warmups; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+    }
+
+    for (int i = 0; i < num_steps; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&time, start, stop);
+        total_time += time;
+    }
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return total_time;
+}
+
+double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
+    double total_time = 0.0;
+
+    for (int i = 0; i < num_warmups; i++) {
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
+
+    struct timeval begin_tv, end_tv;
+    for (int i = 0; i < num_steps; i++) {
+        gettimeofday(&begin_tv, NULL);
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        gettimeofday(&end_tv, NULL);
+        total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) -
+                       ((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000));
+    }
+
+    return total_time;
+}
+
+char *getCmdOption(char **begin, char **end, const std::string &option) {
+    char **itr = std::find(begin, end, option);
+    if (itr != end && ++itr != end) {
+        return *itr;
+    }
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    int num_warmups = 100;
+    int num_steps = 2000000;
+    int interval = 2000;
+
+    if (char *value = getCmdOption(argv, argv + argc, "-w")) {
+        num_warmups = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-n")) {
+        num_steps = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-i")) {
+        interval = std::stoi(value);
+    }
+
+    // Test the kernel launch event time.
+    double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
+
+    // Sleep for interval milliseconds and run the next test.
+    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
+
+    // Test the kernel launch wall time.
+    double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
+
+    return 0;
+}
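The committed benchmark does not check the return values of the CUDA runtime calls it times. The sketch below is an assumption rather than part of the commit: a hypothetical CUDA_CHECK macro that the event and synchronization calls could be wrapped in, so a failed launch or event operation aborts the run instead of silently skewing the averaged result.

    // Sketch (not part of this commit): error-checking wrapper for CUDA runtime calls.
    #include <stdio.h>
    #include <stdlib.h>
    #include "cuda_runtime.h"

    #define CUDA_CHECK(call)                                                                  \
        do {                                                                                  \
            cudaError_t err_ = (call);                                                        \
            if (err_ != cudaSuccess) {                                                        \
                fprintf(stderr, "%s:%d CUDA error: %s\n", __FILE__, __LINE__,                 \
                        cudaGetErrorString(err_));                                            \
                exit(EXIT_FAILURE);                                                           \
            }                                                                                 \
        } while (0)

    // Possible usage inside the event-timing loop:
    //     CUDA_CHECK(cudaEventRecord(start, 0));
    //     EmptyKernel<<<1, 1>>>();
    //     CUDA_CHECK(cudaGetLastError());
    //     CUDA_CHECK(cudaEventRecord(stop, 0));
    //     CUDA_CHECK(cudaEventSynchronize(stop));

As committed, the binary accepts -w (warmup steps), -n (measured steps), and -i (sleep interval in milliseconds between the two tests), defaulting to 100, 2000000, and 2000 respectively, and reports the per-launch overhead in milliseconds for both the event and wall measurements.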