diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 2f9642d1..6468aa07 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -16,13 +16,13 @@ steps: python3 -m pip install .[test,torch] displayName: Install dependencies - script: | - make cppbuild + SB_MICRO_PATH=$PWD/bin make cppbuild displayName: Build benchmarks - script: | python3 setup.py lint displayName: Run code lint - script: | - python3 setup.py test + SB_MICRO_PATH=$PWD/bin python3 setup.py test displayName: Run unit tests timeoutInMinutes: 10 - script: | diff --git a/.clang-format b/.clang-format index 80d11fef..434d4899 100644 --- a/.clang-format +++ b/.clang-format @@ -2,3 +2,4 @@ BasedOnStyle: LLVM ColumnLimit: 120 +IndentWidth: 4 diff --git a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake new file mode 100644 index 00000000..1e777312 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() + +if(NOT DEFINED NVCC_ARCHS_SUPPORTED) + # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89 + set(NVCC_ARCHS_SUPPORTED "") + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5) + list(APPEND NVCC_ARCHS_SUPPORTED 53) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0) + list(APPEND NVCC_ARCHS_SUPPORTED 60 61) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0) + list(APPEND NVCC_ARCHS_SUPPORTED 70) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2) + list(APPEND NVCC_ARCHS_SUPPORTED 72) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0) + list(APPEND NVCC_ARCHS_SUPPORTED 75) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) + list(APPEND NVCC_ARCHS_SUPPORTED 80) + endif() + if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1) + list(APPEND NVCC_ARCHS_SUPPORTED 86) + endif() +endif() diff --git a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt new file mode 100644 index 00000000..77641068 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.18) +project(kernel_launch_overhead LANGUAGES CUDA CXX) + +include(../cuda_common.cmake) + +add_executable(kernel_launch_overhead cuda_kernel_launch.cu) +set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED}) +install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .) diff --git a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu new file mode 100644 index 00000000..46e4d203 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode. +// event mode: using cuda event to record the elapsed time of kernel launch on device. +// wall mode: using host timer to record the elapsed time kernel launch on both host and device. + +#include +#include +#include +#include +#include +#include + +#include "cuda_runtime.h" + +__global__ void EmptyKernel() {} + +double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) { + float time = 0.f; + double total_time = 0.0; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for (int i = 0; i < num_warmups; i++) { + cudaEventRecord(start, 0); + EmptyKernel<<<1, 1>>>(); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + } + + for (int i = 0; i < num_steps; i++) { + cudaEventRecord(start, 0); + EmptyKernel<<<1, 1>>>(); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + total_time += time; + } + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return total_time; +} + +double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) { + double total_time = 0.0; + + for (int i = 0; i < num_warmups; i++) { + EmptyKernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + } + + struct timeval begin_tv, end_tv; + for (int i = 0; i < num_steps; i++) { + gettimeofday(&begin_tv, NULL); + EmptyKernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + gettimeofday(&end_tv, NULL); + total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) - + ((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000)); + } + + return total_time; +} + +char *getCmdOption(char **begin, char **end, const std::string &option) { + char **itr = std::find(begin, end, option); + if (itr != end && ++itr != end) { + return *itr; + } + return 0; +} + +int main(int argc, char *argv[]) { + int num_warmups = 100; + int num_steps = 2000000; + int interval = 2000; + + if (char *value = getCmdOption(argv, argv + argc, "-w")) { + num_warmups = std::stoi(value); + } + + if (char *value = getCmdOption(argv, argv + argc, "-n")) { + num_steps = std::stoi(value); + } + + if (char *value = getCmdOption(argv, argv + argc, "-i")) { + interval = std::stoi(value); + } + + // Test the kernel launch event time. + double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps); + printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps); + + // Sleep for interval milliseconds and run the next test. + std::this_thread::sleep_for(std::chrono::milliseconds(interval)); + + // Test the kernel launch wall time. + double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps); + printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps); + + return 0; +}