Benchmarks: Add Benchmark - Add the source code of cuda kernel launch overhead benchmark. (#71)

* add cuda kernel launch overhead benchmark - source part.
* can customize the nvcc_archs_support.
* set SB_MICRO_PATH for azure pipeline tests.
This commit is contained in:
guoshzhao 2021-05-18 14:07:27 +08:00 committed by GitHub
Parent 94d3765b49
Commit 7cfe7c16cf
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 154 additions and 2 deletions

View file

@ -16,13 +16,13 @@ steps:
python3 -m pip install .[test,torch] python3 -m pip install .[test,torch]
displayName: Install dependencies displayName: Install dependencies
- script: | - script: |
make cppbuild SB_MICRO_PATH=$PWD/bin make cppbuild
displayName: Build benchmarks displayName: Build benchmarks
- script: | - script: |
python3 setup.py lint python3 setup.py lint
displayName: Run code lint displayName: Run code lint
- script: | - script: |
python3 setup.py test SB_MICRO_PATH=$PWD/bin python3 setup.py test
displayName: Run unit tests displayName: Run unit tests
timeoutInMinutes: 10 timeoutInMinutes: 10
- script: | - script: |

View file

@ -2,3 +2,4 @@
BasedOnStyle: LLVM BasedOnStyle: LLVM
ColumnLimit: 120 ColumnLimit: 120
IndentWidth: 4

View file

@ -0,0 +1,33 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Default CUDA sources to C++11 unless the including project already chose a standard.
if(NOT DEFINED CMAKE_CUDA_STANDARD)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif()
# Build the list of SM architectures the detected nvcc can target.
# Callers may pre-define NVCC_ARCHS_SUPPORTED to restrict/override this list.
if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
# Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
set(NVCC_ARCHS_SUPPORTED "")
# Each block gates an architecture on the minimum CUDA toolkit version that supports it.
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5)
list(APPEND NVCC_ARCHS_SUPPORTED 53)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0)
list(APPEND NVCC_ARCHS_SUPPORTED 60 61)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0)
list(APPEND NVCC_ARCHS_SUPPORTED 70)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
list(APPEND NVCC_ARCHS_SUPPORTED 72)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0)
list(APPEND NVCC_ARCHS_SUPPORTED 75)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
list(APPEND NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1)
list(APPEND NVCC_ARCHS_SUPPORTED 86)
endif()
endif()

View file

@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# CMake 3.18+ is required for the CUDA_ARCHITECTURES target property.
cmake_minimum_required(VERSION 3.18)
project(kernel_launch_overhead LANGUAGES CUDA CXX)
# Shared logic that populates NVCC_ARCHS_SUPPORTED based on the nvcc version.
include(../cuda_common.cmake)
add_executable(kernel_launch_overhead cuda_kernel_launch.cu)
# Build fat binaries for every architecture the toolkit supports.
set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .)

View file

@ -0,0 +1,107 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
// event mode: using cuda event to record the elapsed time of kernel launch on device.
// wall mode: using host timer to record the elapsed time kernel launch on both host and device.
#include <algorithm>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sys/time.h>
#include <thread>
#include "cuda_runtime.h"
// Deliberately empty kernel: launching it measures pure launch overhead with no compute.
__global__ void EmptyKernel() {}
// Times kernel launches on the device with CUDA events.
// Runs num_warmups untimed launches, then num_steps timed launches, and
// returns the total device-side elapsed time in milliseconds (caller divides
// by num_steps for the per-launch average).
double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
    cudaEvent_t begin_evt, end_evt;
    cudaEventCreate(&begin_evt);
    cudaEventCreate(&end_evt);
    float step_ms = 0.f;
    double accumulated_ms = 0.0;
    // Warmup: same launch/record/sync sequence as the timed loop, result discarded.
    for (int step = 0; step < num_warmups; step++) {
        cudaEventRecord(begin_evt, 0);
        EmptyKernel<<<1, 1>>>();
        cudaEventRecord(end_evt, 0);
        cudaEventSynchronize(end_evt);
    }
    // Timed loop: bracket each launch with events and accumulate the elapsed time.
    for (int step = 0; step < num_steps; step++) {
        cudaEventRecord(begin_evt, 0);
        EmptyKernel<<<1, 1>>>();
        cudaEventRecord(end_evt, 0);
        cudaEventSynchronize(end_evt);
        cudaEventElapsedTime(&step_ms, begin_evt, end_evt);
        accumulated_ms += step_ms;
    }
    cudaEventDestroy(begin_evt);
    cudaEventDestroy(end_evt);
    return accumulated_ms;
}
// Times kernel launches from the host's point of view (launch + device completion).
// Runs num_warmups untimed launches, then num_steps timed launches, and returns
// the total host-observed elapsed time in milliseconds (caller divides by
// num_steps for the per-launch average).
double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
    double total_time = 0.0;
    // Warmup launches, synchronized but not timed.
    for (int i = 0; i < num_warmups; i++) {
        EmptyKernel<<<1, 1>>>();
        cudaDeviceSynchronize();
    }
    for (int i = 0; i < num_steps; i++) {
        auto begin = std::chrono::steady_clock::now();
        EmptyKernel<<<1, 1>>>();
        cudaDeviceSynchronize();
        auto end = std::chrono::steady_clock::now();
        // Accumulate in double-precision milliseconds. The previous gettimeofday
        // arithmetic truncated each step to whole milliseconds (integer division
        // of tv_usec by 1000), so sub-millisecond launches counted as 0 and the
        // reported average collapsed toward zero. steady_clock is also monotonic,
        // unlike gettimeofday which can jump on wall-clock adjustments.
        total_time += std::chrono::duration<double, std::milli>(end - begin).count();
    }
    return total_time;
}
// Scans the argv range [begin, end) for `option` and returns the argument that
// follows it, or a null pointer when the option is absent or has no value.
char *getCmdOption(char **begin, char **end, const std::string &option) {
    for (char **it = begin; it != end; ++it) {
        if (option == *it) {
            char **next = it + 1;
            return (next != end) ? *next : nullptr;
        }
    }
    return nullptr;
}
// Entry point. Flags: -w <warmups>, -n <timed steps>, -i <pause between modes, ms>.
// Runs the event-time benchmark, sleeps, then the wall-time benchmark, and prints
// the per-launch average for each. Returns 0 on success, 1 on bad arguments or
// a CUDA error.
int main(int argc, char *argv[]) {
    int num_warmups = 100;   // -w: untimed launches before each measurement
    int num_steps = 2000000; // -n: timed launches per mode
    int interval = 2000;     // -i: pause between the two modes, in ms
    // std::stoi throws std::invalid_argument / std::out_of_range on malformed
    // input; the original code let those escape and abort the process.
    try {
        if (char *value = getCmdOption(argv, argv + argc, "-w")) {
            num_warmups = std::stoi(value);
        }
        if (char *value = getCmdOption(argv, argv + argc, "-n")) {
            num_steps = std::stoi(value);
        }
        if (char *value = getCmdOption(argv, argv + argc, "-i")) {
            interval = std::stoi(value);
        }
    } catch (const std::exception &e) {
        fprintf(stderr, "Invalid numeric argument: %s\n", e.what());
        return 1;
    }
    if (num_warmups < 0 || num_steps <= 0 || interval < 0) {
        fprintf(stderr, "Arguments must be non-negative and -n must be > 0.\n");
        return 1;
    }
    // Test the kernel launch event time.
    double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
    printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
    // Sleep for interval milliseconds and run the next test.
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
    // Test the kernel launch wall time.
    double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
    printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
    // The benchmark loops do not check individual CUDA calls; surface any sticky
    // error (e.g. no device, failed launch) before claiming success.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}