Benchmarks: Add Benchmark - Add the source code of cuda kernel launch overhead benchmark. (#71)
* add cuda kernel launch overhead benchmark - source part. * can customize the nvcc_archs_support. * set SB_MICRO_PATH for azure pipeline tests.
This commit is contained in:
Родитель
94d3765b49
Коммит
7cfe7c16cf
|
@ -16,13 +16,13 @@ steps:
|
|||
python3 -m pip install .[test,torch]
|
||||
displayName: Install dependencies
|
||||
- script: |
|
||||
make cppbuild
|
||||
SB_MICRO_PATH=$PWD/bin make cppbuild
|
||||
displayName: Build benchmarks
|
||||
- script: |
|
||||
python3 setup.py lint
|
||||
displayName: Run code lint
|
||||
- script: |
|
||||
python3 setup.py test
|
||||
SB_MICRO_PATH=$PWD/bin python3 setup.py test
|
||||
displayName: Run unit tests
|
||||
timeoutInMinutes: 10
|
||||
- script: |
|
||||
|
|
|
@ -2,3 +2,4 @@
|
|||
|
||||
BasedOnStyle: LLVM
|
||||
ColumnLimit: 120
|
||||
IndentWidth: 4
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
if(NOT DEFINED CMAKE_CUDA_STANDARD)
|
||||
set(CMAKE_CUDA_STANDARD 11)
|
||||
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
|
||||
# Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
|
||||
set(NVCC_ARCHS_SUPPORTED "")
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 53)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 60 61)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 70)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 72)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 75)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 80)
|
||||
endif()
|
||||
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1)
|
||||
list(APPEND NVCC_ARCHS_SUPPORTED 86)
|
||||
endif()
|
||||
endif()
|
|
@ -0,0 +1,11 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(kernel_launch_overhead LANGUAGES CUDA CXX)
|
||||
|
||||
include(../cuda_common.cmake)
|
||||
|
||||
add_executable(kernel_launch_overhead cuda_kernel_launch.cu)
|
||||
set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
|
||||
install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .)
|
|
@ -0,0 +1,107 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
|
||||
// event mode: using cuda event to record the elapsed time of kernel launch on device.
|
||||
// wall mode: using host timer to record the elapsed time kernel launch on both host and device.
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <sys/time.h>
|
||||
#include <thread>
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
__global__ void EmptyKernel() {}
|
||||
|
||||
double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
|
||||
float time = 0.f;
|
||||
double total_time = 0.0;
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
for (int i = 0; i < num_warmups; i++) {
|
||||
cudaEventRecord(start, 0);
|
||||
EmptyKernel<<<1, 1>>>();
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_steps; i++) {
|
||||
cudaEventRecord(start, 0);
|
||||
EmptyKernel<<<1, 1>>>();
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
total_time += time;
|
||||
}
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
|
||||
return total_time;
|
||||
}
|
||||
|
||||
double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
|
||||
double total_time = 0.0;
|
||||
|
||||
for (int i = 0; i < num_warmups; i++) {
|
||||
EmptyKernel<<<1, 1>>>();
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
struct timeval begin_tv, end_tv;
|
||||
for (int i = 0; i < num_steps; i++) {
|
||||
gettimeofday(&begin_tv, NULL);
|
||||
EmptyKernel<<<1, 1>>>();
|
||||
cudaDeviceSynchronize();
|
||||
gettimeofday(&end_tv, NULL);
|
||||
total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) -
|
||||
((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000));
|
||||
}
|
||||
|
||||
return total_time;
|
||||
}
|
||||
|
||||
char *getCmdOption(char **begin, char **end, const std::string &option) {
|
||||
char **itr = std::find(begin, end, option);
|
||||
if (itr != end && ++itr != end) {
|
||||
return *itr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int num_warmups = 100;
|
||||
int num_steps = 2000000;
|
||||
int interval = 2000;
|
||||
|
||||
if (char *value = getCmdOption(argv, argv + argc, "-w")) {
|
||||
num_warmups = std::stoi(value);
|
||||
}
|
||||
|
||||
if (char *value = getCmdOption(argv, argv + argc, "-n")) {
|
||||
num_steps = std::stoi(value);
|
||||
}
|
||||
|
||||
if (char *value = getCmdOption(argv, argv + argc, "-i")) {
|
||||
interval = std::stoi(value);
|
||||
}
|
||||
|
||||
// Test the kernel launch event time.
|
||||
double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
|
||||
printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
|
||||
|
||||
// Sleep for interval milliseconds and run the next test.
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
|
||||
|
||||
// Test the kernel launch wall time.
|
||||
double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
|
||||
printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
|
||||
|
||||
return 0;
|
||||
}
|
Загрузка…
Ссылка в новой задаче