Benchmarks: Add Benchmark - Add the source code of the CUDA kernel launch overhead benchmark. (#71)
* Add the CUDA kernel launch overhead benchmark (source part).
* Allow customizing NVCC_ARCHS_SUPPORTED.
* Set SB_MICRO_PATH for Azure pipeline tests.
This commit is contained in:
Parent: 94d3765b49
Commit: 7cfe7c16cf
@@ -16,13 +16,13 @@ steps:
       python3 -m pip install .[test,torch]
     displayName: Install dependencies
   - script: |
-      make cppbuild
+      SB_MICRO_PATH=$PWD/bin make cppbuild
     displayName: Build benchmarks
   - script: |
       python3 setup.py lint
     displayName: Run code lint
   - script: |
-      python3 setup.py test
+      SB_MICRO_PATH=$PWD/bin python3 setup.py test
     displayName: Run unit tests
     timeoutInMinutes: 10
   - script: |
@@ -2,3 +2,4 @@
 BasedOnStyle: LLVM
 ColumnLimit: 120
+IndentWidth: 4
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
+    # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
+    set(NVCC_ARCHS_SUPPORTED "")
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5)
+        list(APPEND NVCC_ARCHS_SUPPORTED 53)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 60 61)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 70)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
+        list(APPEND NVCC_ARCHS_SUPPORTED 72)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 75)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
+        list(APPEND NVCC_ARCHS_SUPPORTED 80)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1)
+        list(APPEND NVCC_ARCHS_SUPPORTED 86)
+    endif()
+endif()
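The list built above controls which compute capabilities nvcc embeds in the benchmark binary; at run time the GPU's own compute capability determines which of them is used. As a side note, the capability of a device can be queried with the standard CUDA runtime API. The standalone sketch below is not part of this commit; it only illustrates how a capability such as 8.0 maps to the "80" entry in NVCC_ARCHS_SUPPORTED.

    // Sketch (not part of this commit): print the compute capability of device 0.
    #include <stdio.h>
    #include "cuda_runtime.h"

    int main() {
        cudaDeviceProp prop;
        cudaError_t err = cudaGetDeviceProperties(&prop, 0);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
            return 1;
        }
        // e.g. "compute capability 8.0" corresponds to arch 80 in the CMake list above.
        printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
        return 0;
    }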
@@ -0,0 +1,11 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+project(kernel_launch_overhead LANGUAGES CUDA CXX)
+
+include(../cuda_common.cmake)
+
+add_executable(kernel_launch_overhead cuda_kernel_launch.cu)
+set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .)
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
+// event mode: using cuda event to record the elapsed time of kernel launch on device.
+// wall mode: using host timer to record the elapsed time of kernel launch on both host and device.
+
+#include <algorithm>
+#include <chrono>
+#include <stdio.h>
+#include <string>
+#include <sys/time.h>
+#include <thread>
+
+#include "cuda_runtime.h"
+
+__global__ void EmptyKernel() {}
+
+double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
+    float time = 0.f;
+    double total_time = 0.0;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    for (int i = 0; i < num_warmups; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+    }
+
+    for (int i = 0; i < num_steps; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&time, start, stop);
+        total_time += time;
+    }
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return total_time;
+}
+
+double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
+    double total_time = 0.0;
+
+    for (int i = 0; i < num_warmups; i++) {
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
+
+    struct timeval begin_tv, end_tv;
+    for (int i = 0; i < num_steps; i++) {
+        gettimeofday(&begin_tv, NULL);
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        gettimeofday(&end_tv, NULL);
+        total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) -
+                       ((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000));
+    }
+
+    return total_time;
+}
+
+char *getCmdOption(char **begin, char **end, const std::string &option) {
+    char **itr = std::find(begin, end, option);
+    if (itr != end && ++itr != end) {
+        return *itr;
+    }
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    int num_warmups = 100;
+    int num_steps = 2000000;
+    int interval = 2000;
+
+    if (char *value = getCmdOption(argv, argv + argc, "-w")) {
+        num_warmups = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-n")) {
+        num_steps = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-i")) {
+        interval = std::stoi(value);
+    }
+
+    // Test the kernel launch event time.
+    double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
+
+    // Sleep for interval milliseconds and run the next test.
+    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
+
+    // Test the kernel launch wall time.
+    double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
+
+    return 0;
+}
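The committed benchmark does not check the return values of the CUDA runtime calls it times. The sketch below is an assumption rather than part of the commit: a hypothetical CUDA_CHECK macro that the event and synchronization calls could be wrapped in, so a failed launch or event operation aborts the run instead of silently skewing the averaged result.

    // Sketch (not part of this commit): error-checking wrapper for CUDA runtime calls.
    #include <stdio.h>
    #include <stdlib.h>
    #include "cuda_runtime.h"

    #define CUDA_CHECK(call)                                                                  \
        do {                                                                                  \
            cudaError_t err_ = (call);                                                        \
            if (err_ != cudaSuccess) {                                                        \
                fprintf(stderr, "%s:%d CUDA error: %s\n", __FILE__, __LINE__,                 \
                        cudaGetErrorString(err_));                                            \
                exit(EXIT_FAILURE);                                                           \
            }                                                                                 \
        } while (0)

    // Possible usage inside the event-timing loop:
    //     CUDA_CHECK(cudaEventRecord(start, 0));
    //     EmptyKernel<<<1, 1>>>();
    //     CUDA_CHECK(cudaGetLastError());
    //     CUDA_CHECK(cudaEventRecord(stop, 0));
    //     CUDA_CHECK(cudaEventSynchronize(stop));

As committed, the binary accepts -w (warmup steps), -n (measured steps), and -i (sleep interval in milliseconds between the two tests), defaulting to 100, 2000000, and 2000 respectively, and reports the per-launch overhead in milliseconds for both the event and wall measurements.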