Benchmarks: Revise Code - Reduce result variance in gpu_copy benchmark (#298)

**Description**
This commit does the following to reduce result variance in the gpu_copy benchmark:
1) Add warmup phase for gpu_copy benchmark to avoid timing instability caused by first-time CUDA kernel launch overhead;
2) Use CUDA events for timing instead of CPU timestamps;
3) Make data checking optional, since it should normally be disabled in performance tests;
4) Enlarge message size in performance benchmark.
This commit is contained in:
Ziyue Yang 2022-02-07 13:16:13 +08:00 коммит произвёл GitHub
Родитель 28195be6db
Коммит 853890559a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 147 добавлений и 28 удалений

Просмотреть файл

@ -48,11 +48,19 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
self._parser.add_argument(
'--size',
type=int,
default=64 * 1024**2,
default=256 * 1024**2,
required=False,
help='Size of data buffer in bytes.',
)
self._parser.add_argument(
'--num_warm_up',
type=int,
default=20,
required=False,
help='Number of warm up rounds',
)
self._parser.add_argument(
'--num_loops',
type=int,
@ -78,7 +86,9 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
args = '--size %d --num_loops %d' % (self._args.size, self._args.num_loops)
args = '--size %d --num_warm_up %d --num_loops %d' % (
self._args.size, self._args.num_warm_up, self._args.num_loops
)
for mem_type in self._args.mem_type:
args += ' --%s' % mem_type
for copy_type in self._args.copy_type:

Просмотреть файл

@ -3,7 +3,6 @@
// GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <string>
@ -52,6 +51,12 @@ struct SubBenchArgs {
// CUDA stream to be used.
cudaStream_t stream;
// CUDA event to record start time.
cudaEvent_t start_event;
// CUDA event to record end time.
cudaEvent_t end_event;
};
// Arguments for each benchmark run.
@ -69,6 +74,9 @@ struct BenchArgs {
// Data buffer size used.
uint64_t size = 0;
// Number of warm up rounds to run.
uint64_t num_warm_up = 0;
// Number of loops to run.
uint64_t num_loops = 0;
@ -82,10 +90,13 @@ struct BenchArgs {
// Options accepted by this program.
struct Opts {
// Data buffer size for copy benchmark.
uint64_t size;
uint64_t size = 0;
// Data buffer size for copy benchmark.
uint64_t num_loops;
// Number of warm up rounds to run.
uint64_t num_warm_up = 0;
// Number of loops to run.
uint64_t num_loops = 0;
// Whether GPU SM copy needs to be evaluated.
bool sm_copy_enabled = false;
@ -110,6 +121,7 @@ struct Opts {
void PrintUsage() {
printf("Usage: gpu_copy "
"--size <size> "
"--num_warm_up <num_warm_up> "
"--num_loops <num_loops> "
"[--sm_copy] "
"[--dma_copy] "
@ -123,7 +135,8 @@ void PrintUsage() {
int ParseOpts(int argc, char **argv, Opts *opts) {
enum class OptIdx {
kSize,
kNumIters,
kNumWarmUp,
kNumLoops,
kEnableSmCopy,
kEnableDmaCopy,
kEnableHToD,
@ -133,7 +146,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
};
const struct option options[] = {
{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumIters)},
{"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
{"sm_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableSmCopy)},
{"dma_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDmaCopy)},
{"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)},
@ -143,12 +157,13 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
int getopt_ret = 0;
int opt_idx = 0;
bool size_specified = false;
bool num_warm_up_specified = false;
bool num_loops_specified = false;
bool parse_err = false;
while (true) {
getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
if (getopt_ret == -1) {
if (!size_specified || !num_loops_specified) {
if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
parse_err = true;
}
break;
@ -165,7 +180,15 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
size_specified = true;
}
break;
case static_cast<int>(OptIdx::kNumIters):
case static_cast<int>(OptIdx::kNumWarmUp):
if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
fprintf(stderr, "Invalid num_warm_up: %s\n", optarg);
parse_err = true;
} else {
num_warm_up_specified = true;
}
break;
case static_cast<int>(OptIdx::kNumLoops):
if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
fprintf(stderr, "Invalid num_loops: %s\n", optarg);
parse_err = true;
@ -306,6 +329,28 @@ int PrepareBufAndStream(BenchArgs *args) {
return 0;
}
// Create the start/end CUDA timing events for every sub-benchmark.
// Returns 0 on success, -1 on any CUDA error or device-selection failure.
int PrepareEvent(BenchArgs *args) {
    for (int sub_idx = 0; sub_idx < args->num_subs; sub_idx++) {
        SubBenchArgs &sub = args->subs[sub_idx];
        // Events must be created on the GPU that will record them.
        if (SetGpu(sub.worker_gpu_id)) {
            return -1;
        }
        cudaEvent_t *events[] = {&(sub.start_event), &(sub.end_event)};
        for (cudaEvent_t *event : events) {
            cudaError_t cuda_err = cudaEventCreate(event);
            if (cuda_err != cudaSuccess) {
                fprintf(stderr, "PrepareEvent::cudaEventCreate error: %d\n", cuda_err);
                return -1;
            }
        }
    }
    return 0;
}
// Validate the result of data transfer.
int CheckBuf(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess;
@ -399,7 +444,7 @@ int DestroyBufAndStream(BenchArgs *args) {
}
cuda_err = cudaStreamDestroy(sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "DestoryBufAndStream::cudaStreamDestroy error: %d\n", cuda_err);
fprintf(stderr, "DestroyBufAndStream::cudaStreamDestroy error: %d\n", cuda_err);
return -1;
}
}
@ -407,6 +452,28 @@ int DestroyBufAndStream(BenchArgs *args) {
return ret;
}
// Release the start/end CUDA timing events owned by every sub-benchmark.
// Returns 0 on success, -1 on any CUDA error or device-selection failure.
int DestroyEvent(BenchArgs *args) {
    for (int sub_idx = 0; sub_idx < args->num_subs; sub_idx++) {
        SubBenchArgs &sub = args->subs[sub_idx];
        // Destroy each event on the GPU it was created on.
        if (SetGpu(sub.worker_gpu_id)) {
            return -1;
        }
        for (cudaEvent_t event : {sub.start_event, sub.end_event}) {
            cudaError_t cuda_err = cudaEventDestroy(event);
            if (cuda_err != cudaSuccess) {
                fprintf(stderr, "DestroyEvent::cudaEventDestroy error: %d\n", cuda_err);
                return -1;
            }
        }
    }
    return 0;
}
// Unroll depth in SM copy kernel
#define NUM_LOOP_UNROLL 2
@ -502,20 +569,37 @@ int RunCopy(BenchArgs *args) {
}
// Launch jobs and collect running time
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < args->num_loops; i++) {
for (int i = 0; i < args->num_loops + args->num_warm_up; i++) {
for (int j = 0; j < args->num_subs; j++) {
SubBenchArgs &sub = args->subs[j];
if (SetGpu(sub.worker_gpu_id)) {
return -1;
}
if (i == args->num_warm_up) {
cuda_err = cudaEventRecord(sub.start_event, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
return -1;
}
}
if (args->is_sm_copy) {
SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, sub.stream>>>(
reinterpret_cast<ulong2 *>(sub.dst_dev_gpu_buf_ptr),
reinterpret_cast<ulong2 *>(sub.src_dev_gpu_buf_ptr));
} else {
cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size, cudaMemcpyDefault,
sub.stream);
cuda_err = cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size,
cudaMemcpyDefault, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaMemcpyAsync error: %d\n", cuda_err);
return -1;
}
}
if (i + 1 == args->num_loops + args->num_warm_up) {
cuda_err = cudaEventRecord(sub.end_event, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
return -1;
}
}
}
}
@ -527,13 +611,22 @@ int RunCopy(BenchArgs *args) {
return -1;
}
}
auto end = std::chrono::steady_clock::now();
// Calculate and display bandwidth if no problem
double time_in_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
float max_time_in_ms = 0;
for (int i = 0; i < args->num_subs; i++) {
SubBenchArgs &sub = args->subs[i];
float time_in_ms = 0;
cuda_err = cudaEventElapsedTime(&time_in_ms, sub.start_event, sub.end_event);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventElapsedTime error: %d\n", cuda_err);
return -1;
}
max_time_in_ms = time_in_ms > max_time_in_ms ? time_in_ms : max_time_in_ms;
}
PrintResultTag(*args);
printf(" %g\n", args->size * args->num_loops * args->num_subs / time_in_sec / 1e9);
printf(" %g\n", args->size * args->num_loops * args->num_subs / max_time_in_ms / 1e6);
return 0;
}
@ -565,17 +658,28 @@ int EnablePeerAccess(int src_gpu_id, int dst_gpu_id, int *can_access) {
int RunBench(BenchArgs *args) {
int ret = 0;
int destroy_buf_ret = 0;
int destroy_ret = 0;
ret = PrepareBufAndStream(args);
if (ret == 0) {
ret = RunCopy(args);
if (ret == 0) {
ret = CheckBuf(args);
}
if (ret != 0) {
goto destroy_buf;
}
destroy_buf_ret = DestroyBufAndStream(args);
ret = PrepareEvent(args);
if (ret != 0) {
goto destroy_event;
}
ret = RunCopy(args);
if (ret == 0) {
ret = destroy_buf_ret;
ret = CheckBuf(args);
}
destroy_event:
destroy_ret = DestroyEvent(args);
if (ret == 0) {
ret = destroy_ret;
}
destroy_buf:
destroy_ret = DestroyBufAndStream(args);
if (ret == 0) {
ret = destroy_ret;
}
return ret;
}
@ -643,6 +747,7 @@ int main(int argc, char **argv) {
if (ret != 0) {
return ret;
}
args.num_warm_up = opts.num_warm_up;
args.num_loops = opts.num_loops;
args.size = opts.size;

Просмотреть файл

@ -28,12 +28,14 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
assert (benchmark_class)
size = 1048576
num_warm_up = 20
num_loops = 10000
mem_types = ['htod', 'dtoh', 'dtod']
copy_types = ['sm', 'dma']
parameters = '--mem_type %s --copy_type %s --size %d --num_loops %d --bidirectional' % \
(' '.join(mem_types), ' '.join(copy_types), size, num_loops)
parameters = '--mem_type %s --copy_type %s --size %d ' \
'--num_warm_up %d --num_loops %d --bidirectional' % \
(' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
benchmark = benchmark_class(benchmark_name, parameters=parameters)
# Check basic information
@ -48,6 +50,7 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
assert (benchmark._args.mem_type == mem_types)
assert (benchmark._args.copy_type == copy_types)
assert (benchmark._args.size == size)
assert (benchmark._args.num_warm_up == num_warm_up)
assert (benchmark._args.num_loops == num_loops)
assert (benchmark._args.bidirectional)
@ -59,6 +62,7 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
for copy_type in copy_types:
assert ('--%s_copy' % copy_type in benchmark._commands[0])
assert ('--size %d' % size in benchmark._commands[0])
assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
assert ('--num_loops %d' % num_loops in benchmark._commands[0])
assert ('--bidirectional' in benchmark._commands[0])