Benchmarks: Revise Code - Reduce result variance in gpu_copy benchmark (#298)
**Description** This commit does the following to optimize result variance in gpu_copy benchmark: 1) Add warmup phase for gpu_copy benchmark to avoid timing instability caused by first-time CUDA kernel launch overhead; 2) Use CUDA events for timing instead of CPU timestamps; 3) Make data checking an option that is not preferred to be enabled in performance test; 4) Enlarge message size in performance benchmark.
This commit is contained in:
Родитель
28195be6db
Коммит
853890559a
|
@ -48,11 +48,19 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
|
|||
self._parser.add_argument(
|
||||
'--size',
|
||||
type=int,
|
||||
default=64 * 1024**2,
|
||||
default=256 * 1024**2,
|
||||
required=False,
|
||||
help='Size of data buffer in bytes.',
|
||||
)
|
||||
|
||||
self._parser.add_argument(
|
||||
'--num_warm_up',
|
||||
type=int,
|
||||
default=20,
|
||||
required=False,
|
||||
help='Number of warm up rounds',
|
||||
)
|
||||
|
||||
self._parser.add_argument(
|
||||
'--num_loops',
|
||||
type=int,
|
||||
|
@ -78,7 +86,9 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
|
|||
|
||||
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
|
||||
|
||||
args = '--size %d --num_loops %d' % (self._args.size, self._args.num_loops)
|
||||
args = '--size %d --num_warm_up %d --num_loops %d' % (
|
||||
self._args.size, self._args.num_warm_up, self._args.num_loops
|
||||
)
|
||||
for mem_type in self._args.mem_type:
|
||||
args += ' --%s' % mem_type
|
||||
for copy_type in self._args.copy_type:
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
// GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA.
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
@ -52,6 +51,12 @@ struct SubBenchArgs {
|
|||
|
||||
// CUDA stream to be used.
|
||||
cudaStream_t stream;
|
||||
|
||||
// CUDA event to record start time.
|
||||
cudaEvent_t start_event;
|
||||
|
||||
// CUDA event to record end time.
|
||||
cudaEvent_t end_event;
|
||||
};
|
||||
|
||||
// Arguments for each benchmark run.
|
||||
|
@ -69,6 +74,9 @@ struct BenchArgs {
|
|||
// Data buffer size used.
|
||||
uint64_t size = 0;
|
||||
|
||||
// Number of warm up rounds to run.
|
||||
uint64_t num_warm_up = 0;
|
||||
|
||||
// Number of loops to run.
|
||||
uint64_t num_loops = 0;
|
||||
|
||||
|
@ -82,10 +90,13 @@ struct BenchArgs {
|
|||
// Options accepted by this program.
|
||||
struct Opts {
|
||||
// Data buffer size for copy benchmark.
|
||||
uint64_t size;
|
||||
uint64_t size = 0;
|
||||
|
||||
// Data buffer size for copy benchmark.
|
||||
uint64_t num_loops;
|
||||
// Number of warm up rounds to run.
|
||||
uint64_t num_warm_up = 0;
|
||||
|
||||
// Number of loops to run.
|
||||
uint64_t num_loops = 0;
|
||||
|
||||
// Whether GPU SM copy needs to be evaluated.
|
||||
bool sm_copy_enabled = false;
|
||||
|
@ -110,6 +121,7 @@ struct Opts {
|
|||
void PrintUsage() {
|
||||
printf("Usage: gpu_copy "
|
||||
"--size <size> "
|
||||
"--num_warm_up <num_warm_up> "
|
||||
"--num_loops <num_loops> "
|
||||
"[--sm_copy] "
|
||||
"[--dma_copy] "
|
||||
|
@ -123,7 +135,8 @@ void PrintUsage() {
|
|||
int ParseOpts(int argc, char **argv, Opts *opts) {
|
||||
enum class OptIdx {
|
||||
kSize,
|
||||
kNumIters,
|
||||
kNumWarmUp,
|
||||
kNumLoops,
|
||||
kEnableSmCopy,
|
||||
kEnableDmaCopy,
|
||||
kEnableHToD,
|
||||
|
@ -133,7 +146,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
|
|||
};
|
||||
const struct option options[] = {
|
||||
{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
|
||||
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumIters)},
|
||||
{"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
|
||||
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
|
||||
{"sm_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableSmCopy)},
|
||||
{"dma_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDmaCopy)},
|
||||
{"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)},
|
||||
|
@ -143,12 +157,13 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
|
|||
int getopt_ret = 0;
|
||||
int opt_idx = 0;
|
||||
bool size_specified = false;
|
||||
bool num_warm_up_specified = false;
|
||||
bool num_loops_specified = false;
|
||||
bool parse_err = false;
|
||||
while (true) {
|
||||
getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
|
||||
if (getopt_ret == -1) {
|
||||
if (!size_specified || !num_loops_specified) {
|
||||
if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
|
||||
parse_err = true;
|
||||
}
|
||||
break;
|
||||
|
@ -165,7 +180,15 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
|
|||
size_specified = true;
|
||||
}
|
||||
break;
|
||||
case static_cast<int>(OptIdx::kNumIters):
|
||||
case static_cast<int>(OptIdx::kNumWarmUp):
|
||||
if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
|
||||
fprintf(stderr, "Invalid num_warm_up: %s\n", optarg);
|
||||
parse_err = true;
|
||||
} else {
|
||||
num_warm_up_specified = true;
|
||||
}
|
||||
break;
|
||||
case static_cast<int>(OptIdx::kNumLoops):
|
||||
if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
|
||||
fprintf(stderr, "Invalid num_loops: %s\n", optarg);
|
||||
parse_err = true;
|
||||
|
@ -306,6 +329,28 @@ int PrepareBufAndStream(BenchArgs *args) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// Prepare events to be used.
|
||||
int PrepareEvent(BenchArgs *args) {
|
||||
cudaError_t cuda_err = cudaSuccess;
|
||||
for (int i = 0; i < args->num_subs; i++) {
|
||||
SubBenchArgs &sub = args->subs[i];
|
||||
if (SetGpu(sub.worker_gpu_id)) {
|
||||
return -1;
|
||||
}
|
||||
cuda_err = cudaEventCreate(&(sub.start_event));
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "PrepareEvent::cudaEventCreate error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
cuda_err = cudaEventCreate(&(sub.end_event));
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "PrepareEvent::cudaEventCreate error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Validate the result of data transfer.
|
||||
int CheckBuf(BenchArgs *args) {
|
||||
cudaError_t cuda_err = cudaSuccess;
|
||||
|
@ -399,7 +444,7 @@ int DestroyBufAndStream(BenchArgs *args) {
|
|||
}
|
||||
cuda_err = cudaStreamDestroy(sub.stream);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "DestoryBufAndStream::cudaStreamDestroy error: %d\n", cuda_err);
|
||||
fprintf(stderr, "DestroyBufAndStream::cudaStreamDestroy error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
@ -407,6 +452,28 @@ int DestroyBufAndStream(BenchArgs *args) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
// Destroy events
|
||||
int DestroyEvent(BenchArgs *args) {
|
||||
cudaError_t cuda_err = cudaSuccess;
|
||||
for (int i = 0; i < args->num_subs; i++) {
|
||||
SubBenchArgs &sub = args->subs[i];
|
||||
if (SetGpu(sub.worker_gpu_id)) {
|
||||
return -1;
|
||||
}
|
||||
cuda_err = cudaEventDestroy(sub.start_event);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "DestroyEvent::cudaEventDestroy error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
cuda_err = cudaEventDestroy(sub.end_event);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "DestroyEvent::cudaEventDestroy error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Unroll depth in SM copy kernel
|
||||
#define NUM_LOOP_UNROLL 2
|
||||
|
||||
|
@ -502,20 +569,37 @@ int RunCopy(BenchArgs *args) {
|
|||
}
|
||||
|
||||
// Launch jobs and collect running time
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
for (int i = 0; i < args->num_loops; i++) {
|
||||
for (int i = 0; i < args->num_loops + args->num_warm_up; i++) {
|
||||
for (int j = 0; j < args->num_subs; j++) {
|
||||
SubBenchArgs &sub = args->subs[j];
|
||||
if (SetGpu(sub.worker_gpu_id)) {
|
||||
return -1;
|
||||
}
|
||||
if (i == args->num_warm_up) {
|
||||
cuda_err = cudaEventRecord(sub.start_event, sub.stream);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (args->is_sm_copy) {
|
||||
SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, sub.stream>>>(
|
||||
reinterpret_cast<ulong2 *>(sub.dst_dev_gpu_buf_ptr),
|
||||
reinterpret_cast<ulong2 *>(sub.src_dev_gpu_buf_ptr));
|
||||
} else {
|
||||
cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size, cudaMemcpyDefault,
|
||||
sub.stream);
|
||||
cuda_err = cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size,
|
||||
cudaMemcpyDefault, sub.stream);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "RunCopy::cudaMemcpyAsync error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (i + 1 == args->num_loops + args->num_warm_up) {
|
||||
cuda_err = cudaEventRecord(sub.end_event, sub.stream);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -527,13 +611,22 @@ int RunCopy(BenchArgs *args) {
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
|
||||
// Calculate and display bandwidth if no problem
|
||||
double time_in_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
|
||||
float max_time_in_ms = 0;
|
||||
for (int i = 0; i < args->num_subs; i++) {
|
||||
SubBenchArgs &sub = args->subs[i];
|
||||
float time_in_ms = 0;
|
||||
cuda_err = cudaEventElapsedTime(&time_in_ms, sub.start_event, sub.end_event);
|
||||
if (cuda_err != cudaSuccess) {
|
||||
fprintf(stderr, "RunCopy::cudaEventElapsedTime error: %d\n", cuda_err);
|
||||
return -1;
|
||||
}
|
||||
max_time_in_ms = time_in_ms > max_time_in_ms ? time_in_ms : max_time_in_ms;
|
||||
}
|
||||
|
||||
PrintResultTag(*args);
|
||||
printf(" %g\n", args->size * args->num_loops * args->num_subs / time_in_sec / 1e9);
|
||||
printf(" %g\n", args->size * args->num_loops * args->num_subs / max_time_in_ms / 1e6);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -565,17 +658,28 @@ int EnablePeerAccess(int src_gpu_id, int dst_gpu_id, int *can_access) {
|
|||
|
||||
int RunBench(BenchArgs *args) {
|
||||
int ret = 0;
|
||||
int destroy_buf_ret = 0;
|
||||
int destroy_ret = 0;
|
||||
ret = PrepareBufAndStream(args);
|
||||
if (ret == 0) {
|
||||
ret = RunCopy(args);
|
||||
if (ret == 0) {
|
||||
ret = CheckBuf(args);
|
||||
}
|
||||
if (ret != 0) {
|
||||
goto destroy_buf;
|
||||
}
|
||||
destroy_buf_ret = DestroyBufAndStream(args);
|
||||
ret = PrepareEvent(args);
|
||||
if (ret != 0) {
|
||||
goto destroy_event;
|
||||
}
|
||||
ret = RunCopy(args);
|
||||
if (ret == 0) {
|
||||
ret = destroy_buf_ret;
|
||||
ret = CheckBuf(args);
|
||||
}
|
||||
destroy_event:
|
||||
destroy_ret = DestroyEvent(args);
|
||||
if (ret == 0) {
|
||||
ret = destroy_ret;
|
||||
}
|
||||
destroy_buf:
|
||||
destroy_ret = DestroyBufAndStream(args);
|
||||
if (ret == 0) {
|
||||
ret = destroy_ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -643,6 +747,7 @@ int main(int argc, char **argv) {
|
|||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
args.num_warm_up = opts.num_warm_up;
|
||||
args.num_loops = opts.num_loops;
|
||||
args.size = opts.size;
|
||||
|
||||
|
|
|
@ -28,12 +28,14 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
|
|||
assert (benchmark_class)
|
||||
|
||||
size = 1048576
|
||||
num_warm_up = 20
|
||||
num_loops = 10000
|
||||
mem_types = ['htod', 'dtoh', 'dtod']
|
||||
copy_types = ['sm', 'dma']
|
||||
|
||||
parameters = '--mem_type %s --copy_type %s --size %d --num_loops %d --bidirectional' % \
|
||||
(' '.join(mem_types), ' '.join(copy_types), size, num_loops)
|
||||
parameters = '--mem_type %s --copy_type %s --size %d ' \
|
||||
'--num_warm_up %d --num_loops %d --bidirectional' % \
|
||||
(' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
|
||||
benchmark = benchmark_class(benchmark_name, parameters=parameters)
|
||||
|
||||
# Check basic information
|
||||
|
@ -48,6 +50,7 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
|
|||
assert (benchmark._args.mem_type == mem_types)
|
||||
assert (benchmark._args.copy_type == copy_types)
|
||||
assert (benchmark._args.size == size)
|
||||
assert (benchmark._args.num_warm_up == num_warm_up)
|
||||
assert (benchmark._args.num_loops == num_loops)
|
||||
assert (benchmark._args.bidirectional)
|
||||
|
||||
|
@ -59,6 +62,7 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
|
|||
for copy_type in copy_types:
|
||||
assert ('--%s_copy' % copy_type in benchmark._commands[0])
|
||||
assert ('--size %d' % size in benchmark._commands[0])
|
||||
assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
|
||||
assert ('--num_loops %d' % num_loops in benchmark._commands[0])
|
||||
assert ('--bidirectional' in benchmark._commands[0])
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче