Benchmarks: Micro benchmark - Add one-to-all, all-to-one, all-to-all support to gpu_copy_bw_performance (#588)
**Description:** Add one-to-all, all-to-one, and all-to-all support to gpu_copy_bw_performance, and fix a performance bug in gpu_copy.
Parent: 6ef3a0110f
Commit: 4fa60be7cd

@@ -256,6 +256,9 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi
| gpu[0-9]+\_and\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | Same as above, but generated by --dtoh --bidirectional. |
| gpu[0-9]+\_and\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing its own memory using DMA engine or GPU SM. |
| gpu[0-9]+\_and\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
| gpu[0-9]+\_to\_gpu\_all\_write\_by\_sm\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing all peer GPUs' memory using GPU SM with peer communication enabled. |
| gpu\_all\_to\_gpu[0-9]+\_write\_by\_sm\_bw | bandwidth (GB/s) | The unidirectional bandwidth of all peer GPUs writing one GPU's memory using GPU SM with peer communication enabled. |
| gpu\_all\_to\_gpu\_all\_write\_by\_sm\_bw | bandwidth (GB/s) | The unidirectional bandwidth of all peer GPUs writing all peer GPUs' memory using GPU SM with peer communication enabled. |

### `ib-loopback`

@@ -12,7 +12,9 @@ from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
        'gpu-copy-bw', platform=Platform.CUDA, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
        'gpu-copy-bw',
        platform=Platform.CUDA,
        parameters='--mem_type htod dtoh dtod one_to_all all_to_one all_to_all --copy_type sm dma'
    )
    # For ROCm environment, please specify the benchmark name and the platform as the following.
    # context = BenchmarkRegistry.create_benchmark_context(

@@ -22,7 +22,7 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
        super().__init__(name, parameters)

        self._bin_name = 'gpu_copy'
        self._mem_types = ['htod', 'dtoh', 'dtod']
        self._mem_types = ['htod', 'dtoh', 'dtod', 'one_to_all', 'all_to_one', 'all_to_all']
        self._copy_types = ['sm', 'dma']

    def add_parser_arguments(self):

@@ -69,6 +69,22 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
            help='Number of data buffer copies performed.',
        )

        self._parser.add_argument(
            '--all_to_all_num_thread_blocks_per_rank',
            type=int,
            default=0,
            required=False,
            help='Number of thread blocks per rank in one-to-all/all-to-one/all-to-all tests.',
        )

        self._parser.add_argument(
            '--all_to_all_thread_block_size',
            type=int,
            default=0,
            required=False,
            help='Thread block size in one-to-all/all-to-one/all-to-all tests.',
        )

        self._parser.add_argument(
            '--bidirectional',
            action='store_true',

@@ -95,6 +111,13 @@ class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
        args = '--size %d --num_warm_up %d --num_loops %d' % (
            self._args.size, self._args.num_warm_up, self._args.num_loops
        )

        if self._args.all_to_all_num_thread_blocks_per_rank > 0:
            args += ' --all_to_all_num_thread_blocks_per_rank %d' % self._args.all_to_all_num_thread_blocks_per_rank

        if self._args.all_to_all_thread_block_size > 0:
            args += ' --all_to_all_thread_block_size %d' % self._args.all_to_all_thread_block_size

        for mem_type in self._args.mem_type:
            args += ' --%s' % mem_type
            for copy_type in self._args.copy_type:

@@ -116,9 +116,24 @@ struct Opts {
    // Whether device-to-device transfer needs to be evaluated.
    bool dtod_enabled = false;

    // Whether one-to-all (device) transfer needs to be evaluated.
    bool one_to_all_enabled = false;

    // Whether all-to-one (device) transfer needs to be evaluated.
    bool all_to_one_enabled = false;

    // Whether all-to-all (device) transfer needs to be evaluated.
    bool all_to_all_enabled = false;

    // Whether bidirectional transfer is enabled.
    bool bidirectional_enabled = false;

    // Number of thread blocks per rank in one-to-all/all-to-one/all-to-all tests.
    uint64_t all_to_all_num_thread_blocks_per_rank = 8;

    // Thread block size in one-to-all/all-to-one/all-to-all tests.
    uint64_t all_to_all_thread_block_size = 512;

    // Whether check data after copy.
    bool check_data = false;
};
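
The two new fields map directly onto the kernel launch used by the all-to-all benchmark further down in this diff: the grid is `gpu_count * all_to_all_num_thread_blocks_per_rank` blocks of `all_to_all_thread_block_size` threads, so each block handles a `size / gridDim.x` slice of the source buffer. Note that the Python wrapper's new arguments default to 0 and are only forwarded when positive, so these C++ defaults (8 blocks per rank, 512 threads per block) are what actually runs unless overridden. A minimal sketch of that arithmetic (illustrative only, not part of the commit; the helper name is made up):

```cpp
// Illustrative sketch of the launch geometry implied by the defaults above
// (8 blocks per rank, 512 threads per block). Not part of the commit.
#include <cstdint>
#include <cstdio>

void print_launch_shape(uint64_t size, int gpu_count, uint64_t blocks_per_rank, uint64_t block_size) {
    uint64_t grid_dim = gpu_count * blocks_per_rank;    // blocks launched per source GPU
    uint64_t bytes_per_block = size / grid_dim;         // contiguous slice copied by one block
    uint64_t bytes_per_dst_rank = size / gpu_count;     // partition destined for each peer
    printf("grid=%lu blocks, block=%lu threads, %lu bytes/block, %lu bytes/destination rank\n",
           grid_dim, block_size, bytes_per_block, bytes_per_dst_rank);
}

// e.g. print_launch_shape(256 * 1024 * 1024, 8, 8, 512) on an 8-GPU node.
```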

@@ -129,11 +144,16 @@ void PrintUsage() {
           "--size <size> "
           "--num_warm_up <num_warm_up> "
           "--num_loops <num_loops> "
           "[--all_to_all_num_thread_blocks_per_rank <all_to_all_num_thread_blocks_per_rank>] "
           "[--all_to_all_thread_block_size <all_to_all_thread_block_size>] "
           "[--sm_copy] "
           "[--dma_copy] "
           "[--htod] "
           "[--dtoh] "
           "[--dtod] "
           "[--one_to_all] "
           "[--all_to_one] "
           "[--all_to_all] "
           "[--bidirectional] "
           "[--check_data]\n");
}

@@ -144,11 +164,16 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
        kSize,
        kNumWarmUp,
        kNumLoops,
        kAllToAllNumThreadBlocksPerRank,
        kAllToAllThreadBlockSize,
        kEnableSmCopy,
        kEnableDmaCopy,
        kEnableHToD,
        kEnableDToH,
        kEnableDToD,
        kEnableOneToAll,
        kEnableAllToOne,
        kEnableAllToAll,
        kEnableBidirectional,
        kEnableCheckData
    };

@@ -156,11 +181,18 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
        {"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
        {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
        {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
        {"all_to_all_num_thread_blocks_per_rank", required_argument, nullptr,
         static_cast<int>(OptIdx::kAllToAllNumThreadBlocksPerRank)},
        {"all_to_all_thread_block_size", required_argument, nullptr,
         static_cast<int>(OptIdx::kAllToAllThreadBlockSize)},
        {"sm_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableSmCopy)},
        {"dma_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDmaCopy)},
        {"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)},
        {"dtoh", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDToH)},
        {"dtod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDToD)},
        {"one_to_all", no_argument, nullptr, static_cast<int>(OptIdx::kEnableOneToAll)},
        {"all_to_one", no_argument, nullptr, static_cast<int>(OptIdx::kEnableAllToOne)},
        {"all_to_all", no_argument, nullptr, static_cast<int>(OptIdx::kEnableAllToAll)},
        {"bidirectional", no_argument, nullptr, static_cast<int>(OptIdx::kEnableBidirectional)},
        {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
    int getopt_ret = 0;

@@ -205,6 +237,18 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
                num_loops_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kAllToAllNumThreadBlocksPerRank):
            if (1 != sscanf(optarg, "%lu", &(opts->all_to_all_num_thread_blocks_per_rank))) {
                fprintf(stderr, "Invalid all_to_all_num_thread_blocks_per_rank: %s\n", optarg);
                parse_err = true;
            }
            break;
        case static_cast<int>(OptIdx::kAllToAllThreadBlockSize):
            if (1 != sscanf(optarg, "%lu", &(opts->all_to_all_thread_block_size))) {
                fprintf(stderr, "Invalid all_to_all_thread_block_size: %s\n", optarg);
                parse_err = true;
            }
            break;
        case static_cast<int>(OptIdx::kEnableSmCopy):
            opts->sm_copy_enabled = true;
            break;

@@ -220,6 +264,15 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
        case static_cast<int>(OptIdx::kEnableDToD):
            opts->dtod_enabled = true;
            break;
        case static_cast<int>(OptIdx::kEnableOneToAll):
            opts->one_to_all_enabled = true;
            break;
        case static_cast<int>(OptIdx::kEnableAllToOne):
            opts->all_to_one_enabled = true;
            break;
        case static_cast<int>(OptIdx::kEnableAllToAll):
            opts->all_to_all_enabled = true;
            break;
        case static_cast<int>(OptIdx::kEnableBidirectional):
            opts->bidirectional_enabled = true;
            break;

@@ -268,10 +321,10 @@ int PrepareBufAndStream(BenchArgs *args) {
    for (int i = 0; i < args->num_subs; i++) {
        SubBenchArgs &sub = args->subs[i];

        // Generate data to copy
        sub.data_buf = static_cast<uint8_t *>(numa_alloc_onnode(args->size, args->numa_id));

        if (args->check_data) {
            // Generate data to copy
            sub.data_buf = static_cast<uint8_t *>(numa_alloc_onnode(args->size, args->numa_id));

            for (int j = 0; j < args->size; j++) {
                sub.data_buf[j] = static_cast<uint8_t>(j % uint8_mod);
            }

@@ -318,16 +371,18 @@ int PrepareBufAndStream(BenchArgs *args) {
        }

        // Initialize source buffer
        if (sub.is_src_dev_gpu) {
            if (SetGpu(sub.src_gpu_id)) {
        if (args->check_data) {
            if (sub.is_src_dev_gpu) {
                if (SetGpu(sub.src_gpu_id)) {
                    return -1;
                }
            }
            cuda_err = cudaMemcpy(sub.src_dev_gpu_buf_ptr, sub.data_buf, args->size, cudaMemcpyDefault);
            if (cuda_err != cudaSuccess) {
                fprintf(stderr, "PrepareBufAndStream::cudaMemcpy error: %d\n", cuda_err);
                return -1;
            }
        }
        cuda_err = cudaMemcpy(sub.src_dev_gpu_buf_ptr, sub.data_buf, args->size, cudaMemcpyDefault);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "PrepareBufAndStream::cudaMemcpy error: %d\n", cuda_err);
            return -1;
        }

        // Initialize stream on worker device
        if (SetGpu(sub.worker_gpu_id)) {

@@ -638,7 +693,12 @@ int RunCopy(BenchArgs *args) {
    }

    PrintResultTag(*args);
    printf(" %g\n", args->size * args->num_loops * args->num_subs / max_time_in_ms / 1e6);
    double bw = args->size * args->num_loops * args->num_subs / max_time_in_ms / 1e6;
    if (args->subs[0].is_src_dev_gpu && args->subs[0].is_dst_dev_gpu &&
        args->subs[0].src_gpu_id == args->subs[0].dst_gpu_id) {
        bw *= 2.0;
    }
    printf(" %g\n", bw);

    return 0;
}
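
This hunk is the gpu_copy performance fix called out in the description: when source and destination are the same GPU, every copied byte is both read from and written to that device's memory, so the figure is doubled so that the read and the write are both counted. A minimal sketch of the adjusted formula (illustrative only; the helper name is not part of the commit):

```cpp
// Sketch of the corrected bandwidth formula for a local (same-GPU) copy.
// Assumes size in bytes and time in milliseconds, matching RunCopy above.
#include <cstdint>

double CopyBwGBps(uint64_t size, uint64_t num_loops, uint64_t num_subs, double max_time_in_ms, bool same_gpu) {
    double bw = size * num_loops * num_subs / max_time_in_ms / 1e6; // bytes / ms / 1e6 == GB/s
    return same_gpu ? bw * 2.0 : bw;                                // local copy touches the same memory twice (read + write)
}
```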

@@ -746,6 +806,269 @@ void SetSubBenchArgsForDToD(int src_gpu_id, int dst_gpu_id, bool is_read, bool i
    }
}

// dst_rank: < 0 for all ranks, else for specified rank
__global__ void SMOneToAllCopyKernel(ulong2 **dst_buffers, ulong2 *src_buffer, uint64_t msg_size, int src_rank,
                                     int dst_rank, int gpu_count) {
    uint64_t size_per_dst_rank = msg_size / gpu_count;
    uint64_t num_blocks_per_dst_rank = gridDim.x / gpu_count;
    if (dst_rank >= 0 && dst_rank != blockIdx.x / num_blocks_per_dst_rank) {
        return;
    } else {
        dst_rank = blockIdx.x / num_blocks_per_dst_rank;
    }
    if (src_rank == dst_rank) {
        return;
    }
    ulong2 *dst_buffer = dst_buffers[dst_rank];
    uint64_t src_rank_elem_offset = uint64_t(src_rank) * size_per_dst_rank / sizeof(ulong2);
    uint64_t dst_rank_elem_offset = uint64_t(dst_rank) * size_per_dst_rank / sizeof(ulong2);
    ulong2 val;

    uint64_t size_per_block = msg_size / gridDim.x;
    uint64_t num_elems_per_block = size_per_block / sizeof(ulong2);
    for (uint64_t idx = num_elems_per_block * blockIdx.x + threadIdx.x; idx < num_elems_per_block * (blockIdx.x + 1);
         idx += blockDim.x) {
        FetchULong2(val, src_buffer + idx);
        StoreULong2(dst_buffer + idx + src_rank_elem_offset - dst_rank_elem_offset, val);
    }
}
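
For readers following the kernel's index arithmetic: the grid is divided into `gpu_count` groups of `gridDim.x / gpu_count` consecutive blocks, one group per destination rank; each block copies its own `msg_size / gridDim.x` slice of the source buffer, and the `src_rank`/`dst_rank` offsets shift that slice so that data from rank `src_rank` lands in partition `src_rank` of the destination buffer. A host-side sketch of the same mapping (illustrative only, not part of the commit; the function name is made up):

```cpp
// Host-side sketch of SMOneToAllCopyKernel's per-block mapping. Sizes are in bytes
// and are assumed to divide evenly, as in the kernel. Illustrative only.
#include <cstdint>
#include <cstdio>

void DescribeBlock(uint64_t msg_size, int gpu_count, uint64_t grid_dim, uint64_t block_idx, int src_rank) {
    uint64_t size_per_dst_rank = msg_size / gpu_count;          // bytes destined for each peer
    uint64_t num_blocks_per_dst_rank = grid_dim / gpu_count;    // consecutive blocks per destination
    int dst_rank = static_cast<int>(block_idx / num_blocks_per_dst_rank);
    if (dst_rank == src_rank) {
        return;                                                 // the local partition is skipped
    }
    uint64_t size_per_block = msg_size / grid_dim;
    uint64_t src_begin = size_per_block * block_idx;            // slice of src_buffer read by this block
    uint64_t offset_in_partition = src_begin - uint64_t(dst_rank) * size_per_dst_rank;
    uint64_t dst_begin = uint64_t(src_rank) * size_per_dst_rank + offset_in_partition;
    printf("block %lu: src[%lu, %lu) -> gpu%d dst[%lu, %lu)\n", block_idx, src_begin,
           src_begin + size_per_block, dst_rank, dst_begin, dst_begin + size_per_block);
}
```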

// src_rank/dst_rank: < 0 for all ranks, else for specified rank
int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank) {
    int ret = 0;
    cudaError_t cuda_err = cudaSuccess;
    int can_access = 0;

    std::vector<uint8_t *> src_buffers_gpu(gpu_count, nullptr);
    std::vector<uint8_t *> dst_buffers_gpu(gpu_count, nullptr);
    std::vector<uint8_t **> dst_buffer_gpu_args(gpu_count, nullptr);
    std::vector<cudaStream_t> streams(gpu_count);
    std::vector<cudaEvent_t> start_events(gpu_count);
    std::vector<cudaEvent_t> stop_events(gpu_count);

    uint64_t *data_buffer_cpu = nullptr;

    // Scan all GPUs
    for (int i = 0; i < gpu_count; i++) {
        for (int j = 0; j < gpu_count; j++) {
            ret = EnablePeerAccess(i, j, &can_access);
            if (ret != 0) {
                fprintf(stderr, "RunAllToAllBench::EnablePeerAccess between GPU %d and GPU %d failed with %d\n", i, j,
                        ret);
                return -1;
            }
            if (!can_access) {
                fprintf(stderr, "RunAllToAllBench: GPU %d cannot talk with GPU %d\n", i, j);
                return -1;
            }
        }
    }

    // Prepare per-GPU resources
    if (opts.check_data) {
        data_buffer_cpu = new uint64_t[opts.size / sizeof(uint64_t)];
    }

    for (int rank = 0; rank < gpu_count; rank++) {
        if (SetGpu(rank)) {
            fprintf(stderr, "RunAllToAllBench::SetGpu for rank %d error: %d\n", cuda_err, rank);
            return -1;
        }

        // Prepare source buffers
        cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
        if (opts.check_data) {
            for (uint64_t i = 0; i < opts.size / sizeof(uint64_t); i++) {
                data_buffer_cpu[i] = i * rank;
            }
            cuda_err = cudaMemcpy(src_buffers_gpu[rank], data_buffer_cpu, opts.size, cudaMemcpyDefault);
            if (cuda_err != cudaSuccess) {
                fprintf(stderr, "RunAllToAllBench::cudaMemcpy to src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
                return -1;
            }
        }

        // Prepare destination buffers
        cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
            return -1;
        }

        // Prepare streams
        cuda_err = cudaStreamCreateWithFlags(&(streams[rank]), cudaStreamNonBlocking);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaStreamCreateWithFlags error: %d\n", cuda_err);
            return -1;
        }

        // Prepare events
        cuda_err = cudaEventCreate(&(start_events[rank]));
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaEventCreate for start_events[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
        cuda_err = cudaEventCreate(&(stop_events[rank]));
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaEventCreate for stop_events[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
    }

    // Prepare kernel arguments
    for (int rank = 0; rank < gpu_count; rank++) {
        if (SetGpu(rank)) {
            fprintf(stderr, "RunAllToAllBench::SetGpu for rank %d error: %d\n", cuda_err, rank);
            return -1;
        }

        // Prepare destination buffer args
        cuda_err = cudaMalloc(&(dst_buffer_gpu_args[rank]), sizeof(uint8_t *) * gpu_count);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffer_gpu_args[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
        cuda_err = cudaMemcpy(dst_buffer_gpu_args[rank], dst_buffers_gpu.data(), sizeof(uint8_t *) * gpu_count,
                              cudaMemcpyDefault);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaMemcpy to dst_buffer_gpu_args[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
    }

    // Launch jobs and collect running time
    for (int i = 0; i < opts.num_warm_up + opts.num_loops; i++) {
        for (int rank = 0; rank < gpu_count; rank++) {
            if (src_rank >= 0 && rank != src_rank) {
                continue;
            }

            if (SetGpu(rank)) {
                fprintf(stderr, "RunAllToAllBench::SetGpu for rank %d error: %d\n", cuda_err, rank);
                return -1;
            }

            if (i == opts.num_warm_up) {
                cuda_err = cudaEventRecord(start_events[rank], streams[rank]);
                if (cuda_err != cudaSuccess) {
                    fprintf(stderr, "RunAllToAllBench::cudaEventRecord for start_events[%d] error: %d\n", cuda_err,
                            rank);
                    return -1;
                }
            }
            SMOneToAllCopyKernel<<<gpu_count * opts.all_to_all_num_thread_blocks_per_rank,
                                   opts.all_to_all_thread_block_size, 0, streams[rank]>>>(
                (ulong2 **)dst_buffer_gpu_args[rank], (ulong2 *)src_buffers_gpu[rank], opts.size, rank, dst_rank,
                gpu_count);
            if (i == opts.num_warm_up + opts.num_loops - 1) {
                cuda_err = cudaEventRecord(stop_events[rank], streams[rank]);
                if (cuda_err != cudaSuccess) {
                    fprintf(stderr, "RunAllToAllBench::cudaEventRecord for stop_events[%d] error: %d\n", cuda_err,
                            rank);
                    return -1;
                }
            }
        }
    }
    for (int rank = 0; rank < gpu_count; rank++) {
        if (src_rank >= 0 && rank != src_rank) {
            continue;
        }
        cuda_err = cudaStreamSynchronize(streams[rank]);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaStreamSynchronize streams[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
    }

    // Calculate and display bandwidth if no problem
    bool first_bw_seen = false;
    double min_bw = 0.;
    for (int rank = 0; rank < gpu_count; rank++) {
        if (src_rank >= 0 && rank != src_rank) {
            continue;
        }
        float time_in_ms = 0;
        cuda_err = cudaEventElapsedTime(&time_in_ms, start_events[rank], stop_events[rank]);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaEventElapsedTime for rank %d error: %d\n", cuda_err, rank);
            return -1;
        }
        double bw = opts.size * (gpu_count - 1) * opts.num_loops / gpu_count / time_in_ms / 1e6;
        if (!first_bw_seen) {
            min_bw = bw;
            first_bw_seen = true;
        } else {
            min_bw = std::min(min_bw, bw);
        }
    }
    if (src_rank < 0 && dst_rank < 0) {
        printf("gpu_all_to_gpu_all_write_by_sm %g\n", min_bw);
    } else if (src_rank < 0) {
        printf("gpu_all_to_gpu%d_write_by_sm %g\n", dst_rank, min_bw);
    } else {
        printf("gpu%d_to_gpu_all_write_by_sm %g\n", src_rank, min_bw);
    }

    // Check data
    if (opts.check_data) {
        for (int curr_dst_rank = 0; curr_dst_rank < gpu_count; curr_dst_rank++) {
            if (dst_rank >= 0 && dst_rank != curr_dst_rank) {
                continue;
            }
            cuda_err = cudaMemcpy(data_buffer_cpu, dst_buffers_gpu[curr_dst_rank], opts.size, cudaMemcpyDefault);
            if (cuda_err != cudaSuccess) {
                fprintf(stderr, "RunAllToAllBench::cudaMemcpy from dst_buffers_gpu[%d] error: %d\n", cuda_err,
                        curr_dst_rank);
                return -1;
            }
            for (uint64_t i = 0; i < opts.size / sizeof(uint64_t); i++) {
                int curr_src_rank = i / (opts.size / sizeof(uint64_t) / gpu_count);
                if (src_rank >= 0 && src_rank != curr_src_rank) {
                    continue;
                }
                if (curr_src_rank == curr_dst_rank) {
                    continue;
                }
                uint64_t offset_in_src_rank = (i % (opts.size / sizeof(uint64_t) / gpu_count)) +
                                              curr_dst_rank * (opts.size / sizeof(uint64_t) / gpu_count);
                if (data_buffer_cpu[i] != offset_in_src_rank * curr_src_rank) {
                    fprintf(stderr,
                            "RunAllToAllBench: data check failure, dst_buffers_gpu[%d][%lu] (%lu) != %lu * %d\n",
                            curr_dst_rank, i, data_buffer_cpu[i], offset_in_src_rank, curr_src_rank);
                    return -1;
                }
            }
        }
    }

    // Free buffers
    for (int rank = 0; rank < gpu_count; rank++) {
        cuda_err = cudaFree(src_buffers_gpu[rank]);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaFree for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
        cuda_err = cudaFree(dst_buffers_gpu[rank]);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaFree for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
        cuda_err = cudaFree(dst_buffer_gpu_args[rank]);
        if (cuda_err != cudaSuccess) {
            fprintf(stderr, "RunAllToAllBench::cudaFree for dst_buffer_gpu_args[%d] error: %d\n", cuda_err, rank);
            return -1;
        }
    }
    delete[] data_buffer_cpu;

    return 0;
}
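
A note on the number RunAllToAllBench prints: the timed window spans the `num_loops` measured iterations, during which each participating source GPU writes `size * (gpu_count - 1) / gpu_count` bytes per iteration (its whole buffer minus the partition kept locally), and the minimum per-rank rate is reported. A small sketch of that formula (illustrative only; the helper name is not from the commit):

```cpp
// Sketch of the per-rank bandwidth reported by RunAllToAllBench: bytes written by one
// source rank per loop, divided by the measured time. The reported value is the minimum
// over all participating ranks. Illustrative only.
#include <cstdint>

double AllToAllRankBwGBps(uint64_t size, int gpu_count, uint64_t num_loops, float time_in_ms) {
    double bytes_per_loop = static_cast<double>(size) * (gpu_count - 1) / gpu_count; // skip the local partition
    return bytes_per_loop * num_loops / time_in_ms / 1e6;                            // bytes / ms / 1e6 == GB/s
}
```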

int main(int argc, char **argv) {
    int ret = 0;
    int numa_count = 0;

@@ -873,5 +1196,30 @@ int main(int argc, char **argv) {
        }
    }

    if (opts.one_to_all_enabled) {
        for (int i = 0; i < gpu_count; i++) {
            ret = RunAllToAllBench(opts, gpu_count, i, -1);
            if (ret != 0) {
                return -1;
            }
        }
    }

    if (opts.all_to_one_enabled) {
        for (int i = 0; i < gpu_count; i++) {
            ret = RunAllToAllBench(opts, gpu_count, -1, i);
            if (ret != 0) {
                return -1;
            }
        }
    }

    if (opts.all_to_all_enabled) {
        ret = RunAllToAllBench(opts, gpu_count, -1, -1);
        if (ret != 0) {
            return -1;
        }
    }

    return ret;
}

@@ -30,12 +30,16 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        size = 1048576
        num_warm_up = 20
        num_loops = 10000
        mem_types = ['htod', 'dtoh', 'dtod']
        all_to_all_num_thread_blocks_per_rank = 8
        all_to_all_thread_block_size = 512
        mem_types = ['htod', 'dtoh', 'dtod', 'one_to_all', 'all_to_one', 'all_to_all']
        copy_types = ['sm', 'dma']

        parameters = '--mem_type %s --copy_type %s --size %d ' \
            '--num_warm_up %d --num_loops %d --bidirectional --check_data' % \
            (' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
        parameters = '--mem_type %s --copy_type %s --size %d --num_warm_up %d --num_loops %d ' \
            '--all_to_all_num_thread_blocks_per_rank %d --all_to_all_thread_block_size %d ' \
            '--bidirectional --check_data' % \
            (' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops,
             all_to_all_num_thread_blocks_per_rank, all_to_all_thread_block_size)
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information

@@ -52,6 +56,8 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        assert (benchmark._args.size == size)
        assert (benchmark._args.num_warm_up == num_warm_up)
        assert (benchmark._args.num_loops == num_loops)
        assert (benchmark._args.all_to_all_num_thread_blocks_per_rank == all_to_all_num_thread_blocks_per_rank)
        assert (benchmark._args.all_to_all_thread_block_size == all_to_all_thread_block_size)
        assert (benchmark._args.bidirectional)
        assert (benchmark._args.check_data)

@@ -65,6 +71,11 @@ class GpuCopyBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        assert ('--size %d' % size in benchmark._commands[0])
        assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
        assert ('--num_loops %d' % num_loops in benchmark._commands[0])
        assert (
            '--all_to_all_num_thread_blocks_per_rank %d' % all_to_all_num_thread_blocks_per_rank
            in benchmark._commands[0]
        )
        assert ('--all_to_all_thread_block_size %d' % all_to_all_thread_block_size in benchmark._commands[0])
        assert ('--bidirectional' in benchmark._commands[0])
        assert ('--check_data' in benchmark._commands[0])

@@ -50,3 +50,8 @@ cpu_and_gpu1_by_sm_under_numa1 10.2994
cpu_and_gpu1_by_dma_under_numa1 49.3615
gpu1_and_cpu_by_sm_under_numa1 10.2817
gpu1_and_cpu_by_dma_under_numa1 49.3653
gpu0_to_gpu_all_write_by_sm 55.6522
gpu1_to_gpu_all_write_by_sm 55.0538
gpu_all_to_gpu0_write_by_sm 56.2637
gpu_all_to_gpu1_write_by_sm 56.8889
gpu_all_to_gpu_all_write_by_sm 55.6522