diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index dcb728c0..1592cdcd 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -186,16 +186,16 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi #### Metrics -| Name | Unit | Description | -|------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------| -| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. | -| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. | -| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing self's memory using DMA engine or GPU SM with peer communication enabled. | -| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | -| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. | -| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | Same as above. | -| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing self's memory using DMA engine or GPU SM with peer communication enabled. | -| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | +| Name | Unit | Description | +|-------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. | +| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. | +| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing self's memory using DMA engine or GPU SM. | +| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | +| cpu\_and\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. | +| gpu[0-9]+\_and\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | Same as above, but generated by --dtoh --bidirectional. | +| gpu[0-9]+\_and\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing self's memory using DMA engine or GPU SM. | +| gpu[0-9]+\_and\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | ### `ib-loopback` diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index a5398ad5..b62152ba 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -544,7 +544,7 @@ void PrintResultTag(const BenchArgs &args) { } else { printf("cpu"); } - printf("_to_"); + printf("%s", args.num_subs == 1 ? "_to_" : "_and_"); if (args.subs[0].is_dst_dev_gpu) { printf("gpu%d", args.subs[0].dst_gpu_id); } else { @@ -558,11 +558,9 @@ void PrintResultTag(const BenchArgs &args) { printf("_read"); } } - printf("_by_%s_under_numa%lu", args.is_sm_copy ? "sm" : "dma", args.numa_id); - if (args.num_subs == 1) { - printf("_uni"); - } else { - printf("_bi"); + printf("_by_%s", args.is_sm_copy ? "sm" : "dma"); + if (!args.subs[0].is_src_dev_gpu || !args.subs[0].is_dst_dev_gpu) { + printf("_under_numa%lu", args.numa_id); } } @@ -810,13 +808,16 @@ int main(int argc, char **argv) { args_list.push_back(args); } } + if (args.numa_id != 0) { + continue; + } // Device-to-device benchmark if (opts.dtod_enabled) { // Scan all peers for (int k = 0; k < gpu_count; k++) { - // Skip second half for bidirectional test - if (opts.bidirectional_enabled && k > j) { - break; + // src_dev_id always <= dst_dev_id for bidirectional test + if (opts.bidirectional_enabled && j > k) { + continue; } // P2P write ret = EnablePeerAccess(j, k, &can_access); diff --git a/tests/data/gpu_copy_bw_performance.log b/tests/data/gpu_copy_bw_performance.log index a72dd5a8..bf3208f4 100644 --- a/tests/data/gpu_copy_bw_performance.log +++ b/tests/data/gpu_copy_bw_performance.log @@ -1,60 +1,52 @@ -cpu_to_gpu0_by_sm_under_numa0_uni 26.1736 -cpu_to_gpu0_by_dma_under_numa0_uni 26.1878 -gpu0_to_cpu_by_sm_under_numa0_uni 5.01589 -gpu0_to_cpu_by_dma_under_numa0_uni 21.8659 -gpu0_to_gpu0_by_sm_under_numa0_uni 655.759 -gpu0_to_gpu0_by_dma_under_numa0_uni 633.325 -gpu0_to_gpu1_write_by_sm_under_numa0_uni 250.122 -gpu0_to_gpu1_write_by_dma_under_numa0_uni 274.951 -gpu0_to_gpu1_read_by_sm_under_numa0_uni 253.563 -gpu0_to_gpu1_read_by_dma_under_numa0_uni 264.009 -cpu_to_gpu1_by_sm_under_numa0_uni 26.187 -cpu_to_gpu1_by_dma_under_numa0_uni 26.207 -gpu1_to_cpu_by_sm_under_numa0_uni 5.01132 -gpu1_to_cpu_by_dma_under_numa0_uni 21.8635 -gpu1_to_gpu0_write_by_sm_under_numa0_uni 249.824 -gpu1_to_gpu0_write_by_dma_under_numa0_uni 275.123 -gpu1_to_gpu0_read_by_sm_under_numa0_uni 253.469 -gpu1_to_gpu0_read_by_dma_under_numa0_uni 264.908 -gpu1_to_gpu1_by_sm_under_numa0_uni 658.338 -gpu1_to_gpu1_by_dma_under_numa0_uni 631.148 -cpu_to_gpu0_by_sm_under_numa1_uni 26.1542 -cpu_to_gpu0_by_dma_under_numa1_uni 26.2007 -gpu0_to_cpu_by_sm_under_numa1_uni 5.67356 -gpu0_to_cpu_by_dma_under_numa1_uni 21.8599 -gpu0_to_gpu0_by_sm_under_numa1_uni 656.935 -gpu0_to_gpu0_by_dma_under_numa1_uni 631.974 -gpu0_to_gpu1_write_by_sm_under_numa1_uni 250.118 -gpu0_to_gpu1_write_by_dma_under_numa1_uni 274.778 -gpu0_to_gpu1_read_by_sm_under_numa1_uni 253.625 -gpu0_to_gpu1_read_by_dma_under_numa1_uni 264.347 -cpu_to_gpu1_by_sm_under_numa1_uni 26.1905 -cpu_to_gpu1_by_dma_under_numa1_uni 26.2007 -gpu1_to_cpu_by_sm_under_numa1_uni 5.67716 -gpu1_to_cpu_by_dma_under_numa1_uni 21.8579 -gpu1_to_gpu0_write_by_sm_under_numa1_uni 250.064 -gpu1_to_gpu0_write_by_dma_under_numa1_uni 274.924 -gpu1_to_gpu0_read_by_sm_under_numa1_uni 253.746 -gpu1_to_gpu0_read_by_dma_under_numa1_uni 264.256 -gpu1_to_gpu1_by_sm_under_numa1_uni 655.623 -gpu1_to_gpu1_by_dma_under_numa1_uni 634.062 -cpu_to_gpu0_by_sm_under_numa0_bi 8.45975 -cpu_to_gpu0_by_dma_under_numa0_bi 36.4282 -gpu0_to_gpu0_by_sm_under_numa0_bi 689.063 -gpu0_to_gpu0_by_dma_under_numa0_bi 661.7 -gpu0_to_gpu1_write_by_sm_under_numa0_bi 427.446 -gpu0_to_gpu1_write_by_dma_under_numa0_bi 521.577 -gpu0_to_gpu1_read_by_sm_under_numa0_bi 446.835 -gpu0_to_gpu1_read_by_dma_under_numa0_bi 503.158 -cpu_to_gpu1_by_sm_under_numa0_bi 8.4487 -cpu_to_gpu1_by_dma_under_numa0_bi 36.4272 -cpu_to_gpu0_by_sm_under_numa1_bi 9.36164 -cpu_to_gpu0_by_dma_under_numa1_bi 36.411 -gpu0_to_gpu0_by_sm_under_numa1_bi 688.156 -gpu0_to_gpu0_by_dma_under_numa1_bi 662.077 -gpu0_to_gpu1_write_by_sm_under_numa1_bi 427.033 -gpu0_to_gpu1_write_by_dma_under_numa1_bi 521.367 -gpu0_to_gpu1_read_by_sm_under_numa1_bi 446.179 -gpu0_to_gpu1_read_by_dma_under_numa1_bi 503.843 -cpu_to_gpu1_by_sm_under_numa1_bi 9.37368 -cpu_to_gpu1_by_dma_under_numa1_bi 36.4128 +cpu_to_gpu0_by_sm_under_numa0 26.2409 +cpu_to_gpu0_by_dma_under_numa0 26.2387 +gpu0_to_cpu_by_sm_under_numa0 5.67346 +gpu0_to_cpu_by_dma_under_numa0 25.8516 +gpu0_to_gpu0_by_sm 682.667 +gpu0_to_gpu0_by_dma 657.332 +gpu0_to_gpu1_write_by_sm 258.397 +gpu0_to_gpu1_write_by_dma 279.287 +gpu0_to_gpu1_read_by_sm 261.856 +gpu0_to_gpu1_read_by_dma 275.854 +cpu_to_gpu1_by_sm_under_numa0 26.2401 +cpu_to_gpu1_by_dma_under_numa0 26.2392 +gpu1_to_cpu_by_sm_under_numa0 5.67114 +gpu1_to_cpu_by_dma_under_numa0 26.0584 +gpu1_to_gpu0_write_by_sm 258.729 +gpu1_to_gpu0_write_by_dma 278.308 +gpu1_to_gpu0_read_by_sm 261.804 +gpu1_to_gpu0_read_by_dma 275.825 +gpu1_to_gpu1_by_sm 682.311 +gpu1_to_gpu1_by_dma 656.673 +cpu_to_gpu0_by_sm_under_numa1 26.2414 +cpu_to_gpu0_by_dma_under_numa1 26.2332 +gpu0_to_cpu_by_sm_under_numa1 6.40701 +gpu0_to_cpu_by_dma_under_numa1 26.104 +cpu_to_gpu1_by_sm_under_numa1 26.2404 +cpu_to_gpu1_by_dma_under_numa1 26.2412 +gpu1_to_cpu_by_sm_under_numa1 6.40865 +gpu1_to_cpu_by_dma_under_numa1 26.0804 +cpu_and_gpu0_by_sm_under_numa0 9.31711 +cpu_and_gpu0_by_dma_under_numa0 49.4624 +gpu0_and_cpu_by_sm_under_numa0 9.32671 +gpu0_and_cpu_by_dma_under_numa0 49.4572 +gpu0_and_gpu0_by_sm 685.523 +gpu0_and_gpu0_by_dma 666.016 +gpu0_and_gpu1_write_by_sm 440.023 +gpu0_and_gpu1_write_by_dma 531.244 +gpu0_and_gpu1_read_by_sm 460.831 +gpu0_and_gpu1_read_by_dma 526.288 +cpu_and_gpu1_by_sm_under_numa0 9.29908 +cpu_and_gpu1_by_dma_under_numa0 49.4357 +gpu1_and_cpu_by_sm_under_numa0 9.32654 +gpu1_and_cpu_by_dma_under_numa0 49.4429 +gpu1_and_gpu1_by_sm 672.768 +gpu1_and_gpu1_by_dma 665.763 +cpu_and_gpu0_by_sm_under_numa1 10.2742 +cpu_and_gpu0_by_dma_under_numa1 49.3646 +gpu0_and_cpu_by_sm_under_numa1 10.2896 +gpu0_and_cpu_by_dma_under_numa1 49.3639 +cpu_and_gpu1_by_sm_under_numa1 10.2994 +cpu_and_gpu1_by_dma_under_numa1 49.3615 +gpu1_and_cpu_by_sm_under_numa1 10.2817 +gpu1_and_cpu_by_dma_under_numa1 49.3653