Benchmarks: Revise Code - Eliminate NUMA binding for device-to-device tests in gpu_copy (#302)
**Description** This commit removes NUMA binding for device-to-device tests because NUMA doesn't affect their performance, and revises the benchmark metrics accordingly.
This commit is contained in:
Родитель
433785fd0c
Коммит
6cdf759543
|
@ -186,16 +186,16 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi
|
|||
|
||||
#### Metrics
|
||||
|
||||
| Name | Unit | Description |
|
||||
|------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing self's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | Same as above. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing self's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
| Name | Unit | Description |
|
||||
|-------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing its own memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
| cpu\_and\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_and\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | Same as above, but generated by --dtoh --bidirectional. |
|
||||
| gpu[0-9]+\_and\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing its own memory using DMA engine or GPU SM. |
|
||||
| gpu[0-9]+\_and\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
|
||||
|
||||
### `ib-loopback`
|
||||
|
||||
|
|
|
@ -544,7 +544,7 @@ void PrintResultTag(const BenchArgs &args) {
|
|||
} else {
|
||||
printf("cpu");
|
||||
}
|
||||
printf("_to_");
|
||||
printf("%s", args.num_subs == 1 ? "_to_" : "_and_");
|
||||
if (args.subs[0].is_dst_dev_gpu) {
|
||||
printf("gpu%d", args.subs[0].dst_gpu_id);
|
||||
} else {
|
||||
|
@ -558,11 +558,9 @@ void PrintResultTag(const BenchArgs &args) {
|
|||
printf("_read");
|
||||
}
|
||||
}
|
||||
printf("_by_%s_under_numa%lu", args.is_sm_copy ? "sm" : "dma", args.numa_id);
|
||||
if (args.num_subs == 1) {
|
||||
printf("_uni");
|
||||
} else {
|
||||
printf("_bi");
|
||||
printf("_by_%s", args.is_sm_copy ? "sm" : "dma");
|
||||
if (!args.subs[0].is_src_dev_gpu || !args.subs[0].is_dst_dev_gpu) {
|
||||
printf("_under_numa%lu", args.numa_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -810,13 +808,16 @@ int main(int argc, char **argv) {
|
|||
args_list.push_back(args);
|
||||
}
|
||||
}
|
||||
if (args.numa_id != 0) {
|
||||
continue;
|
||||
}
|
||||
// Device-to-device benchmark
|
||||
if (opts.dtod_enabled) {
|
||||
// Scan all peers
|
||||
for (int k = 0; k < gpu_count; k++) {
|
||||
// Skip second half for bidirectional test
|
||||
if (opts.bidirectional_enabled && k > j) {
|
||||
break;
|
||||
// src_dev_id always <= dst_dev_id for bidirectional test
|
||||
if (opts.bidirectional_enabled && j > k) {
|
||||
continue;
|
||||
}
|
||||
// P2P write
|
||||
ret = EnablePeerAccess(j, k, &can_access);
|
||||
|
|
|
@ -1,60 +1,52 @@
|
|||
cpu_to_gpu0_by_sm_under_numa0_uni 26.1736
|
||||
cpu_to_gpu0_by_dma_under_numa0_uni 26.1878
|
||||
gpu0_to_cpu_by_sm_under_numa0_uni 5.01589
|
||||
gpu0_to_cpu_by_dma_under_numa0_uni 21.8659
|
||||
gpu0_to_gpu0_by_sm_under_numa0_uni 655.759
|
||||
gpu0_to_gpu0_by_dma_under_numa0_uni 633.325
|
||||
gpu0_to_gpu1_write_by_sm_under_numa0_uni 250.122
|
||||
gpu0_to_gpu1_write_by_dma_under_numa0_uni 274.951
|
||||
gpu0_to_gpu1_read_by_sm_under_numa0_uni 253.563
|
||||
gpu0_to_gpu1_read_by_dma_under_numa0_uni 264.009
|
||||
cpu_to_gpu1_by_sm_under_numa0_uni 26.187
|
||||
cpu_to_gpu1_by_dma_under_numa0_uni 26.207
|
||||
gpu1_to_cpu_by_sm_under_numa0_uni 5.01132
|
||||
gpu1_to_cpu_by_dma_under_numa0_uni 21.8635
|
||||
gpu1_to_gpu0_write_by_sm_under_numa0_uni 249.824
|
||||
gpu1_to_gpu0_write_by_dma_under_numa0_uni 275.123
|
||||
gpu1_to_gpu0_read_by_sm_under_numa0_uni 253.469
|
||||
gpu1_to_gpu0_read_by_dma_under_numa0_uni 264.908
|
||||
gpu1_to_gpu1_by_sm_under_numa0_uni 658.338
|
||||
gpu1_to_gpu1_by_dma_under_numa0_uni 631.148
|
||||
cpu_to_gpu0_by_sm_under_numa1_uni 26.1542
|
||||
cpu_to_gpu0_by_dma_under_numa1_uni 26.2007
|
||||
gpu0_to_cpu_by_sm_under_numa1_uni 5.67356
|
||||
gpu0_to_cpu_by_dma_under_numa1_uni 21.8599
|
||||
gpu0_to_gpu0_by_sm_under_numa1_uni 656.935
|
||||
gpu0_to_gpu0_by_dma_under_numa1_uni 631.974
|
||||
gpu0_to_gpu1_write_by_sm_under_numa1_uni 250.118
|
||||
gpu0_to_gpu1_write_by_dma_under_numa1_uni 274.778
|
||||
gpu0_to_gpu1_read_by_sm_under_numa1_uni 253.625
|
||||
gpu0_to_gpu1_read_by_dma_under_numa1_uni 264.347
|
||||
cpu_to_gpu1_by_sm_under_numa1_uni 26.1905
|
||||
cpu_to_gpu1_by_dma_under_numa1_uni 26.2007
|
||||
gpu1_to_cpu_by_sm_under_numa1_uni 5.67716
|
||||
gpu1_to_cpu_by_dma_under_numa1_uni 21.8579
|
||||
gpu1_to_gpu0_write_by_sm_under_numa1_uni 250.064
|
||||
gpu1_to_gpu0_write_by_dma_under_numa1_uni 274.924
|
||||
gpu1_to_gpu0_read_by_sm_under_numa1_uni 253.746
|
||||
gpu1_to_gpu0_read_by_dma_under_numa1_uni 264.256
|
||||
gpu1_to_gpu1_by_sm_under_numa1_uni 655.623
|
||||
gpu1_to_gpu1_by_dma_under_numa1_uni 634.062
|
||||
cpu_to_gpu0_by_sm_under_numa0_bi 8.45975
|
||||
cpu_to_gpu0_by_dma_under_numa0_bi 36.4282
|
||||
gpu0_to_gpu0_by_sm_under_numa0_bi 689.063
|
||||
gpu0_to_gpu0_by_dma_under_numa0_bi 661.7
|
||||
gpu0_to_gpu1_write_by_sm_under_numa0_bi 427.446
|
||||
gpu0_to_gpu1_write_by_dma_under_numa0_bi 521.577
|
||||
gpu0_to_gpu1_read_by_sm_under_numa0_bi 446.835
|
||||
gpu0_to_gpu1_read_by_dma_under_numa0_bi 503.158
|
||||
cpu_to_gpu1_by_sm_under_numa0_bi 8.4487
|
||||
cpu_to_gpu1_by_dma_under_numa0_bi 36.4272
|
||||
cpu_to_gpu0_by_sm_under_numa1_bi 9.36164
|
||||
cpu_to_gpu0_by_dma_under_numa1_bi 36.411
|
||||
gpu0_to_gpu0_by_sm_under_numa1_bi 688.156
|
||||
gpu0_to_gpu0_by_dma_under_numa1_bi 662.077
|
||||
gpu0_to_gpu1_write_by_sm_under_numa1_bi 427.033
|
||||
gpu0_to_gpu1_write_by_dma_under_numa1_bi 521.367
|
||||
gpu0_to_gpu1_read_by_sm_under_numa1_bi 446.179
|
||||
gpu0_to_gpu1_read_by_dma_under_numa1_bi 503.843
|
||||
cpu_to_gpu1_by_sm_under_numa1_bi 9.37368
|
||||
cpu_to_gpu1_by_dma_under_numa1_bi 36.4128
|
||||
cpu_to_gpu0_by_sm_under_numa0 26.2409
|
||||
cpu_to_gpu0_by_dma_under_numa0 26.2387
|
||||
gpu0_to_cpu_by_sm_under_numa0 5.67346
|
||||
gpu0_to_cpu_by_dma_under_numa0 25.8516
|
||||
gpu0_to_gpu0_by_sm 682.667
|
||||
gpu0_to_gpu0_by_dma 657.332
|
||||
gpu0_to_gpu1_write_by_sm 258.397
|
||||
gpu0_to_gpu1_write_by_dma 279.287
|
||||
gpu0_to_gpu1_read_by_sm 261.856
|
||||
gpu0_to_gpu1_read_by_dma 275.854
|
||||
cpu_to_gpu1_by_sm_under_numa0 26.2401
|
||||
cpu_to_gpu1_by_dma_under_numa0 26.2392
|
||||
gpu1_to_cpu_by_sm_under_numa0 5.67114
|
||||
gpu1_to_cpu_by_dma_under_numa0 26.0584
|
||||
gpu1_to_gpu0_write_by_sm 258.729
|
||||
gpu1_to_gpu0_write_by_dma 278.308
|
||||
gpu1_to_gpu0_read_by_sm 261.804
|
||||
gpu1_to_gpu0_read_by_dma 275.825
|
||||
gpu1_to_gpu1_by_sm 682.311
|
||||
gpu1_to_gpu1_by_dma 656.673
|
||||
cpu_to_gpu0_by_sm_under_numa1 26.2414
|
||||
cpu_to_gpu0_by_dma_under_numa1 26.2332
|
||||
gpu0_to_cpu_by_sm_under_numa1 6.40701
|
||||
gpu0_to_cpu_by_dma_under_numa1 26.104
|
||||
cpu_to_gpu1_by_sm_under_numa1 26.2404
|
||||
cpu_to_gpu1_by_dma_under_numa1 26.2412
|
||||
gpu1_to_cpu_by_sm_under_numa1 6.40865
|
||||
gpu1_to_cpu_by_dma_under_numa1 26.0804
|
||||
cpu_and_gpu0_by_sm_under_numa0 9.31711
|
||||
cpu_and_gpu0_by_dma_under_numa0 49.4624
|
||||
gpu0_and_cpu_by_sm_under_numa0 9.32671
|
||||
gpu0_and_cpu_by_dma_under_numa0 49.4572
|
||||
gpu0_and_gpu0_by_sm 685.523
|
||||
gpu0_and_gpu0_by_dma 666.016
|
||||
gpu0_and_gpu1_write_by_sm 440.023
|
||||
gpu0_and_gpu1_write_by_dma 531.244
|
||||
gpu0_and_gpu1_read_by_sm 460.831
|
||||
gpu0_and_gpu1_read_by_dma 526.288
|
||||
cpu_and_gpu1_by_sm_under_numa0 9.29908
|
||||
cpu_and_gpu1_by_dma_under_numa0 49.4357
|
||||
gpu1_and_cpu_by_sm_under_numa0 9.32654
|
||||
gpu1_and_cpu_by_dma_under_numa0 49.4429
|
||||
gpu1_and_gpu1_by_sm 672.768
|
||||
gpu1_and_gpu1_by_dma 665.763
|
||||
cpu_and_gpu0_by_sm_under_numa1 10.2742
|
||||
cpu_and_gpu0_by_dma_under_numa1 49.3646
|
||||
gpu0_and_cpu_by_sm_under_numa1 10.2896
|
||||
gpu0_and_cpu_by_dma_under_numa1 49.3639
|
||||
cpu_and_gpu1_by_sm_under_numa1 10.2994
|
||||
cpu_and_gpu1_by_dma_under_numa1 49.3615
|
||||
gpu1_and_cpu_by_sm_under_numa1 10.2817
|
||||
gpu1_and_cpu_by_dma_under_numa1 49.3653
|
||||
|
|
Загрузка…
Ссылка в новой задаче