Constant and local memory probing

This commit is contained in:
PENGUINLIONG 2022-05-08 14:20:57 +08:00
Родитель e8e45ec6e2
Коммит 99ace88b16
10 изменённых файлов: 416 добавлений и 14 удалений

Просмотреть файл

@ -50,7 +50,7 @@ A GPU hardware has many traits like GFLOPS and cache size. ArchProbe implements
- `RegCount` Number of registers available to a thread and whether the register file is shared among warps;
- `BufferVecWidth` Optimal vector width to read the most data in a single memory access;
- `{Image|Buffer}CachelineSize` Top level cacheline size of image/buffer;
- `{Image|Buffer}Bandwidth` Peak read-only bandwidth of image/buffer;
- `{Image|Buffer|ConstMem|LocalMem}Bandwidth` Peak read-only bandwidth of image/buffer/constant/local memory;
- `{Image|Buffer}CacheHierarchyPChase` Size of each level of cache of image/buffer by the P-chase method.
If the `-v` flag is given, ArchProbe prints extra human-readable logs to `stdout` which is also a good source of information.

Просмотреть файл

@ -1019,6 +1019,198 @@ namespace aspects {
env.report_ready(done);
}
void const_mem_bandwidth(Environment& env) {
if (env.report_started_lazy("ConstMemBandwidth")) { return; }
env.init_table("range (byte)", "t (us)", "bandwidth (gbps)");
bool done = true;
const int NTHREAD_WARP =
env.must_get_aspect_report<uint32_t>("Device", "LogicThreadCount");
const int NSM =
env.must_get_aspect_report<uint32_t>("Device", "SmCount");
const size_t RANGE =
env.must_get_aspect_report<uint32_t>("Device", "MaxConstMemSize");
// Size configs in bytes. These settings should be adjusted by hand.
const uint32_t VEC_WIDTH = 4;
const uint32_t NFLUSH = 16;
const uint32_t NUNROLL = 16;
const uint32_t NITER = 4;
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
const size_t VEC_SIZE = VEC_WIDTH * sizeof(float);
auto bench = [&](size_t access_size) {
const size_t CACHE_SIZE = access_size;
const size_t NVEC = RANGE / VEC_SIZE;
const size_t NVEC_CACHE = CACHE_SIZE / VEC_SIZE;
// The thread count is doesn't divide by thread workload basically because
// of the limited memory size. Constant memory and local memory are
// usually sub-MB level but buffer and images can go upto gigs.
const int nthread_total = NVEC;
const int local_x = NTHREAD_WARP;
const int global_x = (nthread_total / local_x * local_x) * NSM * NFLUSH;
//log::debug("local_x=", local_x, "; global_x=", global_x);
auto src = util::format(R"(
__kernel void const_mem_bandwidth(
__constant float4 *A,
__global float4 *B,
__private const int niter,
__private const int addr_mask
) {
float4 sum = 0;
int offset = (get_group_id(0) * )", local_x * NREAD_PER_THREAD,
R"( + get_local_id(0)) & addr_mask;
for (int i = 0; i < niter; ++i)
{)", [&]() {
std::stringstream ss;
for (int i = 0; i < NUNROLL; ++i) {
ss << "sum *= A[offset]; offset = (offset + " << local_x
<< ") & addr_mask;\n";
}
return ss.str();
}(), R"(
}
B[get_local_id(0)] = sum;
})");
//log::debug(src);
cl::Program program = env.create_program(src, "");
cl::Kernel kernel = env.create_kernel(program, "const_mem_bandwidth");
cl::Buffer in_buf = env.create_buf(0, CACHE_SIZE);
cl::Buffer out_buf = env.create_buf(0, VEC_SIZE * NTHREAD_WARP);
cl::NDRange global(global_x, 1, 1);
cl::NDRange local(local_x, 1, 1);
kernel.setArg(0, in_buf);
kernel.setArg(1, out_buf);
kernel.setArg(2, int(NITER));
kernel.setArg(3, int(NVEC_CACHE - 1));
auto time = env.bench_kernel(kernel, local, global, 10);
const size_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
auto gbps = SIZE_TRANS * 1e-3 / time;
log::debug("constant memory bandwidth accessing ", access_size,
"B unique data is ", gbps, " gbps (", time, " us)");
env.table().push(access_size, time, gbps);
return gbps;
};
MaxStats<double> max_bandwidth {};
MinStats<double> min_bandwidth {};
for (size_t access_size = VEC_SIZE; access_size < RANGE; access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth.push(gbps);
min_bandwidth.push(gbps);
}
env.report_value("MaxBandwidth", max_bandwidth);
env.report_value("MinBandwidth", min_bandwidth);
log::info("discovered constant memory read bandwidth min=",
(double)min_bandwidth, "; max=", (double)max_bandwidth);
env.report_ready(done);
}
void local_mem_bandwidth(Environment& env) {
if (env.report_started_lazy("LocalMemBandwidth")) { return; }
env.init_table("range (byte)", "t (us)", "bandwidth (gbps)");
bool done = true;
const int NTHREAD_LOGIC =
env.must_get_aspect_report<uint32_t>("Device", "LogicThreadCount");
const int NSM =
env.must_get_aspect_report<uint32_t>("Device", "SmCount");
const size_t RANGE =
env.must_get_aspect_report<uint32_t>("Device", "MaxLocalMemSize");
// Size configs in bytes. These settings should be adjusted by hand.
const uint32_t VEC_WIDTH = 4;
const uint32_t NFLUSH = 16;
const uint32_t NUNROLL = 16;
const uint32_t NITER = 4;
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
const size_t VEC_SIZE = VEC_WIDTH * sizeof(float);
auto bench = [&](size_t access_size) {
const size_t CACHE_SIZE = access_size;
const size_t NVEC = RANGE / VEC_SIZE;
const size_t NVEC_CACHE = CACHE_SIZE / VEC_SIZE;
const int nthread_total = NVEC;
const int local_x = NTHREAD_LOGIC;
const int global_x = (nthread_total / local_x * local_x) * NSM * NFLUSH;
//log::debug("local_x=", local_x, "; global_x=", global_x);
auto src = util::format(R"(
__kernel void local_mem_bandwidth(
__global float4 *B,
__private const int niter,
__private const int addr_mask
) {
__local float4 A[)", CACHE_SIZE / VEC_SIZE, R"(];
A[get_local_id(0)] = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = 0;
int offset = (get_group_id(0) * )", local_x * NREAD_PER_THREAD,
R"( + get_local_id(0)) & addr_mask;
for (int i = 0; i < niter; ++i)
{)", [&]() {
std::stringstream ss;
for (int i = 0; i < NUNROLL; ++i) {
ss << "sum *= A[offset]; offset = (offset + " << local_x
<< ") & addr_mask;\n";
}
return ss.str();
}(), R"(
}
B[get_local_id(0)] = sum;
})");
//log::debug(src);
cl::Program program = env.create_program(src, "");
cl::Kernel kernel = env.create_kernel(program, "local_mem_bandwidth");
cl::Buffer out_buf = env.create_buf(0, VEC_SIZE * NTHREAD_LOGIC);
cl::NDRange global(global_x, 1, 1);
cl::NDRange local(local_x, 1, 1);
kernel.setArg(0, out_buf);
kernel.setArg(1, int(NITER));
kernel.setArg(2, int(NVEC_CACHE - 1));
auto time = env.bench_kernel(kernel, local, global, 10);
const size_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
auto gbps = SIZE_TRANS * 1e-3 / time;
log::debug("local memory bandwidth accessing ", access_size,
"B unique data is ", gbps, " gbps (", time, " us)");
env.table().push(access_size, time, gbps);
return gbps;
};
MaxStats<double> max_bandwidth {};
MinStats<double> min_bandwidth {};
for (size_t access_size = VEC_SIZE; access_size < RANGE; access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth.push(gbps);
min_bandwidth.push(gbps);
}
env.report_value("MaxBandwidth", max_bandwidth);
env.report_value("MinBandwidth", min_bandwidth);
log::info("discovered local memory read bandwidth min=",
(double)min_bandwidth, "; max=", (double)max_bandwidth);
env.report_ready(done);
}
// This aspect tests the warping of the SMs. A warp is an atomic schedule unit
// where all threads in a warp can be executed in parallel. An GPU SM usually
// consumes more threads than it physically can so that it can hide the
@ -1497,16 +1689,18 @@ void guarded_main(const std::string& clear_aspect) {
APP = std::make_unique<ArchProbe>(0);
APP->clear_aspect_report(clear_aspect);
(*APP)
.with_aspect(aspects::warp_size)
.with_aspect(aspects::gflops)
.with_aspect(aspects::reg_count)
.with_aspect(aspects::buf_vec_width)
.with_aspect(aspects::img_cacheline_size)
.with_aspect(aspects::buf_cacheline_size)
.with_aspect(aspects::img_bandwidth)
.with_aspect(aspects::buf_bandwidth)
.with_aspect(aspects::img_cache_hierarchy_pchase)
.with_aspect(aspects::buf_cache_hierarchy_pchase)
//.with_aspect(aspects::warp_size)
//.with_aspect(aspects::gflops)
//.with_aspect(aspects::reg_count)
//.with_aspect(aspects::buf_vec_width)
//.with_aspect(aspects::img_cacheline_size)
//.with_aspect(aspects::buf_cacheline_size)
//.with_aspect(aspects::img_bandwidth)
//.with_aspect(aspects::buf_bandwidth)
.with_aspect(aspects::const_mem_bandwidth)
.with_aspect(aspects::local_mem_bandwidth)
//.with_aspect(aspects::img_cache_hierarchy_pchase)
//.with_aspect(aspects::buf_cache_hierarchy_pchase)
;
APP.reset();

Просмотреть файл

@ -27,6 +27,11 @@ DeviceReport collect_dev_report(const cl::Device& dev) {
dev.getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
dev_report.buf_size_max = dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
dev_report.buf_cache_size = dev.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
// Special memory detail.
dev_report.const_mem_size_max =
dev.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>();
dev_report.local_mem_size_max =
dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
// Image memory detail.
dev_report.support_img = dev.getInfo<CL_DEVICE_IMAGE_SUPPORT>();
if (dev_report.support_img) {
@ -108,6 +113,8 @@ void report_dev(Environment& env) {
env.report_value("SmCount", env.dev_report.nsm);
env.report_value("LogicThreadCount", env.dev_report.nthread_logic);
env.report_value("MaxBufferSize", env.dev_report.buf_size_max);
env.report_value("MaxConstMemSize", env.dev_report.const_mem_size_max);
env.report_value("MaxLocalMemSize", env.dev_report.local_mem_size_max);
env.report_value("CacheSize", env.dev_report.buf_cache_size);
env.report_value("CachelineSize", env.dev_report.buf_cacheline_size);
if (env.dev_report.support_img) {

Просмотреть файл

@ -32,6 +32,9 @@ struct DeviceReport {
size_t buf_size_max;
size_t buf_cache_size;
size_t const_mem_size_max;
size_t local_mem_size_max;
bool support_img;
uint32_t img_width_max;
uint32_t img_height_max;

Просмотреть файл

@ -1 +1,42 @@
{"BufferCacheHierarchyPChase":{"Compensate":0.01,"DataSizeMax":8.38861e+06,"Threshold":10},"BufferCachelineSize":{"Compensate":0.01,"Threshold":10},"BufferVecWidth":{"Compensate":0.01,"Threshold":10},"Gflops":{"Compensate":0.01,"Threshold":10},"ImageCacheHierarchyPChase":{"Compensate":0.01,"DataSizeMax":262144,"Threshold":10},"ImageCachelineSize":{"Compensate":0.01,"Threshold":10},"RegCount":{"Compensate":0.01,"NGrpMax":64,"NGrpMin":1,"NGrpStep":1,"NRegMax":512,"NRegMin":1,"NRegStep":1,"Threshold":10},"WarpSizeMethodB":{"Compensate":0.01,"Threshold":10}}
{
"BufferCacheHierarchyPChase": {
"Compensate": 0.01,
"DataSizeMax": 8.38861e+06,
"Threshold": 10
},
"BufferCachelineSize": {
"Compensate": 0.01,
"Threshold": 10
},
"BufferVecWidth": {
"Compensate": 0.01,
"Threshold": 10
},
"Gflops": {
"Compensate": 0.01,
"Threshold": 10
},
"ImageCacheHierarchyPChase": {
"Compensate": 0.01,
"DataSizeMax": 262144,
"Threshold": 10
},
"ImageCachelineSize": {
"Compensate": 0.01,
"Threshold": 10
},
"RegCount": {
"Compensate": 0.01,
"NGrpMax": 64,
"NGrpMin": 1,
"NGrpStep": 1,
"NRegMax": 512,
"NRegMin": 1,
"NRegStep": 1,
"Threshold": 10
},
"WarpSizeMethodB": {
"Compensate": 0.01,
"Threshold": 10
}
}

Просмотреть файл

@ -1 +1,86 @@
{"BufferBandwidth":{"Done":true,"MaxBandwidth":96.3256,"MinBandwidth":30.0554},"BufferCacheHierarchyPChase":{"CacheVectorCountLv1":125744,"CacheVectorCountLv2":132608,"CacheVectorCountLv3":133728,"CacheVectorCountLv4":136080,"Done":true},"BufferCachelineSize":{"BufTopLevelCachelineSize":64,"Done":true},"BufferVecWidth":{"BufferVecSize":4,"Done":true},"Device":{"CacheSize":131072,"CachelineSize":64,"Done":true,"LogicThreadCount":1024,"MaxBufferSize":2.87688e+09,"MaxImageHeight":16384,"MaxImageWidth":16384,"PageSize_QCOM":4096,"SmCount":2},"Gflops":{"Done":true,"FloatArch":"SISD","FloatGflops":889.891,"FloatVecComponentCount":1,"HalfArch":"SISD","HalfGflops":890.087,"HalfVecComponentCount":1},"ImageBandwidth":{"Done":true,"MaxBandwidth":194.55,"MinBandwidth":68.174},"ImageCacheHierarchyPChase":{"CachePixelCountLv1":1024,"CachePixelCountLv2":139504,"Done":true},"ImageCachelineSize":{"Done":true,"ImgCachelineDim":"X","ImgCachelineSize":32,"ImgMinTimeConcurThreadCountX":64,"ImgMinTimeConcurThreadCountY":32},"RegCount":{"Done":true,"FullRegConcurWorkgroupCount":12,"HalfRegConcurWorkgroupCount":24,"RegCount":183,"RegType":"Pooled"},"WarpSizeMethodA":{"Done":true,"WarpThreadCount":128},"WarpSizeMethodB":{"Done":true,"WarpThreadCount":64}}
{
"BufferBandwidth": {
"Done": true,
"MaxBandwidth": 96.3256,
"MinBandwidth": 30.0554
},
"BufferCacheHierarchyPChase": {
"CacheVectorCountLv1": 125744,
"CacheVectorCountLv2": 132608,
"CacheVectorCountLv3": 133728,
"CacheVectorCountLv4": 136080,
"Done": true
},
"BufferCachelineSize": {
"BufTopLevelCachelineSize": 64,
"Done": true
},
"BufferVecWidth": {
"BufferVecSize": 4,
"Done": true
},
"ConstMemBandwidth": {
"Done": true,
"MaxBandwidth": 95.2991,
"MinBandwidth": 35.801
},
"Device": {
"CacheSize": 131072,
"CachelineSize": 64,
"Done": true,
"LogicThreadCount": 1024,
"MaxBufferSize": 2.86618e+09,
"MaxConstMemSize": 65536,
"MaxImageHeight": 16384,
"MaxImageWidth": 16384,
"MaxLocalMemSize": 32768,
"PageSize_QCOM": 4096,
"SmCount": 2
},
"Gflops": {
"Done": true,
"FloatArch": "SISD",
"FloatGflops": 889.891,
"FloatVecComponentCount": 1,
"HalfArch": "SISD",
"HalfGflops": 890.087,
"HalfVecComponentCount": 1
},
"ImageBandwidth": {
"Done": true,
"MaxBandwidth": 194.55,
"MinBandwidth": 68.174
},
"ImageCacheHierarchyPChase": {
"CachePixelCountLv1": 1024,
"CachePixelCountLv2": 139504,
"Done": true
},
"ImageCachelineSize": {
"Done": true,
"ImgCachelineDim": "X",
"ImgCachelineSize": 32,
"ImgMinTimeConcurThreadCountX": 64,
"ImgMinTimeConcurThreadCountY": 32
},
"LocalMemBandwidth": {
"Done": true,
"MaxBandwidth": 145.798,
"MinBandwidth": 98.7546
},
"RegCount": {
"Done": true,
"FullRegConcurWorkgroupCount": 12,
"HalfRegConcurWorkgroupCount": 24,
"RegCount": 183,
"RegType": "Pooled"
},
"WarpSizeMethodA": {
"Done": true,
"WarpThreadCount": 128
},
"WarpSizeMethodB": {
"Done": true,
"WarpThreadCount": 64
}
}

Просмотреть файл

@ -0,0 +1,13 @@
range (byte),t (us),bandwidth (gbps)
16,1896.45,70.7732
32,1895.04,70.8258
64,3641.47,36.8581
128,3748.99,35.801
256,2089.6,64.2313
512,1963.52,68.3557
1024,1934.08,69.3962
2048,1437.06,93.3977
4096,1432.58,93.6898
8192,1412.48,95.0227
16384,1408.38,95.2991
32768,1412.22,95.04
1 range (byte) t (us) bandwidth (gbps)
2 16 1896.45 70.7732
3 32 1895.04 70.8258
4 64 3641.47 36.8581
5 128 3748.99 35.801
6 256 2089.6 64.2313
7 512 1963.52 68.3557
8 1024 1934.08 69.3962
9 2048 1437.06 93.3977
10 4096 1432.58 93.6898
11 8192 1412.48 95.0227
12 16384 1408.38 95.2991
13 32768 1412.22 95.04

Просмотреть файл

@ -0,0 +1,12 @@
range (byte),t (us),bandwidth (gbps)
16,462.208,145.192
32,460.288,145.798
64,479.872,139.847
128,667.136,100.592
256,679.552,98.7546
512,661.12,101.508
1024,659.968,101.685
2048,653.056,102.761
4096,652.032,102.923
8192,651.008,103.085
16384,651.008,103.085
1 range (byte) t (us) bandwidth (gbps)
2 16 462.208 145.192
3 32 460.288 145.798
4 64 479.872 139.847
5 128 667.136 100.592
6 256 679.552 98.7546
7 512 661.12 101.508
8 1024 659.968 101.685
9 2048 653.056 102.761
10 4096 652.032 102.923
11 8192 651.008 103.085
12 16384 651.008 103.085

Просмотреть файл

@ -20,6 +20,8 @@ ninja: no work to do.
[INFO] reported 'SmCount' = '2'
[INFO] reported 'LogicThreadCount' = '1024'
[INFO] reported 'MaxBufferSize' = '2876878848'
[INFO] reported 'MaxConstMemSize' = '65536'
[INFO] reported 'MaxLocalMemSize' = '32768'
[INFO] reported 'CacheSize' = '131072'
[INFO] reported 'CachelineSize' = '64'
[INFO] reported 'MaxImageWidth' = '16384'
@ -757,6 +759,51 @@ ninja: no work to do.
[INFO] discovered buffer read bandwidth min=30.0554; max=96.3256
[INFO] reported 'Done' = '1'
[INFO] saved data table to 'BufferBandwidth.csv'
[INFO] [ConstMemBandwidth]
[INFO] initialized table for aspect 'ConstMemBandwidth'
[INFO] already know that 'LogicThreadCount' from aspect 'Device' is 1024
[INFO] already know that 'SmCount' from aspect 'Device' is 2
[INFO] already know that 'MaxConstMemSize' from aspect 'Device' is 65536
[DEBUG] constant memory bandwidth accessing 16B unique data is 70.6921 gbps (1898.62 us)
[DEBUG] constant memory bandwidth accessing 32B unique data is 70.778 gbps (1896.32 us)
[DEBUG] constant memory bandwidth accessing 64B unique data is 36.8672 gbps (3640.58 us)
[DEBUG] constant memory bandwidth accessing 128B unique data is 35.8255 gbps (3746.43 us)
[DEBUG] constant memory bandwidth accessing 256B unique data is 64.1723 gbps (2091.52 us)
[DEBUG] constant memory bandwidth accessing 512B unique data is 68.3512 gbps (1963.65 us)
[DEBUG] constant memory bandwidth accessing 1024B unique data is 69.4329 gbps (1933.06 us)
[DEBUG] constant memory bandwidth accessing 2048B unique data is 93.2814 gbps (1438.85 us)
[DEBUG] constant memory bandwidth accessing 4096B unique data is 93.6982 gbps (1432.45 us)
[DEBUG] constant memory bandwidth accessing 8192B unique data is 94.9539 gbps (1413.5 us)
[DEBUG] constant memory bandwidth accessing 16384B unique data is 95.2645 gbps (1408.9 us)
[DEBUG] constant memory bandwidth accessing 32768B unique data is 95.1176 gbps (1411.07 us)
[WARN] aspect report ('ConstMemBandwidth') is invalid, a new record is created
[INFO] reported 'MaxBandwidth' = '95.2645'
[INFO] reported 'MinBandwidth' = '35.8255'
[INFO] discovered constant memory read bandwidth min=35.8255; max=95.2645
[INFO] reported 'Done' = '1'
[INFO] saved data table to 'ConstMemBandwidth.csv'
[INFO] [LocalMemBandwidth]
[INFO] initialized table for aspect 'LocalMemBandwidth'
[INFO] already know that 'LogicThreadCount' from aspect 'Device' is 1024
[INFO] already know that 'SmCount' from aspect 'Device' is 2
[INFO] already know that 'MaxLocalMemSize' from aspect 'Device' is 32768
[DEBUG] local memory bandwidth accessing 16B unique data is 145.757 gbps (460.416 us)
[DEBUG] local memory bandwidth accessing 32B unique data is 145.434 gbps (461.44 us)
[DEBUG] local memory bandwidth accessing 64B unique data is 139.847 gbps (479.872 us)
[DEBUG] local memory bandwidth accessing 128B unique data is 100.708 gbps (666.368 us)
[DEBUG] local memory bandwidth accessing 256B unique data is 98.8104 gbps (679.168 us)
[DEBUG] local memory bandwidth accessing 512B unique data is 101.37 gbps (662.016 us)
[DEBUG] local memory bandwidth accessing 1024B unique data is 101.685 gbps (659.968 us)
[DEBUG] local memory bandwidth accessing 2048B unique data is 102.902 gbps (652.16 us)
[DEBUG] local memory bandwidth accessing 4096B unique data is 102.923 gbps (652.032 us)
[DEBUG] local memory bandwidth accessing 8192B unique data is 103.227 gbps (650.112 us)
[DEBUG] local memory bandwidth accessing 16384B unique data is 103.064 gbps (651.136 us)
[WARN] aspect report ('LocalMemBandwidth') is invalid, a new record is created
[INFO] reported 'MaxBandwidth' = '145.757'
[INFO] reported 'MinBandwidth' = '98.8104'
[INFO] discovered local memory read bandwidth min=98.8104; max=145.757
[INFO] reported 'Done' = '1'
[INFO] saved data table to 'LocalMemBandwidth.csv'
[INFO] [ImageCacheHierarchyPChase]
[INFO] initialized table for aspect 'ImageCacheHierarchyPChase'
[INFO] already know that 'MaxImageWidth' from aspect 'Device' is 16384

Просмотреть файл

@ -6,7 +6,7 @@ param(
)
if (-not $Arch) {
$Arch = "armeabi-v7a"
$Arch = "arm64-v8a"
}
if (-not(Test-Path "build-android-$Arch")) {