зеркало из https://github.com/microsoft/ArchProbe.git
Constant and local memory probing
This commit is contained in:
Родитель
e8e45ec6e2
Коммит
99ace88b16
|
@ -50,7 +50,7 @@ A GPU hardware has many traits like GFLOPS and cache size. ArchProbe implements
|
|||
- `RegCount` Number of registers available to a thread and whether the register file is shared among warps;
|
||||
- `BufferVecWidth` Optimal vector width to read the most data in a single memory access;
|
||||
- `{Image|Buffer}CachelineSize` Top level cacheline size of image/buffer;
|
||||
- `{Image|Buffer}Bandwidth` Peak read-only bandwidth of image/buffer;
|
||||
- `{Image|Buffer|ConstMem|LocalMem}Bandwidth` Peak read-only bandwidth of image/buffer/constant/local memory;
|
||||
- `{Image|Buffer}CacheHierarchyPChase` Size of each level of cache of image/buffer by the P-chase method.
|
||||
|
||||
If the `-v` flag is given, ArchProbe prints extra human-readable logs to `stdout` which is also a good source of information.
|
||||
|
|
|
@ -1019,6 +1019,198 @@ namespace aspects {
|
|||
env.report_ready(done);
|
||||
}
|
||||
|
||||
void const_mem_bandwidth(Environment& env) {
|
||||
if (env.report_started_lazy("ConstMemBandwidth")) { return; }
|
||||
env.init_table("range (byte)", "t (us)", "bandwidth (gbps)");
|
||||
bool done = true;
|
||||
|
||||
const int NTHREAD_WARP =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "LogicThreadCount");
|
||||
const int NSM =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "SmCount");
|
||||
const size_t RANGE =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "MaxConstMemSize");
|
||||
|
||||
// Size configs in bytes. These settings should be adjusted by hand.
|
||||
const uint32_t VEC_WIDTH = 4;
|
||||
const uint32_t NFLUSH = 16;
|
||||
const uint32_t NUNROLL = 16;
|
||||
const uint32_t NITER = 4;
|
||||
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
|
||||
|
||||
const size_t VEC_SIZE = VEC_WIDTH * sizeof(float);
|
||||
auto bench = [&](size_t access_size) {
|
||||
const size_t CACHE_SIZE = access_size;
|
||||
|
||||
const size_t NVEC = RANGE / VEC_SIZE;
|
||||
const size_t NVEC_CACHE = CACHE_SIZE / VEC_SIZE;
|
||||
|
||||
// The thread count is doesn't divide by thread workload basically because
|
||||
// of the limited memory size. Constant memory and local memory are
|
||||
// usually sub-MB level but buffer and images can go upto gigs.
|
||||
const int nthread_total = NVEC;
|
||||
const int local_x = NTHREAD_WARP;
|
||||
const int global_x = (nthread_total / local_x * local_x) * NSM * NFLUSH;
|
||||
//log::debug("local_x=", local_x, "; global_x=", global_x);
|
||||
|
||||
auto src = util::format(R"(
|
||||
__kernel void const_mem_bandwidth(
|
||||
__constant float4 *A,
|
||||
__global float4 *B,
|
||||
__private const int niter,
|
||||
__private const int addr_mask
|
||||
) {
|
||||
float4 sum = 0;
|
||||
int offset = (get_group_id(0) * )", local_x * NREAD_PER_THREAD,
|
||||
R"( + get_local_id(0)) & addr_mask;
|
||||
|
||||
for (int i = 0; i < niter; ++i)
|
||||
{)", [&]() {
|
||||
std::stringstream ss;
|
||||
for (int i = 0; i < NUNROLL; ++i) {
|
||||
ss << "sum *= A[offset]; offset = (offset + " << local_x
|
||||
<< ") & addr_mask;\n";
|
||||
}
|
||||
return ss.str();
|
||||
}(), R"(
|
||||
}
|
||||
B[get_local_id(0)] = sum;
|
||||
})");
|
||||
//log::debug(src);
|
||||
cl::Program program = env.create_program(src, "");
|
||||
cl::Kernel kernel = env.create_kernel(program, "const_mem_bandwidth");
|
||||
|
||||
cl::Buffer in_buf = env.create_buf(0, CACHE_SIZE);
|
||||
cl::Buffer out_buf = env.create_buf(0, VEC_SIZE * NTHREAD_WARP);
|
||||
|
||||
cl::NDRange global(global_x, 1, 1);
|
||||
cl::NDRange local(local_x, 1, 1);
|
||||
|
||||
kernel.setArg(0, in_buf);
|
||||
kernel.setArg(1, out_buf);
|
||||
kernel.setArg(2, int(NITER));
|
||||
kernel.setArg(3, int(NVEC_CACHE - 1));
|
||||
auto time = env.bench_kernel(kernel, local, global, 10);
|
||||
const size_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
|
||||
auto gbps = SIZE_TRANS * 1e-3 / time;
|
||||
log::debug("constant memory bandwidth accessing ", access_size,
|
||||
"B unique data is ", gbps, " gbps (", time, " us)");
|
||||
env.table().push(access_size, time, gbps);
|
||||
return gbps;
|
||||
};
|
||||
|
||||
MaxStats<double> max_bandwidth {};
|
||||
MinStats<double> min_bandwidth {};
|
||||
for (size_t access_size = VEC_SIZE; access_size < RANGE; access_size *= 2) {
|
||||
double gbps = bench(access_size);
|
||||
max_bandwidth.push(gbps);
|
||||
min_bandwidth.push(gbps);
|
||||
}
|
||||
|
||||
env.report_value("MaxBandwidth", max_bandwidth);
|
||||
env.report_value("MinBandwidth", min_bandwidth);
|
||||
|
||||
log::info("discovered constant memory read bandwidth min=",
|
||||
(double)min_bandwidth, "; max=", (double)max_bandwidth);
|
||||
|
||||
env.report_ready(done);
|
||||
}
|
||||
|
||||
void local_mem_bandwidth(Environment& env) {
|
||||
if (env.report_started_lazy("LocalMemBandwidth")) { return; }
|
||||
env.init_table("range (byte)", "t (us)", "bandwidth (gbps)");
|
||||
bool done = true;
|
||||
|
||||
const int NTHREAD_LOGIC =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "LogicThreadCount");
|
||||
const int NSM =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "SmCount");
|
||||
const size_t RANGE =
|
||||
env.must_get_aspect_report<uint32_t>("Device", "MaxLocalMemSize");
|
||||
|
||||
// Size configs in bytes. These settings should be adjusted by hand.
|
||||
const uint32_t VEC_WIDTH = 4;
|
||||
const uint32_t NFLUSH = 16;
|
||||
const uint32_t NUNROLL = 16;
|
||||
const uint32_t NITER = 4;
|
||||
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
|
||||
|
||||
const size_t VEC_SIZE = VEC_WIDTH * sizeof(float);
|
||||
auto bench = [&](size_t access_size) {
|
||||
const size_t CACHE_SIZE = access_size;
|
||||
|
||||
const size_t NVEC = RANGE / VEC_SIZE;
|
||||
const size_t NVEC_CACHE = CACHE_SIZE / VEC_SIZE;
|
||||
|
||||
const int nthread_total = NVEC;
|
||||
const int local_x = NTHREAD_LOGIC;
|
||||
const int global_x = (nthread_total / local_x * local_x) * NSM * NFLUSH;
|
||||
//log::debug("local_x=", local_x, "; global_x=", global_x);
|
||||
|
||||
auto src = util::format(R"(
|
||||
__kernel void local_mem_bandwidth(
|
||||
__global float4 *B,
|
||||
__private const int niter,
|
||||
__private const int addr_mask
|
||||
) {
|
||||
__local float4 A[)", CACHE_SIZE / VEC_SIZE, R"(];
|
||||
A[get_local_id(0)] = get_local_id(0);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float4 sum = 0;
|
||||
int offset = (get_group_id(0) * )", local_x * NREAD_PER_THREAD,
|
||||
R"( + get_local_id(0)) & addr_mask;
|
||||
|
||||
for (int i = 0; i < niter; ++i)
|
||||
{)", [&]() {
|
||||
std::stringstream ss;
|
||||
for (int i = 0; i < NUNROLL; ++i) {
|
||||
ss << "sum *= A[offset]; offset = (offset + " << local_x
|
||||
<< ") & addr_mask;\n";
|
||||
}
|
||||
return ss.str();
|
||||
}(), R"(
|
||||
}
|
||||
B[get_local_id(0)] = sum;
|
||||
})");
|
||||
//log::debug(src);
|
||||
cl::Program program = env.create_program(src, "");
|
||||
cl::Kernel kernel = env.create_kernel(program, "local_mem_bandwidth");
|
||||
|
||||
cl::Buffer out_buf = env.create_buf(0, VEC_SIZE * NTHREAD_LOGIC);
|
||||
|
||||
cl::NDRange global(global_x, 1, 1);
|
||||
cl::NDRange local(local_x, 1, 1);
|
||||
|
||||
kernel.setArg(0, out_buf);
|
||||
kernel.setArg(1, int(NITER));
|
||||
kernel.setArg(2, int(NVEC_CACHE - 1));
|
||||
auto time = env.bench_kernel(kernel, local, global, 10);
|
||||
const size_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
|
||||
auto gbps = SIZE_TRANS * 1e-3 / time;
|
||||
log::debug("local memory bandwidth accessing ", access_size,
|
||||
"B unique data is ", gbps, " gbps (", time, " us)");
|
||||
env.table().push(access_size, time, gbps);
|
||||
return gbps;
|
||||
};
|
||||
|
||||
MaxStats<double> max_bandwidth {};
|
||||
MinStats<double> min_bandwidth {};
|
||||
for (size_t access_size = VEC_SIZE; access_size < RANGE; access_size *= 2) {
|
||||
double gbps = bench(access_size);
|
||||
max_bandwidth.push(gbps);
|
||||
min_bandwidth.push(gbps);
|
||||
}
|
||||
|
||||
env.report_value("MaxBandwidth", max_bandwidth);
|
||||
env.report_value("MinBandwidth", min_bandwidth);
|
||||
|
||||
log::info("discovered local memory read bandwidth min=",
|
||||
(double)min_bandwidth, "; max=", (double)max_bandwidth);
|
||||
|
||||
env.report_ready(done);
|
||||
}
|
||||
|
||||
// This aspect tests the warping of the SMs. A warp is an atomic schedule unit
|
||||
// where all threads in a warp can be executed in parallel. An GPU SM usually
|
||||
// consumes more threads than it physically can so that it can hide the
|
||||
|
@ -1497,16 +1689,18 @@ void guarded_main(const std::string& clear_aspect) {
|
|||
APP = std::make_unique<ArchProbe>(0);
|
||||
APP->clear_aspect_report(clear_aspect);
|
||||
(*APP)
|
||||
.with_aspect(aspects::warp_size)
|
||||
.with_aspect(aspects::gflops)
|
||||
.with_aspect(aspects::reg_count)
|
||||
.with_aspect(aspects::buf_vec_width)
|
||||
.with_aspect(aspects::img_cacheline_size)
|
||||
.with_aspect(aspects::buf_cacheline_size)
|
||||
.with_aspect(aspects::img_bandwidth)
|
||||
.with_aspect(aspects::buf_bandwidth)
|
||||
.with_aspect(aspects::img_cache_hierarchy_pchase)
|
||||
.with_aspect(aspects::buf_cache_hierarchy_pchase)
|
||||
//.with_aspect(aspects::warp_size)
|
||||
//.with_aspect(aspects::gflops)
|
||||
//.with_aspect(aspects::reg_count)
|
||||
//.with_aspect(aspects::buf_vec_width)
|
||||
//.with_aspect(aspects::img_cacheline_size)
|
||||
//.with_aspect(aspects::buf_cacheline_size)
|
||||
//.with_aspect(aspects::img_bandwidth)
|
||||
//.with_aspect(aspects::buf_bandwidth)
|
||||
.with_aspect(aspects::const_mem_bandwidth)
|
||||
.with_aspect(aspects::local_mem_bandwidth)
|
||||
//.with_aspect(aspects::img_cache_hierarchy_pchase)
|
||||
//.with_aspect(aspects::buf_cache_hierarchy_pchase)
|
||||
;
|
||||
|
||||
APP.reset();
|
||||
|
|
|
@ -27,6 +27,11 @@ DeviceReport collect_dev_report(const cl::Device& dev) {
|
|||
dev.getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
|
||||
dev_report.buf_size_max = dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
|
||||
dev_report.buf_cache_size = dev.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
|
||||
// Special memory detail.
|
||||
dev_report.const_mem_size_max =
|
||||
dev.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>();
|
||||
dev_report.local_mem_size_max =
|
||||
dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
|
||||
// Image memory detail.
|
||||
dev_report.support_img = dev.getInfo<CL_DEVICE_IMAGE_SUPPORT>();
|
||||
if (dev_report.support_img) {
|
||||
|
@ -108,6 +113,8 @@ void report_dev(Environment& env) {
|
|||
env.report_value("SmCount", env.dev_report.nsm);
|
||||
env.report_value("LogicThreadCount", env.dev_report.nthread_logic);
|
||||
env.report_value("MaxBufferSize", env.dev_report.buf_size_max);
|
||||
env.report_value("MaxConstMemSize", env.dev_report.const_mem_size_max);
|
||||
env.report_value("MaxLocalMemSize", env.dev_report.local_mem_size_max);
|
||||
env.report_value("CacheSize", env.dev_report.buf_cache_size);
|
||||
env.report_value("CachelineSize", env.dev_report.buf_cacheline_size);
|
||||
if (env.dev_report.support_img) {
|
||||
|
|
|
@ -32,6 +32,9 @@ struct DeviceReport {
|
|||
size_t buf_size_max;
|
||||
size_t buf_cache_size;
|
||||
|
||||
size_t const_mem_size_max;
|
||||
size_t local_mem_size_max;
|
||||
|
||||
bool support_img;
|
||||
uint32_t img_width_max;
|
||||
uint32_t img_height_max;
|
||||
|
|
|
@ -1 +1,42 @@
|
|||
{"BufferCacheHierarchyPChase":{"Compensate":0.01,"DataSizeMax":8.38861e+06,"Threshold":10},"BufferCachelineSize":{"Compensate":0.01,"Threshold":10},"BufferVecWidth":{"Compensate":0.01,"Threshold":10},"Gflops":{"Compensate":0.01,"Threshold":10},"ImageCacheHierarchyPChase":{"Compensate":0.01,"DataSizeMax":262144,"Threshold":10},"ImageCachelineSize":{"Compensate":0.01,"Threshold":10},"RegCount":{"Compensate":0.01,"NGrpMax":64,"NGrpMin":1,"NGrpStep":1,"NRegMax":512,"NRegMin":1,"NRegStep":1,"Threshold":10},"WarpSizeMethodB":{"Compensate":0.01,"Threshold":10}}
|
||||
{
|
||||
"BufferCacheHierarchyPChase": {
|
||||
"Compensate": 0.01,
|
||||
"DataSizeMax": 8.38861e+06,
|
||||
"Threshold": 10
|
||||
},
|
||||
"BufferCachelineSize": {
|
||||
"Compensate": 0.01,
|
||||
"Threshold": 10
|
||||
},
|
||||
"BufferVecWidth": {
|
||||
"Compensate": 0.01,
|
||||
"Threshold": 10
|
||||
},
|
||||
"Gflops": {
|
||||
"Compensate": 0.01,
|
||||
"Threshold": 10
|
||||
},
|
||||
"ImageCacheHierarchyPChase": {
|
||||
"Compensate": 0.01,
|
||||
"DataSizeMax": 262144,
|
||||
"Threshold": 10
|
||||
},
|
||||
"ImageCachelineSize": {
|
||||
"Compensate": 0.01,
|
||||
"Threshold": 10
|
||||
},
|
||||
"RegCount": {
|
||||
"Compensate": 0.01,
|
||||
"NGrpMax": 64,
|
||||
"NGrpMin": 1,
|
||||
"NGrpStep": 1,
|
||||
"NRegMax": 512,
|
||||
"NRegMin": 1,
|
||||
"NRegStep": 1,
|
||||
"Threshold": 10
|
||||
},
|
||||
"WarpSizeMethodB": {
|
||||
"Compensate": 0.01,
|
||||
"Threshold": 10
|
||||
}
|
||||
}
|
|
@ -1 +1,86 @@
|
|||
{"BufferBandwidth":{"Done":true,"MaxBandwidth":96.3256,"MinBandwidth":30.0554},"BufferCacheHierarchyPChase":{"CacheVectorCountLv1":125744,"CacheVectorCountLv2":132608,"CacheVectorCountLv3":133728,"CacheVectorCountLv4":136080,"Done":true},"BufferCachelineSize":{"BufTopLevelCachelineSize":64,"Done":true},"BufferVecWidth":{"BufferVecSize":4,"Done":true},"Device":{"CacheSize":131072,"CachelineSize":64,"Done":true,"LogicThreadCount":1024,"MaxBufferSize":2.87688e+09,"MaxImageHeight":16384,"MaxImageWidth":16384,"PageSize_QCOM":4096,"SmCount":2},"Gflops":{"Done":true,"FloatArch":"SISD","FloatGflops":889.891,"FloatVecComponentCount":1,"HalfArch":"SISD","HalfGflops":890.087,"HalfVecComponentCount":1},"ImageBandwidth":{"Done":true,"MaxBandwidth":194.55,"MinBandwidth":68.174},"ImageCacheHierarchyPChase":{"CachePixelCountLv1":1024,"CachePixelCountLv2":139504,"Done":true},"ImageCachelineSize":{"Done":true,"ImgCachelineDim":"X","ImgCachelineSize":32,"ImgMinTimeConcurThreadCountX":64,"ImgMinTimeConcurThreadCountY":32},"RegCount":{"Done":true,"FullRegConcurWorkgroupCount":12,"HalfRegConcurWorkgroupCount":24,"RegCount":183,"RegType":"Pooled"},"WarpSizeMethodA":{"Done":true,"WarpThreadCount":128},"WarpSizeMethodB":{"Done":true,"WarpThreadCount":64}}
|
||||
{
|
||||
"BufferBandwidth": {
|
||||
"Done": true,
|
||||
"MaxBandwidth": 96.3256,
|
||||
"MinBandwidth": 30.0554
|
||||
},
|
||||
"BufferCacheHierarchyPChase": {
|
||||
"CacheVectorCountLv1": 125744,
|
||||
"CacheVectorCountLv2": 132608,
|
||||
"CacheVectorCountLv3": 133728,
|
||||
"CacheVectorCountLv4": 136080,
|
||||
"Done": true
|
||||
},
|
||||
"BufferCachelineSize": {
|
||||
"BufTopLevelCachelineSize": 64,
|
||||
"Done": true
|
||||
},
|
||||
"BufferVecWidth": {
|
||||
"BufferVecSize": 4,
|
||||
"Done": true
|
||||
},
|
||||
"ConstMemBandwidth": {
|
||||
"Done": true,
|
||||
"MaxBandwidth": 95.2991,
|
||||
"MinBandwidth": 35.801
|
||||
},
|
||||
"Device": {
|
||||
"CacheSize": 131072,
|
||||
"CachelineSize": 64,
|
||||
"Done": true,
|
||||
"LogicThreadCount": 1024,
|
||||
"MaxBufferSize": 2.86618e+09,
|
||||
"MaxConstMemSize": 65536,
|
||||
"MaxImageHeight": 16384,
|
||||
"MaxImageWidth": 16384,
|
||||
"MaxLocalMemSize": 32768,
|
||||
"PageSize_QCOM": 4096,
|
||||
"SmCount": 2
|
||||
},
|
||||
"Gflops": {
|
||||
"Done": true,
|
||||
"FloatArch": "SISD",
|
||||
"FloatGflops": 889.891,
|
||||
"FloatVecComponentCount": 1,
|
||||
"HalfArch": "SISD",
|
||||
"HalfGflops": 890.087,
|
||||
"HalfVecComponentCount": 1
|
||||
},
|
||||
"ImageBandwidth": {
|
||||
"Done": true,
|
||||
"MaxBandwidth": 194.55,
|
||||
"MinBandwidth": 68.174
|
||||
},
|
||||
"ImageCacheHierarchyPChase": {
|
||||
"CachePixelCountLv1": 1024,
|
||||
"CachePixelCountLv2": 139504,
|
||||
"Done": true
|
||||
},
|
||||
"ImageCachelineSize": {
|
||||
"Done": true,
|
||||
"ImgCachelineDim": "X",
|
||||
"ImgCachelineSize": 32,
|
||||
"ImgMinTimeConcurThreadCountX": 64,
|
||||
"ImgMinTimeConcurThreadCountY": 32
|
||||
},
|
||||
"LocalMemBandwidth": {
|
||||
"Done": true,
|
||||
"MaxBandwidth": 145.798,
|
||||
"MinBandwidth": 98.7546
|
||||
},
|
||||
"RegCount": {
|
||||
"Done": true,
|
||||
"FullRegConcurWorkgroupCount": 12,
|
||||
"HalfRegConcurWorkgroupCount": 24,
|
||||
"RegCount": 183,
|
||||
"RegType": "Pooled"
|
||||
},
|
||||
"WarpSizeMethodA": {
|
||||
"Done": true,
|
||||
"WarpThreadCount": 128
|
||||
},
|
||||
"WarpSizeMethodB": {
|
||||
"Done": true,
|
||||
"WarpThreadCount": 64
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
range (byte),t (us),bandwidth (gbps)
|
||||
16,1896.45,70.7732
|
||||
32,1895.04,70.8258
|
||||
64,3641.47,36.8581
|
||||
128,3748.99,35.801
|
||||
256,2089.6,64.2313
|
||||
512,1963.52,68.3557
|
||||
1024,1934.08,69.3962
|
||||
2048,1437.06,93.3977
|
||||
4096,1432.58,93.6898
|
||||
8192,1412.48,95.0227
|
||||
16384,1408.38,95.2991
|
||||
32768,1412.22,95.04
|
|
|
@ -0,0 +1,12 @@
|
|||
range (byte),t (us),bandwidth (gbps)
|
||||
16,462.208,145.192
|
||||
32,460.288,145.798
|
||||
64,479.872,139.847
|
||||
128,667.136,100.592
|
||||
256,679.552,98.7546
|
||||
512,661.12,101.508
|
||||
1024,659.968,101.685
|
||||
2048,653.056,102.761
|
||||
4096,652.032,102.923
|
||||
8192,651.008,103.085
|
||||
16384,651.008,103.085
|
|
|
@ -20,6 +20,8 @@ ninja: no work to do.
|
|||
[INFO] reported 'SmCount' = '2'
|
||||
[INFO] reported 'LogicThreadCount' = '1024'
|
||||
[INFO] reported 'MaxBufferSize' = '2876878848'
|
||||
[INFO] reported 'MaxConstMemSize' = '65536'
|
||||
[INFO] reported 'MaxLocalMemSize' = '32768'
|
||||
[INFO] reported 'CacheSize' = '131072'
|
||||
[INFO] reported 'CachelineSize' = '64'
|
||||
[INFO] reported 'MaxImageWidth' = '16384'
|
||||
|
@ -757,6 +759,51 @@ ninja: no work to do.
|
|||
[INFO] discovered buffer read bandwidth min=30.0554; max=96.3256
|
||||
[INFO] reported 'Done' = '1'
|
||||
[INFO] saved data table to 'BufferBandwidth.csv'
|
||||
[INFO] [ConstMemBandwidth]
|
||||
[INFO] initialized table for aspect 'ConstMemBandwidth'
|
||||
[INFO] already know that 'LogicThreadCount' from aspect 'Device' is 1024
|
||||
[INFO] already know that 'SmCount' from aspect 'Device' is 2
|
||||
[INFO] already know that 'MaxConstMemSize' from aspect 'Device' is 65536
|
||||
[DEBUG] constant memory bandwidth accessing 16B unique data is 70.6921 gbps (1898.62 us)
|
||||
[DEBUG] constant memory bandwidth accessing 32B unique data is 70.778 gbps (1896.32 us)
|
||||
[DEBUG] constant memory bandwidth accessing 64B unique data is 36.8672 gbps (3640.58 us)
|
||||
[DEBUG] constant memory bandwidth accessing 128B unique data is 35.8255 gbps (3746.43 us)
|
||||
[DEBUG] constant memory bandwidth accessing 256B unique data is 64.1723 gbps (2091.52 us)
|
||||
[DEBUG] constant memory bandwidth accessing 512B unique data is 68.3512 gbps (1963.65 us)
|
||||
[DEBUG] constant memory bandwidth accessing 1024B unique data is 69.4329 gbps (1933.06 us)
|
||||
[DEBUG] constant memory bandwidth accessing 2048B unique data is 93.2814 gbps (1438.85 us)
|
||||
[DEBUG] constant memory bandwidth accessing 4096B unique data is 93.6982 gbps (1432.45 us)
|
||||
[DEBUG] constant memory bandwidth accessing 8192B unique data is 94.9539 gbps (1413.5 us)
|
||||
[DEBUG] constant memory bandwidth accessing 16384B unique data is 95.2645 gbps (1408.9 us)
|
||||
[DEBUG] constant memory bandwidth accessing 32768B unique data is 95.1176 gbps (1411.07 us)
|
||||
[WARN] aspect report ('ConstMemBandwidth') is invalid, a new record is created
|
||||
[INFO] reported 'MaxBandwidth' = '95.2645'
|
||||
[INFO] reported 'MinBandwidth' = '35.8255'
|
||||
[INFO] discovered constant memory read bandwidth min=35.8255; max=95.2645
|
||||
[INFO] reported 'Done' = '1'
|
||||
[INFO] saved data table to 'ConstMemBandwidth.csv'
|
||||
[INFO] [LocalMemBandwidth]
|
||||
[INFO] initialized table for aspect 'LocalMemBandwidth'
|
||||
[INFO] already know that 'LogicThreadCount' from aspect 'Device' is 1024
|
||||
[INFO] already know that 'SmCount' from aspect 'Device' is 2
|
||||
[INFO] already know that 'MaxLocalMemSize' from aspect 'Device' is 32768
|
||||
[DEBUG] local memory bandwidth accessing 16B unique data is 145.757 gbps (460.416 us)
|
||||
[DEBUG] local memory bandwidth accessing 32B unique data is 145.434 gbps (461.44 us)
|
||||
[DEBUG] local memory bandwidth accessing 64B unique data is 139.847 gbps (479.872 us)
|
||||
[DEBUG] local memory bandwidth accessing 128B unique data is 100.708 gbps (666.368 us)
|
||||
[DEBUG] local memory bandwidth accessing 256B unique data is 98.8104 gbps (679.168 us)
|
||||
[DEBUG] local memory bandwidth accessing 512B unique data is 101.37 gbps (662.016 us)
|
||||
[DEBUG] local memory bandwidth accessing 1024B unique data is 101.685 gbps (659.968 us)
|
||||
[DEBUG] local memory bandwidth accessing 2048B unique data is 102.902 gbps (652.16 us)
|
||||
[DEBUG] local memory bandwidth accessing 4096B unique data is 102.923 gbps (652.032 us)
|
||||
[DEBUG] local memory bandwidth accessing 8192B unique data is 103.227 gbps (650.112 us)
|
||||
[DEBUG] local memory bandwidth accessing 16384B unique data is 103.064 gbps (651.136 us)
|
||||
[WARN] aspect report ('LocalMemBandwidth') is invalid, a new record is created
|
||||
[INFO] reported 'MaxBandwidth' = '145.757'
|
||||
[INFO] reported 'MinBandwidth' = '98.8104'
|
||||
[INFO] discovered local memory read bandwidth min=98.8104; max=145.757
|
||||
[INFO] reported 'Done' = '1'
|
||||
[INFO] saved data table to 'LocalMemBandwidth.csv'
|
||||
[INFO] [ImageCacheHierarchyPChase]
|
||||
[INFO] initialized table for aspect 'ImageCacheHierarchyPChase'
|
||||
[INFO] already know that 'MaxImageWidth' from aspect 'Device' is 16384
|
||||
|
|
|
@ -6,7 +6,7 @@ param(
|
|||
)
|
||||
|
||||
if (-not $Arch) {
|
||||
$Arch = "armeabi-v7a"
|
||||
$Arch = "arm64-v8a"
|
||||
}
|
||||
|
||||
if (-not(Test-Path "build-android-$Arch")) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче