зеркало из https://github.com/microsoft/antares.git
add new backend type: c-ocl_nvidia (#194)
This commit is contained in:
Родитель
3ecd049cfa
Коммит
54f99b1284
|
@ -2,6 +2,7 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
//; eval_flags(c-ocl_amdgpu): -I/opt/rocm/opencl/include -L/opt/rocm/opencl/lib -lOpenCL -DCL_TARGET_OPENCL_VERSION=120
|
||||
//; eval_flags(c-ocl_nvidia): -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lOpenCL
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
|
@ -10,6 +11,7 @@ namespace ab {
|
|||
static cl_command_queue cmdqueue;
|
||||
static cl_device_id device_id;
|
||||
static cl_int stat;
|
||||
static size_t max_work_group_size;
|
||||
|
||||
void init(int dev) {
|
||||
cl_uint num_dev;
|
||||
|
@ -28,7 +30,6 @@ namespace ab {
|
|||
cmdqueue = clCreateCommandQueue(context, device_id, 0, &stat), CHECK_OK(stat == 0);
|
||||
|
||||
std::vector<char> dev_name(1024);
|
||||
size_t max_work_group_size = 0;
|
||||
CHECK_OK(0 == clGetDeviceInfo(device_id, CL_DEVICE_NAME, dev_name.size(), dev_name.data(), NULL));
|
||||
CHECK_OK(0 == clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL));
|
||||
fprintf(stderr, " (OCL_INFO: OCL Device Name = %s [max_work_groups: %zd])\n", dev_name.data(), max_work_group_size);
|
||||
|
@ -74,6 +75,8 @@ namespace ab {
|
|||
};
|
||||
size_t lx = query("get_local_id(0)"), ly = query("get_local_id(1)"), lz = query("get_local_id(2)");
|
||||
size_t gx = query("get_group_id(0)"), gy = query("get_group_id(1)"), gz = query("get_group_id(2)");
|
||||
|
||||
CHECK_OK(lx * ly * lz <= max_work_group_size);
|
||||
return { kernel, (void*)(gx * lx), (void*)(gy * ly), (void*)(gz * lz), (void*)lx, (void*)ly, (void*)lz };
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
c-ocl_amdgpu
|
|
@ -74,7 +74,7 @@ int main(int argc, char** argv)
|
|||
printf("\n- K/%d: %.10e\n", i, digest);
|
||||
}
|
||||
|
||||
{
|
||||
do {
|
||||
auto x = ab::recordTime();
|
||||
gm.compute(global_args.data());
|
||||
auto y = ab::recordTime();
|
||||
|
@ -82,8 +82,10 @@ int main(int argc, char** argv)
|
|||
|
||||
double tpr = ab::convertToElapsedTime(x, y);
|
||||
const char *expected_timeout = getenv("EXPECTED_TIMEOUT");
|
||||
if (expected_timeout && *expected_timeout && tpr > std::atof(expected_timeout))
|
||||
throw std::runtime_error(("Time limit exceeded: " + std::to_string(tpr) + " v.s. (expected) " + expected_timeout).c_str());
|
||||
if (expected_timeout && *expected_timeout && tpr > std::atof(expected_timeout)) {
|
||||
printf("\n- TPR: %g\n", tpr);
|
||||
break;
|
||||
}
|
||||
|
||||
int num_runs = (int)std::max(1LU, std::min(10000LU, (unsigned long)(1.0 / tpr)));
|
||||
tpr = 0.0f;
|
||||
|
@ -93,7 +95,7 @@ int main(int argc, char** argv)
|
|||
y = ab::recordTime();
|
||||
tpr = ab::convertToElapsedTime(x, y) / num_runs;
|
||||
printf("\n- TPR: %g\n", tpr);
|
||||
}
|
||||
} while (0);
|
||||
|
||||
ab::finalize();
|
||||
return 0;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
MaxThreadsPerBlock: 256
|
||||
MaxBlockDimX: 256
|
||||
MaxBlockDimY: 256
|
||||
MaxBlockDimZ: 256
|
||||
MaxThreadsPerBlock: 1024
|
||||
MaxBlockDimX: 1024
|
||||
MaxBlockDimY: 1024
|
||||
MaxBlockDimZ: 1024
|
||||
MaxSharedMemoryPerBlock: 65536
|
||||
WarpSize: 32
|
||||
ClockRate: 1000000
|
||||
|
|
Загрузка…
Ссылка в новой задаче