From 54f99b128407c222cb4e2547d1d9635a8e6ca9f2 Mon Sep 17 00:00:00 2001 From: ghostplant Date: Thu, 4 Mar 2021 23:26:10 +0800 Subject: [PATCH] add new backend type: c-ocl_nvidia (#194) --- backends/c-ocl_amdgpu/include/backend.hpp | 5 ++++- backends/c-ocl_nvidia | 1 + graph_evaluator/run_graph.cpp | 10 ++++++---- hardware/OCL_DEFAULT.cfg | 8 ++++---- 4 files changed, 15 insertions(+), 9 deletions(-) create mode 120000 backends/c-ocl_nvidia diff --git a/backends/c-ocl_amdgpu/include/backend.hpp b/backends/c-ocl_amdgpu/include/backend.hpp index 3b454c6..3918ec3 100644 --- a/backends/c-ocl_amdgpu/include/backend.hpp +++ b/backends/c-ocl_amdgpu/include/backend.hpp @@ -2,6 +2,7 @@ // Licensed under the MIT License. //; eval_flags(c-ocl_amdgpu): -I/opt/rocm/opencl/include -L/opt/rocm/opencl/lib -lOpenCL -DCL_TARGET_OPENCL_VERSION=120 +//; eval_flags(c-ocl_nvidia): -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lOpenCL #include @@ -10,6 +11,7 @@ namespace ab { static cl_command_queue cmdqueue; static cl_device_id device_id; static cl_int stat; + static size_t max_work_group_size; void init(int dev) { cl_uint num_dev; @@ -28,7 +30,6 @@ namespace ab { cmdqueue = clCreateCommandQueue(context, device_id, 0, &stat), CHECK_OK(stat == 0); std::vector dev_name(1024); - size_t max_work_group_size = 0; CHECK_OK(0 == clGetDeviceInfo(device_id, CL_DEVICE_NAME, dev_name.size(), dev_name.data(), NULL)); CHECK_OK(0 == clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL)); fprintf(stderr, " (OCL_INFO: OCL Device Name = %s [max_work_groups: %zd])\n", dev_name.data(), max_work_group_size); @@ -74,6 +75,8 @@ namespace ab { }; size_t lx = query("get_local_id(0)"), ly = query("get_local_id(1)"), lz = query("get_local_id(2)"); size_t gx = query("get_group_id(0)"), gy = query("get_group_id(1)"), gz = query("get_group_id(2)"); + + CHECK_OK(lx * ly * lz <= max_work_group_size); return { kernel, (void*)(gx * lx), (void*)(gy * ly), (void*)(gz * lz), (void*)lx, (void*)ly, (void*)lz }; } diff --git a/backends/c-ocl_nvidia b/backends/c-ocl_nvidia new file mode 120000 index 0000000..fd15f7a --- /dev/null +++ b/backends/c-ocl_nvidia @@ -0,0 +1 @@ +c-ocl_amdgpu \ No newline at end of file diff --git a/graph_evaluator/run_graph.cpp b/graph_evaluator/run_graph.cpp index 4d1af4c..caa8e50 100644 --- a/graph_evaluator/run_graph.cpp +++ b/graph_evaluator/run_graph.cpp @@ -74,7 +74,7 @@ int main(int argc, char** argv) printf("\n- K/%d: %.10e\n", i, digest); } - { + do { auto x = ab::recordTime(); gm.compute(global_args.data()); auto y = ab::recordTime(); @@ -82,8 +82,10 @@ int main(int argc, char** argv) double tpr = ab::convertToElapsedTime(x, y); const char *expected_timeout = getenv("EXPECTED_TIMEOUT"); - if (expected_timeout && *expected_timeout && tpr > std::atof(expected_timeout)) - throw std::runtime_error(("Time limit exceeded: " + std::to_string(tpr) + " v.s. (expected) " + expected_timeout).c_str()); + if (expected_timeout && *expected_timeout && tpr > std::atof(expected_timeout)) { + printf("\n- TPR: %g\n", tpr); + break; + } int num_runs = (int)std::max(1LU, std::min(10000LU, (unsigned long)(1.0 / tpr))); tpr = 0.0f; @@ -93,7 +95,7 @@ int main(int argc, char** argv) y = ab::recordTime(); tpr = ab::convertToElapsedTime(x, y) / num_runs; printf("\n- TPR: %g\n", tpr); - } + } while (0); ab::finalize(); return 0; diff --git a/hardware/OCL_DEFAULT.cfg b/hardware/OCL_DEFAULT.cfg index f02f05d..e1f0183 100644 --- a/hardware/OCL_DEFAULT.cfg +++ b/hardware/OCL_DEFAULT.cfg @@ -1,7 +1,7 @@ -MaxThreadsPerBlock: 256 -MaxBlockDimX: 256 -MaxBlockDimY: 256 -MaxBlockDimZ: 256 +MaxThreadsPerBlock: 1024 +MaxBlockDimX: 1024 +MaxBlockDimY: 1024 +MaxBlockDimZ: 1024 MaxSharedMemoryPerBlock: 65536 WarpSize: 32 ClockRate: 1000000