зеркало из https://github.com/microsoft/hat.git
Some more cuda compiler changes
This commit is contained in:
Родитель
7ed9a2c615
Коммит
4872459307
|
@ -3,12 +3,7 @@ import pathlib
|
||||||
import sys
|
import sys
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
# CUDA stuff
|
|
||||||
# TODO: move from the pynvrtc module to the cuda package entirely to reduce dependencies
|
|
||||||
from pynvrtc.compiler import Program
|
|
||||||
from cuda import cuda, nvrtc
|
from cuda import cuda, nvrtc
|
||||||
|
|
||||||
from .arg_info import ArgInfo, verify_args
|
from .arg_info import ArgInfo, verify_args
|
||||||
from .callable_func import CallableFunc
|
from .callable_func import CallableFunc
|
||||||
from .gpu_headers import CUDA_HEADER_MAP
|
from .gpu_headers import CUDA_HEADER_MAP
|
||||||
|
@ -43,16 +38,44 @@ def _find_cuda_incl_path() -> pathlib.Path:
|
||||||
|
|
||||||
return cuda_path
|
return cuda_path
|
||||||
|
|
||||||
|
|
||||||
def compile_cuda_program(cuda_src_path: pathlib.Path, func_name):
|
def compile_cuda_program(cuda_src_path: pathlib.Path, func_name):
|
||||||
src = cuda_src_path.read_text()
|
src = cuda_src_path.read_text()
|
||||||
|
|
||||||
prog = Program(src=src, name=func_name, headers=CUDA_HEADER_MAP.values(), include_names=CUDA_HEADER_MAP.keys())
|
opts = [
|
||||||
ptx = prog.compile([
|
# https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
|
||||||
'-use_fast_math',
|
b'--gpu-architecture=compute_86',
|
||||||
'-default-device',
|
b'--ptxas-options=--warn-on-spills', # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-passing-specific-phase-options-ptxas-options
|
||||||
'-std=c++11'
|
b'-use_fast_math',
|
||||||
])
|
b'--include-path=/usr/local/cuda-11.6/targets/x86_64-linux/include/',
|
||||||
|
b'-std=c++17',
|
||||||
|
b'-default-device',
|
||||||
|
#b'--restrict',
|
||||||
|
#b'--device-int128'
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create program
|
||||||
|
err, prog = nvrtc.nvrtcCreateProgram(str.encode(src), func_name.encode('utf-8'), 0, [], [])
|
||||||
|
ASSERT_DRV(err)
|
||||||
|
|
||||||
|
# Compile program
|
||||||
|
err = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
|
||||||
|
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
|
||||||
|
err, log_size = nvrtc.nvrtcGetProgramLogSize(prog)
|
||||||
|
ASSERT_DRV(err)
|
||||||
|
|
||||||
|
log = "0" * log_size
|
||||||
|
e_log = log.encode('utf-8')
|
||||||
|
err = nvrtc.nvrtcGetProgramLog(prog, e_log)
|
||||||
|
print(e_log.decode('utf-8'))
|
||||||
|
|
||||||
|
# Get PTX from compilation
|
||||||
|
err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
|
||||||
|
ASSERT_DRV(err)
|
||||||
|
ptx = b" " * ptxSize
|
||||||
|
err = nvrtc.nvrtcGetPTX(prog, ptx)
|
||||||
|
|
||||||
|
# prog = Program(src=src, name=func_name)
|
||||||
|
# ptx = prog.compile(opts)
|
||||||
|
|
||||||
return ptx
|
return ptx
|
||||||
|
|
||||||
|
@ -73,8 +96,9 @@ def initialize_cuda():
|
||||||
|
|
||||||
|
|
||||||
def get_func_from_ptx(ptx, func_name):
|
def get_func_from_ptx(ptx, func_name):
|
||||||
# Note: Incompatible --gpu-architecture would be detected here
|
# Load PTX as module data and retrieve function
|
||||||
err, ptx_mod = cuda.cuModuleLoadData(ptx.encode('utf-8'))
|
ptx = np.char.array(ptx)
|
||||||
|
err, ptx_mod = cuda.cuModuleLoadData(ptx)
|
||||||
ASSERT_DRV(err)
|
ASSERT_DRV(err)
|
||||||
err, kernel = cuda.cuModuleGetFunction(ptx_mod, func_name.encode('utf-8'))
|
err, kernel = cuda.cuModuleGetFunction(ptx_mod, func_name.encode('utf-8'))
|
||||||
ASSERT_DRV(err)
|
ASSERT_DRV(err)
|
||||||
|
@ -82,57 +106,52 @@ def get_func_from_ptx(ptx, func_name):
|
||||||
return kernel
|
return kernel
|
||||||
|
|
||||||
|
|
||||||
def _cuda_transfer_mem(usage, func, source_args: List, dest_args: List, arg_infos: List[ArgInfo], stream=None):
|
def _cuda_transfer_mem(usage, func, source_args: List, dest_args: List, arg_infos: List[ArgInfo]):
|
||||||
for source_arg, dest_arg, arg_info in zip(source_args, dest_args, arg_infos):
|
for source_arg, dest_arg, arg_info in zip(source_args, dest_args, arg_infos):
|
||||||
if usage in arg_info.usage.value:
|
if usage in arg_info.usage.value:
|
||||||
if stream:
|
err, = func(dest_arg, source_arg, arg_info.total_byte_size)
|
||||||
err, = func(dest_arg, source_arg, arg_info.total_byte_size, stream)
|
|
||||||
else:
|
|
||||||
err, = func(dest_arg, source_arg, arg_info.total_byte_size)
|
|
||||||
ASSERT_DRV(err)
|
ASSERT_DRV(err)
|
||||||
|
|
||||||
|
|
||||||
def transfer_mem_host_to_cuda(device_args: List, host_args: List[np.array], arg_infos: List[ArgInfo], stream=None):
|
def transfer_mem_host_to_cuda(device_args: List, host_args: List[np.array], arg_infos: List[ArgInfo]):
|
||||||
_cuda_transfer_mem(
|
_cuda_transfer_mem(
|
||||||
usage='input',
|
usage='input',
|
||||||
func=cuda.cuMemCpyHtoDAsync if stream else cuda.cuMemcpyHtoD,
|
func=cuda.cuMemcpyHtoD,
|
||||||
source_args=[a.ctypes.data for a in host_args],
|
source_args=[a.ctypes.data for a in host_args],
|
||||||
dest_args=device_args,
|
dest_args=device_args,
|
||||||
arg_infos=arg_infos,
|
arg_infos=arg_infos
|
||||||
stream=stream
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def transfer_mem_cuda_to_host(device_args: List, host_args: List[np.array], arg_infos: List[ArgInfo], stream=None):
|
def transfer_mem_cuda_to_host(device_args: List, host_args: List[np.array], arg_infos: List[ArgInfo]):
|
||||||
_cuda_transfer_mem(
|
_cuda_transfer_mem(
|
||||||
usage='output',
|
usage='output',
|
||||||
func=cuda.cuMemcpyDtoHAsync if stream else cuda.cuMemcpyDtoH,
|
func=cuda.cuMemcpyDtoH,
|
||||||
source_args=device_args,
|
source_args=device_args,
|
||||||
dest_args=[a.ctypes.data for a in host_args],
|
dest_args=[a.ctypes.data for a in host_args],
|
||||||
arg_infos=arg_infos,
|
arg_infos=arg_infos
|
||||||
stream=stream
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def allocate_cuda_mem(arg_infos: List[ArgInfo], stream=None):
|
def allocate_cuda_mem(arg_infos: List[ArgInfo]):
|
||||||
device_mem = []
|
device_mem = []
|
||||||
|
|
||||||
for arg in arg_infos:
|
for arg in arg_infos:
|
||||||
size = arg.total_byte_size
|
size = arg.total_byte_size
|
||||||
err, mem = cuda.cuMemAllocAsync(size, stream) if stream else cuda.cuMemAlloc(size)
|
err, mem = cuda.cuMemAlloc(size)
|
||||||
try:
|
try:
|
||||||
ASSERT_DRV(err)
|
ASSERT_DRV(err)
|
||||||
except:
|
except:
|
||||||
free_cuda_mem(device_mem, stream)
|
free_cuda_mem(device_mem)
|
||||||
raise
|
raise
|
||||||
device_mem.append(mem)
|
device_mem.append(mem)
|
||||||
|
|
||||||
return device_mem
|
return device_mem
|
||||||
|
|
||||||
|
|
||||||
def free_cuda_mem(args, stream=None):
|
def free_cuda_mem(args):
|
||||||
for arg in args:
|
for arg in args:
|
||||||
cuda.cuMemFreeAsync(arg, stream) if stream else cuda.cuMemFree(arg)
|
cuda.cuMemFree(arg)
|
||||||
|
|
||||||
|
|
||||||
def device_args_to_ptr_list(device_args: List):
|
def device_args_to_ptr_list(device_args: List):
|
||||||
|
|
Загрузка…
Ссылка в новой задаче