onnxruntime/tools/ci_build/amd_hipify.py

209 строки
9.9 KiB
Python

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import os
import subprocess
def hipify(hipify_perl_path, src_file_path, dst_file_path):
dir_name = os.path.dirname(dst_file_path)
if not os.path.exists(dir_name):
os.makedirs(dir_name, exist_ok=True)
# Run hipify-perl first, capture output
s = subprocess.run([hipify_perl_path, "-roc", src_file_path], stdout=subprocess.PIPE, text=True, check=False).stdout
# Additional exact-match replacements.
# Order matters for all of the following replacements, reglardless of appearing in logical sections.
s = s.replace("kCudaExecutionProvider", "kRocmExecutionProvider")
s = s.replace("CUDAStreamType", "HIPStreamType")
s = s.replace("kCudaStreamDefault", "kHipStreamDefault")
s = s.replace("kCudaStreamCopyIn", "kHipStreamCopyIn")
s = s.replace("kCudaStreamCopyOut", "kHipStreamCopyOut")
s = s.replace("kTotalCudaStreams", "kTotalHipStreams")
# We want rocblas interfaces, not hipblas. Also force some hipify replacements back to rocblas from hipblas.
s = s.replace("CublasHandle", "RocblasHandle")
s = s.replace("cublas_handle", "rocblas_handle")
s = s.replace("hipblasHandle_t", "rocblas_handle")
s = s.replace("hipblasDatatype_t", "rocblas_datatype")
s = s.replace("HIPBLAS_STATUS_SUCCESS", "rocblas_status_success")
s = s.replace("hipblasStatus_t", "rocblas_status")
s = s.replace("hipblasCreate", "rocblas_create_handle")
s = s.replace("hipblasDestroy", "rocblas_destroy_handle")
s = s.replace("hipblasSetStream", "rocblas_set_stream")
s = s.replace("HIPBLAS_OP_T", "rocblas_operation_transpose")
s = s.replace("HIPBLAS_OP_N", "rocblas_operation_none")
# in rocm 6.0, hipify-perl, the -roc option also maps __half -> rocblas_half which we don't want
s = s.replace("rocblas_half", "__half")
s = s.replace("RegisterCudaContribKernels", "RegisterRocmContribKernels")
s = s.replace("cudaEvent", "hipEvent")
s = s.replace("CreateCudaAllocator", "CreateRocmAllocator")
s = s.replace("CudaErrString", "RocmErrString")
s = s.replace("CudaAsyncBuffer", "RocmAsyncBuffer")
s = s.replace("CudaKernel", "RocmKernel")
s = s.replace("CudaStream", "RocmStream")
s = s.replace("ToCudaType", "ToHipType")
s = s.replace("CudaT", "HipT")
s = s.replace("CUDA_LONG", "HIP_LONG")
s = s.replace("CUDA_RETURN_IF_ERROR", "HIP_RETURN_IF_ERROR")
s = s.replace("CUDA_KERNEL_ASSERT", "HIP_KERNEL_ASSERT")
s = s.replace("CUDA_CALL", "HIP_CALL")
s = s.replace("SliceCuda", "SliceRocm")
s = s.replace("thrust::cuda", "thrust::hip")
s = s.replace("CudaCall", "RocmCall")
s = s.replace("cuda", "rocm")
# s = s.replace('Cuda', 'Rocm')
s = s.replace("CUDA", "ROCM")
s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64")
s = s.replace("std::exp", "expf")
s = s.replace("std::log", "logf")
s = s.replace("WaitCudaNotificationOnDevice", "WaitRocmNotificationOnDevice")
s = s.replace("hipHostAlloc", "hipHostMalloc")
s = s.replace(
"#include <cub/device/device_radix_sort.cuh>",
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
)
s = s.replace(
'#include "cub/device/device_radix_sort.cuh"',
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
)
s = s.replace(
"#include <cub/device/device_segmented_radix_sort.cuh>",
"#include <hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp>",
)
s = s.replace(
"#include <cub/device/device_reduce.cuh>", "#include <hipcub/backend/rocprim/device/device_reduce.hpp>"
)
s = s.replace(
"#include <cub/device/device_run_length_encode.cuh>",
"#include <hipcub/backend/rocprim/device/device_run_length_encode.hpp>",
)
s = s.replace("#include <cub/device/device_scan.cuh>", "#include <hipcub/backend/rocprim/device/device_scan.hpp>")
s = s.replace(
"#include <cub/iterator/counting_input_iterator.cuh>",
"#include <hipcub/backend/rocprim/iterator/counting_input_iterator.hpp>",
)
s = s.replace(
"#include <cub/iterator/discard_output_iterator.cuh>",
"#include <hipcub/backend/rocprim/iterator/discard_output_iterator.hpp>",
)
s = s.replace("#include <cub/util_allocator.cuh>", "#include <hipcub/util_allocator.hpp>")
s = s.replace('#include "cub/util_allocator.cuh"', "#include <hipcub/util_allocator.hpp>")
s = s.replace("#include <cub/util_type.cuh>", "#include <hipcub/backend/rocprim/util_type.hpp>")
s = s.replace('#include "cub/util_type.cuh"', "#include <hipcub/backend/rocprim/util_type.hpp>")
s = s.replace("#include <cub/device/device_partition.cuh>", "#include <hipcub/device/device_partition.hpp>")
s = s.replace("#include <math_constants.h>", "#include <limits>")
s = s.replace("#include <library_types.h>", "") # Doesn't exist
s = s.replace("typedef half MappedType", "typedef __half MappedType")
# CUBLAS -> HIPBLAS
# Note: We do not use the hipblas marshalling interfaces; use rocblas instead.
# s = s.replace('CUBLAS', 'HIPBLAS')
# s = s.replace('Cublas', 'Hipblas')
# s = s.replace('cublas', 'hipblas')
# CUBLAS -> ROCBLAS
s = s.replace("CUBLAS", "ROCBLAS")
s = s.replace("Cublas", "Rocblas")
s = s.replace("cublas", "rocblas")
# Undefined ROCMRT constants -> std::numeric_limits
s = s.replace("ROCMRT_INF_F", "std::numeric_limits<float>::infinity()")
# HIPBLAS -> rocblas
s = s.replace("HIPBLAS_R_16F", "rocblas_datatype_f16_r")
s = s.replace("HIPBLAS_R_32F", "rocblas_datatype_f32_r")
s = s.replace("ROCBLAS_GEMM_DEFAULT_TENSOR_OP", "rocblas_gemm_algo_standard")
# compatible layer
s = s.replace("rocblas_gemm_strided_batched_ex", "_compat_rocblas_gemm_strided_batched_ex")
s = s.replace("RocblasMathModeSetter", "CompatRocblasMathModeSetter")
# CURAND -> HIPRAND
s = s.replace("CURAND", "HIPRAND")
s = s.replace("Curand", "Hiprand")
s = s.replace("curand", "hiprand")
# NCCL -> RCCL
# s = s.replace('NCCL_CALL', 'RCCL_CALL')
s = s.replace("#include <nccl.h>", "#include <rccl/rccl.h>")
# CUDNN -> MIOpen
s = s.replace("CUDNN", "MIOPEN")
s = s.replace("Cudnn", "Miopen")
s = s.replace("cudnn", "miopen")
# hipify seems to have a bug for MIOpen, cudnn.h -> hipDNN.h, cudnn -> hipdnn
s = s.replace("#include <hipDNN.h>", "#include <miopen/miopen.h>")
s = s.replace("hipdnn", "miopen")
s = s.replace("HIPDNN_STATUS_SUCCESS", "miopenStatusSuccess")
s = s.replace("HIPDNN", "MIOPEN")
s = s.replace("MIOPEN_BATCHNORM_SPATIAL", "miopenBNSpatial")
s = s.replace("MIOPEN_BATCHNORM_PER_ACTIVATION", "miopenBNPerActivation")
s = s.replace("MIOPEN_LRN_CROSS_CHANNEL", "miopenLRNCrossChannel")
s = s.replace("MIOPEN_POOLING_MAX", "miopenPoolingMax")
s = s.replace("MIOPEN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", "miopenPoolingAverageInclusive")
s = s.replace("MIOPEN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", "miopenPoolingAverage")
# CUSPARSE -> HIPSPARSE
s = s.replace("CUSPARSE", "HIPSPARSE")
# CUFFT -> HIPFFT
s = s.replace("CUFFT", "HIPFFT")
s = s.replace("cufftXtMakePlanMany", "hipfftXtMakePlanMany")
s = s.replace("cufftXtExec", "hipfftXtExec")
# Undo where above hipify steps went too far.
s = s.replace("id, ROCM", "id, CUDA") # cuda_execution_provider.cc
s = s.replace("ROCM error executing", "HIP error executing")
s = s.replace("ROCM_PINNED", "CUDA_PINNED")
s = s.replace("rocm_err", "hip_err")
s = s.replace("RegisterHipTrainingKernels", "RegisterRocmTrainingKernels")
s = s.replace("ROCM_VERSION", "CUDA_VERSION") # semantically different meanings, cannot hipify
s = s.replace("__ROCM_ARCH__", "__CUDA_ARCH__") # semantically different meanings, cannot hipify
# "std::log" above incorrectly changed "std::logic_error" to "logfic_error"
s = s.replace("logfic_error", "std::logic_error")
# Deletions
s = s.replace('#include "device_atomic_functions.h"', "") # HIP atomics in main hip header already
# Fix warnings due to incorrect header paths, intentionally after all other hipify steps.
s = s.replace("#include <hiprand_kernel.h>", "#include <hiprand/hiprand_kernel.h>")
s = s.replace("#include <rocblas.h>", "#include <rocblas/rocblas.h>")
s = s.replace("#include <hipblas.h>", "#include <hipblas/hipblas.h>")
s = s.replace("#include <hipfft.h>", "#include <hipfft/hipfft.h>")
s = s.replace('#include "hipfft.h"', "#include <hipfft/hipfft.h>")
s = s.replace('#include "hipfftXt.h"', "#include <hipfft/hipfftXt.h>")
# Fix onnxruntime/contrib_ops/rocm/transformers. They include cpu headers which use "cuda" in their names.
s = s.replace("rocm_device_prop_", "cuda_device_prop_")
s = s.replace("rocm_device_arch_", "cuda_device_arch_")
s = s.replace("HipTuningContext", "RocmTuningContext")
# We want hipfft, which needs hipDataType etc, but only do this for files that have "fft" in their names
# And we do this last, undoing or fixing hipify mistakes.
if "fft" in src_file_path:
s = s.replace("rocblas_datatype", "hipDataType")
s = s.replace("hipDataType_f32_c", "HIP_C_32F")
s = s.replace("hipDataType_f32_r", "HIP_R_32F")
s = s.replace("hipDataType_f64_c", "HIP_C_64F")
s = s.replace("hipDataType_f64_r", "HIP_R_64F")
s = s.replace("hipDataType_f16_c", "HIP_C_16F")
s = s.replace("hipDataType_f16_r", "HIP_R_16F")
with open(dst_file_path, "w") as f:
f.write(s)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--hipify_perl", required=True)
parser.add_argument("--output", "-o", help="output file")
parser.add_argument("src", help="src")
args = parser.parse_args()
hipify(args.hipify_perl, args.src, args.output)