Dockerfile - Upgrade Docker image to CUDA 12.2 (#577)

Upgrade Docker image to CUDA 12.2 for H100:
* upgrade base image to 23.10
* fix onnxruntime version in python3.10
* fix compilation errors
This commit is contained in:
Yifan Xiong 2023-11-22 21:48:18 +08:00 committed by GitHub
Parent 2235e084ab
Commit 1ad1c21c38
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 25 additions and 23 deletions

6
.github/workflows/build-image.yml vendored
View file

@@ -24,9 +24,9 @@ jobs:
strategy:
matrix:
include:
- name: cuda12.1
dockerfile: cuda12.1
tags: superbench/main:cuda12.1
- name: cuda12.2
dockerfile: cuda12.2
tags: superbench/main:cuda12.2
- name: cuda11.1.1
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest

View file

@@ -1,16 +1,16 @@
FROM nvcr.io/nvidia/pytorch:23.03-py3
FROM nvcr.io/nvidia/pytorch:23.10-py3
# OS:
# - Ubuntu: 20.04
# - OpenMPI: 4.1.5a1
# - Ubuntu: 22.04
# - OpenMPI: 4.1.5rc2
# - Docker Client: 20.10.8
# NVIDIA:
# - CUDA: 12.1.0
# - cuDNN: 8.8.1.3
# - NCCL: v2.17.1-1
# - CUDA: 12.2.2
# - cuDNN: 8.9.5
# - NCCL: v2.19.3-1
# Mellanox:
# - OFED: 5.2-2.2.3.0 # TODO
# - HPC-X: v2.14
# - OFED: 23.07-0.5.1.2
# - HPC-X: v2.16
# Intel:
# - mlc: v3.10
@@ -74,20 +74,20 @@ RUN mkdir -p /root/.ssh && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
# Install OFED
ENV OFED_VERSION=5.2-2.2.3.0
ENV OFED_VERSION=23.07-0.5.1.2
RUN cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X
ENV HPCX_VERSION=v2.14
ENV HPCX_VERSION=v2.16
RUN cd /opt && \
rm -rf hpcx && \
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64 hpcx && \
rm hpcx.tbz
# Install Intel MLC
@@ -131,7 +131,8 @@ ADD third_party third_party
RUN make -C third_party cuda
ADD . .
RUN python3 -m pip install --no-cache-dir .[nvworker] && \
RUN python3 -m pip install --upgrade setuptools==65.7 && \
python3 -m pip install --no-cache-dir .[nvworker] && \
make cppbuild && \
make postinstall && \
rm -rf .git

View file

@@ -213,7 +213,8 @@ setup(
],
'ort': [
'onnx>=1.10.2',
'onnxruntime-gpu==1.10.0',
'onnxruntime-gpu==1.10.0; python_version<"3.10"',
'onnxruntime-gpu; python_version>="3.10"',
],
'nvidia': ['py3nvml>=0.2.6'],
}

View file

@@ -366,8 +366,8 @@ void CublasFunction::matrix_calculation_on_cpu_with_data(const T1 *Parameter_0_0
for (int j = 0; j < n; j++) {
(*Result_cpu)[i + j * m + b * m * n] = beta * (T2)(Result_3_0_host[i + j * m + b * m * n]);
for (int p = 0; p < k; p++) {
(*Result_cpu)[i + j * m + b * m * n] +=
Parameter_0_0_host_op[p * m + i + b * m * k] * Parameter_1_0_host_op[j * k + p + b * k * n];
(*Result_cpu)[i + j * m + b * m * n] += (T2)(Parameter_0_0_host_op[p * m + i + b * m * k] *
Parameter_1_0_host_op[j * k + p + b * k * n]);
(*Result_cpu)[i + j * m + b * m * n] *= alpha;
}
}
@@ -444,7 +444,7 @@ int CublasFunction::check_result(int batch_count, T1 *Result_3_0, T2 *Result_cpu
// |<x, y>_cpu - <x,y>_gpu|/|<x, y>_cpu|/dot_length < eps
int error_count = 0;
for (int i = 0; i < static_cast<int>(m * n) * batch_count; i++) {
double abs_err = fabs(Result_cpu[i] - Result_3_0_host[i]);
double abs_err = fabs(Result_cpu[i] - (T2)(Result_3_0_host[i]));
double dot_length = k;
double abs_val = fabs(Result_cpu[i]);
double rel_err = abs_err / abs_val / dot_length;