зеркало из https://github.com/Azure/aks-gpu.git
support grid drivers, add 515 cuda drivers (#3)
This commit is contained in:
Родитель
8e58eb36d8
Коммит
4e74c8d456
|
@ -6,11 +6,12 @@ on:
|
|||
workflow_dispatch: {}
|
||||
|
||||
jobs:
|
||||
build:
|
||||
cuda:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["470.57.02", "510.47.03"]
|
||||
driver_version: ["470.82.01", "510.47.03", "515.65.01"]
|
||||
driver_kind: ["cuda"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
|
@ -27,7 +28,7 @@ jobs:
|
|||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
|
@ -38,7 +39,87 @@ jobs:
|
|||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
- name: Move cache
|
||||
run: |
|
||||
rm -r /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
grid470:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["470.82.01"]
|
||||
driver_kind: ["grid"]
|
||||
driver_url: ["https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
|
||||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
echo "version is ${{ steps.semver.outputs.version }}"
|
||||
echo "version is ${{ steps.semver.outputs.version_tag }}"
|
||||
- name: 'Build and Push'
|
||||
run: |
|
||||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_URL=${{ matrix.driver_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
- name: Move cache
|
||||
run: |
|
||||
rm -r /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
grid510:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["510.73.08"]
|
||||
driver_kind: ["grid"]
|
||||
driver_url: ["https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
|
||||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
echo "version is ${{ steps.semver.outputs.version }}"
|
||||
echo "version is ${{ steps.semver.outputs.version_tag }}"
|
||||
- name: 'Build and Push'
|
||||
run: |
|
||||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_URL=${{ matrix.driver_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
- name: Move cache
|
||||
run: |
|
||||
|
|
|
@ -9,11 +9,12 @@ permissions:
|
|||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
cuda:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["470.57.02", "510.47.03"]
|
||||
driver_version: ["470.82.01", "510.47.03", "515.65.01"]
|
||||
driver_kind: ["cuda"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
|
@ -30,29 +31,103 @@ jobs:
|
|||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
echo "version is ${{ steps.semver.outputs.version }}"
|
||||
echo "version is ${{ steps.semver.outputs.version_tag }}"
|
||||
- name: 'Azure CLI login'
|
||||
uses: azure/login@v1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
|
||||
- name: 'Build and Push'
|
||||
run: |
|
||||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
|
||||
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }}
|
||||
echo "acr push done"
|
||||
- name: Move cache
|
||||
run: |
|
||||
rm -r /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
grid470:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["470.82.01"]
|
||||
driver_kind: ["grid"]
|
||||
driver_url: ["https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
|
||||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
echo "version is ${{ steps.semver.outputs.version }}"
|
||||
echo "version is ${{ steps.semver.outputs.version_tag }}"
|
||||
- name: 'Build and Push'
|
||||
run: |
|
||||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_URL=${{ matrix.driver_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
- name: Move cache
|
||||
run: |
|
||||
rm -r /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
grid510:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
driver_version: ["510.73.08"]
|
||||
driver_kind: ["grid"]
|
||||
driver_url: ["https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
|
||||
- uses: paulhatch/semantic-version@v5.0.0-alpha2
|
||||
with:
|
||||
bump_each_commit: false
|
||||
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
|
||||
id: semver
|
||||
- name: 'Check version'
|
||||
run: |
|
||||
echo "version is ${{ steps.semver.outputs.version }}"
|
||||
echo "version is ${{ steps.semver.outputs.version_tag }}"
|
||||
- name: 'Build and Push'
|
||||
run: |
|
||||
set -x
|
||||
echo "tag is: "
|
||||
echo ${{ steps.semver.outputs.version }}
|
||||
docker buildx build --build-arg DRIVER_URL=${{ matrix.driver_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
|
||||
docker images
|
||||
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
|
||||
docker images
|
||||
# docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }}
|
||||
- name: Move cache
|
||||
run: |
|
||||
rm -r /tmp/.buildx-cache
|
||||
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
[plugins."io.containerd.grpc.v1.cri"]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
default_runtime_name = "nvidia-container-runtime"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime]
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime.options]
|
||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted]
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options]
|
||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
|
@ -5,8 +5,11 @@ FROM mcr.microsoft.com/mirror/docker/library/ubuntu:${distro} as gpu
|
|||
RUN apt update && apt install -y curl xz-utils gnupg2 ca-certificates gettext-base --no-install-recommends
|
||||
|
||||
ARG DRIVER_VERSION
|
||||
ARG DRIVER_URL
|
||||
ARG DRIVER_KIND="cuda"
|
||||
|
||||
WORKDIR /opt/gpu
|
||||
COPY 10-nvidia-runtime.toml 10-nvidia-runtime.toml
|
||||
COPY blacklist-nouveau.conf blacklist-nouveau.conf
|
||||
COPY fm_run_package_installer.sh fm_run_package_installer.sh
|
||||
COPY config.sh config.sh
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
DRIVER_VERSION="${DRIVER_VERSION}"
|
||||
DRIVER_KIND="${DRIVER_KIND}"
|
||||
NVIDIA_DOCKER_VERSION="2.8.0-1"
|
||||
NVIDIA_CONTAINER_RUNTIME_VERSION="3.11.0"
|
||||
NVIDIA_CONTAINER_TOOLKIT_VER="1.11.0"
|
||||
|
|
32
download.sh
32
download.sh
|
@ -7,21 +7,33 @@ source /opt/gpu/config.sh
|
|||
workdir="$(mktemp -d)"
|
||||
pushd "$workdir" || exit
|
||||
|
||||
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
|
||||
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}"
|
||||
curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${RUNFILE}.run
|
||||
elif [[ "${DRIVER_KIND}" == "grid" ]]; then
|
||||
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
|
||||
curl -fsSLO "${DRIVER_URL}"
|
||||
else
|
||||
echo "Invalid driver kind: ${DRIVER_KIND}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# download nvidia drivers, move to permanent cache
|
||||
curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
|
||||
mv NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
|
||||
# TODO: reenable this, it saves like 30sec. but it pushes vhd to capacity and starts to fail image pulls :(
|
||||
mv ${RUNFILE}.run /opt/gpu/${RUNFILE}.run
|
||||
pushd /opt/gpu
|
||||
# extract runfile, takes some time, so do ahead of time
|
||||
sh /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run -x
|
||||
rm /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
|
||||
sh /opt/gpu/${RUNFILE}.run -x
|
||||
rm /opt/gpu/${RUNFILE}.run
|
||||
popd
|
||||
|
||||
# download fabricmanager for nvlink based systems, e.g. multi instance gpu vms.
|
||||
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
|
||||
tar -xvf fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
|
||||
mv fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}
|
||||
mv /opt/gpu/fm_run_package_installer.sh /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
|
||||
|
||||
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
|
||||
# download fabricmanager for nvlink based systems, e.g. multi instance gpu vms.
|
||||
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
|
||||
tar -xvf fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
|
||||
mv fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}
|
||||
mv /opt/gpu/fm_run_package_installer.sh /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
|
||||
fi
|
||||
|
||||
# configure nvidia apt repo to cache packages
|
||||
curl -fsSLO https://nvidia.github.io/nvidia-docker/gpgkey
|
||||
|
|
|
@ -5,6 +5,8 @@ set -o nounset
|
|||
|
||||
set -x
|
||||
|
||||
sleep="${2:-}"
|
||||
|
||||
if [[ -z "${1}" ]]; then
|
||||
echo "Must provide a non-empty action as first argument"
|
||||
exit 1
|
||||
|
@ -51,3 +53,11 @@ else
|
|||
echo "Failed during nsenter command execution"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "${sleep}" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Sleeping forever"
|
||||
|
||||
sleep infinity
|
||||
|
|
32
install.sh
32
install.sh
|
@ -8,8 +8,17 @@ PS4='+ $(date -u -I"seconds" | cut -c1-19) '
|
|||
KERNEL_NAME=$(uname -r)
|
||||
LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log"
|
||||
|
||||
set +euo pipefail
|
||||
open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
|
||||
echo "Open devices: $open_devices"
|
||||
|
||||
open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
|
||||
echo "Open gridd: $open_gridd"
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# host needs these tools to build and load kernel module, can remove ca-certificates, was only for testing
|
||||
apt update && apt install -y kmod gcc make dkms initramfs-tools ca-certificates linux-headers-$(uname -r) --no-install-recommends
|
||||
apt install -y kmod gcc make dkms initramfs-tools ca-certificates linux-headers-$(uname -r) --no-install-recommends
|
||||
|
||||
# install cached nvidia debian packages for container runtime compatibility
|
||||
for apt_package in $NVIDIA_PACKAGES; do
|
||||
|
@ -25,7 +34,7 @@ update-initramfs -u
|
|||
set +e
|
||||
umount -l /usr/lib/x86_64-linux-gnu || true
|
||||
umount -l /tmp/overlay || true
|
||||
rm -r /tmp/overlay
|
||||
rm -r /tmp/overlay || true
|
||||
set -e
|
||||
|
||||
# set up overlayfs to change install location of nvidia libs from /usr/lib/x86_64-linux-gnu to /usr/local/nvidia
|
||||
|
@ -36,9 +45,18 @@ mkdir /tmp/overlay/{workdir,lib64}
|
|||
mkdir -p ${GPU_DEST}/lib64
|
||||
mount -t overlay overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/x86_64-linux-gnu
|
||||
|
||||
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
|
||||
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}"
|
||||
elif [[ "${DRIVER_KIND}" == "grid" ]]; then
|
||||
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
|
||||
else
|
||||
echo "Invalid driver kind: ${DRIVER_KIND}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# install nvidia drivers
|
||||
pushd /opt/gpu
|
||||
/opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}"
|
||||
/opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}"
|
||||
popd
|
||||
|
||||
# move nvidia libs to correct location from temporary overlayfs
|
||||
|
@ -66,7 +84,11 @@ nvidia-smi
|
|||
cp -r /opt/gpu/nvidia-docker2_${NVIDIA_DOCKER_VERSION}/* /usr/
|
||||
|
||||
# install fabricmanager for nvlink based systems
|
||||
bash /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
|
||||
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
|
||||
bash /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
|
||||
fi
|
||||
|
||||
mkdir -p /etc/containerd/config.d
|
||||
cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
|
||||
|
||||
du -hs /opt/gpu
|
||||
rm -r /opt/gpu
|
||||
|
|
31
justfile
31
justfile
|
@ -1,13 +1,26 @@
|
|||
nv_470_driver := "470.57.02"
|
||||
nv_510_driver := "510.47.03"
|
||||
grid_470_url := "https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"
|
||||
grid_510_url := "https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"
|
||||
grid_510_driver := "510.73.08"
|
||||
grid_470_driver := "470.82.01"
|
||||
cuda_510_driver := "510.47.03"
|
||||
cuda_470_driver := "470.82.01"
|
||||
cuda_515_driver := "515.65.01"
|
||||
registry := "docker.io/alexeldeib"
|
||||
|
||||
default: push
|
||||
default:
|
||||
|
||||
push: (build)
|
||||
docker push {{ registry }}/aks-gpu:{{ nv_470_driver }}
|
||||
docker push {{ registry }}/aks-gpu:{{ nv_510_driver }}
|
||||
pushallcuda: (pushcuda cuda_515_driver) (pushcuda cuda_510_driver) (pushcuda cuda_470_driver)
|
||||
|
||||
build:
|
||||
docker build --build-arg DRIVER_VERSION={{ nv_470_driver }} -f Dockerfile -t {{ registry }}/aks-gpu:{{ nv_470_driver }} .
|
||||
docker build --build-arg DRIVER_VERSION={{ nv_510_driver }} -f Dockerfile -t {{ registry }}/aks-gpu:{{ nv_510_driver }} .
|
||||
pushallgrid: (pushgrid grid_510_driver grid_510_url) #(pushgrid grid_470_driver grid_470_url)
|
||||
|
||||
pushcuda VERSION: (buildcuda VERSION)
|
||||
docker push {{ registry }}/aks-gpu:{{VERSION}}-cuda
|
||||
|
||||
pushgrid VERSION URL: (buildgrid VERSION URL)
|
||||
docker push {{ registry }}/aks-gpu:{{VERSION}}-grid
|
||||
|
||||
buildgrid VERSION URL:
|
||||
docker build --build-arg DRIVER_URL={{URL}} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION={{VERSION}} -f Dockerfile -t {{ registry }}/aks-gpu:{{VERSION}}-grid .
|
||||
|
||||
buildcuda VERSION:
|
||||
docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION={{VERSION}} -f Dockerfile -t {{ registry }}/aks-gpu:{{VERSION}}-cuda .
|
||||
|
|
|
@ -21,17 +21,15 @@ spec:
|
|||
# - key: node.kubernetes.io/instance-type
|
||||
# operator: In
|
||||
# values:
|
||||
# - Standard_NP10s
|
||||
# - Standard_NP20s
|
||||
# - Standard_NP40s
|
||||
# - Standard_NV6ads_A10_v5
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
containers:
|
||||
- image: docker.io/alexeldeib/aks-gpu:latest # requires an image with bash, curl, sleep, and nsenter (vanilla ubuntu works)
|
||||
- image: mcr.microsoft.com/aks/aks-gpu:${TAG}
|
||||
imagePullPolicy: Always
|
||||
name: *name
|
||||
command: ["/entrypoint.sh"]
|
||||
args: ["install.sh"] # if you don't use my image or build one from Dockerfile, set this to "downloadandinstall"
|
||||
args: ["install", "sleep"]
|
||||
resources:
|
||||
requests:
|
||||
{}
|
||||
|
@ -40,8 +38,6 @@ spec:
|
|||
securityContext:
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- name: actions
|
||||
mountPath: "/opt/actions"
|
||||
- name: hostmount
|
||||
mountPath: "/mnt/actions"
|
||||
- name: gpu
|
||||
|
@ -55,7 +51,4 @@ spec:
|
|||
hostPath:
|
||||
path: /opt/actions
|
||||
type: DirectoryOrCreate
|
||||
- name: actions
|
||||
configMap:
|
||||
name: nsenter-actions
|
||||
---
|
Загрузка…
Ссылка в новой задаче