support grid drivers, add 515 cuda drivers (#3)

This commit is contained in:
Ace Eldeib 2022-09-15 14:53:19 -04:00 коммит произвёл GitHub
Родитель 8e58eb36d8
Коммит 4e74c8d456
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
10 изменённых файлов: 273 добавлений и 52 удалений

89
.github/workflows/ci.yaml поставляемый
Просмотреть файл

@ -6,11 +6,12 @@ on:
workflow_dispatch: {}
jobs:
build:
cuda:
runs-on: ubuntu-latest
strategy:
matrix:
driver_version: ["470.57.02", "510.47.03"]
driver_version: ["470.82.01", "510.47.03", "515.65.01"]
driver_kind: ["cuda"]
steps:
- uses: actions/checkout@v2
with:
@ -27,7 +28,7 @@ jobs:
- uses: paulhatch/semantic-version@v5.0.0-alpha2
with:
bump_each_commit: false
version_format: "${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
id: semver
- name: 'Check version'
run: |
@ -38,7 +39,87 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
docker images
- name: Move cache
run: |
rm -r /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
# CI job: builds the GRID 470.82.01 driver image with buildx.
# The image is only loaded into the local Docker daemon (--output=type=docker);
# this job performs no registry login and no push.
grid470:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      # Single-entry matrix; the values act as named parameters for the steps.
      driver_version: ["470.82.01"]
      driver_kind: ["grid"]
      driver_url: ["https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"]
  steps:
    - uses: actions/checkout@v2
      with:
        # Full history; presumably required by the semantic-version step below.
        fetch-depth: 0
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v1
    - name: Cache Docker layers
      uses: actions/cache@v2
      with:
        path: /tmp/.buildx-cache
        # Exact key is per commit; restore-keys falls back to the newest cache
        # for the same driver kind and version.
        key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
    - uses: paulhatch/semantic-version@v5.0.0-alpha2
      with:
        bump_each_commit: false
        # NOTE(review): ${GITHUB_SHA:0:6} is bash substring syntax; presumably
        # it is passed through literally and expanded by the shell in the run
        # steps below -- confirm.
        version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
      id: semver
    - name: 'Check version'
      run: |
        echo "version is ${{ steps.semver.outputs.version }}"
        echo "version is ${{ steps.semver.outputs.version_tag }}"
    - name: 'Build and Push'
      run: |
        set -x
        echo "tag is: "
        echo ${{ steps.semver.outputs.version }}
        # Read the restored layer cache and write the updated cache to a
        # separate directory, which the next step swaps into place.
        docker buildx build \
          --build-arg DRIVER_URL=${{ matrix.driver_url }} \
          --build-arg DRIVER_KIND=${{ matrix.driver_kind }} \
          --build-arg DRIVER_VERSION=${{ matrix.driver_version }} \
          --cache-from=type=local,src=/tmp/.buildx-cache \
          --cache-to=type=local,dest=/tmp/.buildx-cache-new \
          --output=type=docker \
          -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} \
          .
        docker images
    - name: Move cache
      run: |
        # -f: /tmp/.buildx-cache does not exist when no cache was restored;
        # a plain `rm -r` would fail the job in that case.
        rm -rf /tmp/.buildx-cache
        mv /tmp/.buildx-cache-new /tmp/.buildx-cache
grid510:
runs-on: ubuntu-latest
strategy:
matrix:
driver_version: ["510.73.08"]
driver_kind: ["grid"]
driver_url: ["https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"]
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Cache Docker layers
uses: actions/cache@v2
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
- uses: paulhatch/semantic-version@v5.0.0-alpha2
with:
bump_each_commit: false
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
id: semver
- name: 'Check version'
run: |
echo "version is ${{ steps.semver.outputs.version }}"
echo "version is ${{ steps.semver.outputs.version_tag }}"
- name: 'Build and Push'
run: |
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_URL=${{ matrix.driver_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
docker images
- name: Move cache
run: |

103
.github/workflows/main.yaml поставляемый
Просмотреть файл

@ -9,11 +9,12 @@ permissions:
contents: read
jobs:
publish:
cuda:
runs-on: ubuntu-latest
strategy:
matrix:
driver_version: ["470.57.02", "510.47.03"]
driver_version: ["470.82.01", "510.47.03", "515.65.01"]
driver_kind: ["cuda"]
steps:
- uses: actions/checkout@v2
with:
@ -30,29 +31,103 @@ jobs:
- uses: paulhatch/semantic-version@v5.0.0-alpha2
with:
bump_each_commit: false
version_format: "${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
id: semver
- name: 'Check version'
run: |
echo "version is ${{ steps.semver.outputs.version }}"
echo "version is ${{ steps.semver.outputs.version_tag }}"
- name: 'Azure CLI login'
uses: azure/login@v1
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: 'Build and Push'
run: |
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ matrix.driver_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} .
docker images
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }}
echo "acr push done"
- name: Move cache
run: |
rm -r /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
# Builds the GRID 470.82.01 driver image with buildx.
# NOTE(review): the step is named 'Build and Push', but this job performs no
# registry login and no push (unlike the cuda job in this workflow); the image
# is only loaded into the local Docker daemon. Confirm this is intentional.
grid470:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      # Single-entry matrix; the values act as named parameters for the steps.
      driver_version: ["470.82.01"]
      driver_kind: ["grid"]
      driver_url: ["https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"]
  steps:
    - uses: actions/checkout@v2
      with:
        # Full history; presumably required by the semantic-version step below.
        fetch-depth: 0
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v1
    - name: Cache Docker layers
      uses: actions/cache@v2
      with:
        path: /tmp/.buildx-cache
        # Exact key is per commit; restore-keys falls back to the newest cache
        # for the same driver kind and version.
        key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
    - uses: paulhatch/semantic-version@v5.0.0-alpha2
      with:
        bump_each_commit: false
        # NOTE(review): ${GITHUB_SHA:0:6} is bash substring syntax; presumably
        # it is passed through literally and expanded by the shell in the run
        # steps below -- confirm.
        version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
      id: semver
    - name: 'Check version'
      run: |
        echo "version is ${{ steps.semver.outputs.version }}"
        echo "version is ${{ steps.semver.outputs.version_tag }}"
    - name: 'Build and Push'
      run: |
        set -x
        echo "tag is: "
        echo ${{ steps.semver.outputs.version }}
        # Read the restored layer cache and write the updated cache to a
        # separate directory, which the next step swaps into place.
        docker buildx build \
          --build-arg DRIVER_URL=${{ matrix.driver_url }} \
          --build-arg DRIVER_KIND=${{ matrix.driver_kind }} \
          --build-arg DRIVER_VERSION=${{ matrix.driver_version }} \
          --cache-from=type=local,src=/tmp/.buildx-cache \
          --cache-to=type=local,dest=/tmp/.buildx-cache-new \
          --output=type=docker \
          -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} \
          .
        docker images
    - name: Move cache
      run: |
        # -f: /tmp/.buildx-cache does not exist when no cache was restored;
        # a plain `rm -r` would fail the job in that case.
        rm -rf /tmp/.buildx-cache
        mv /tmp/.buildx-cache-new /tmp/.buildx-cache
# Builds the GRID 510.73.08 driver image with buildx and logs in to the
# container registry. The final `docker push` is left commented out, so
# nothing is published by this job.
grid510:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      # Single-entry matrix; the values act as named parameters for the steps.
      driver_version: ["510.73.08"]
      driver_kind: ["grid"]
      driver_url: ["https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"]
  steps:
    - uses: actions/checkout@v2
      with:
        # Full history; presumably required by the semantic-version step below.
        fetch-depth: 0
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v1
    - name: Cache Docker layers
      uses: actions/cache@v2
      with:
        path: /tmp/.buildx-cache
        # Exact key is per commit; restore-keys falls back to the newest cache
        # for the same driver kind and version.
        key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ matrix.driver_version }}
    - uses: paulhatch/semantic-version@v5.0.0-alpha2
      with:
        bump_each_commit: false
        # NOTE(review): ${GITHUB_SHA:0:6} is bash substring syntax; presumably
        # it is passed through literally and expanded by the shell in the run
        # steps below -- confirm.
        version_format: "${{ matrix.driver_kind}}-${{ matrix.driver_version }}-sha-${GITHUB_SHA:0:6}"
      id: semver
    - name: 'Check version'
      run: |
        echo "version is ${{ steps.semver.outputs.version }}"
        echo "version is ${{ steps.semver.outputs.version_tag }}"
    # `az acr login` below needs an authenticated Azure CLI session; this
    # mirrors the login step of the cuda job in this workflow.
    # NOTE(review): assumes the workflow grants the permissions azure/login
    # needs for this credential type -- confirm.
    - name: 'Azure CLI login'
      uses: azure/login@v1
      with:
        client-id: ${{ secrets.AZURE_CLIENT_ID }}
        tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    - name: 'Build and Push'
      run: |
        set -x
        echo "tag is: "
        echo ${{ steps.semver.outputs.version }}
        # Read the restored layer cache and write the updated cache to a
        # separate directory, which the next step swaps into place.
        docker buildx build \
          --build-arg DRIVER_URL=${{ matrix.driver_url }} \
          --build-arg DRIVER_KIND=${{ matrix.driver_kind }} \
          --build-arg DRIVER_VERSION=${{ matrix.driver_version }} \
          --cache-from=type=local,src=/tmp/.buildx-cache \
          --cache-to=type=local,dest=/tmp/.buildx-cache-new \
          --output=type=docker \
          -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} \
          .
        docker images
        az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
        # Push is disabled for this image; re-enable the line below to publish.
        # docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }}
    - name: Move cache
      run: |
        # -f: /tmp/.buildx-cache does not exist when no cache was restored;
        # a plain `rm -r` would fail the job in that case.
        rm -rf /tmp/.buildx-cache
        mv /tmp/.buildx-cache-new /tmp/.buildx-cache

11
10-nvidia-runtime.toml Normal file
Просмотреть файл

@ -0,0 +1,11 @@
# containerd CRI plugin configuration that registers the NVIDIA container
# runtime and makes it the default runtime.
[plugins."io.containerd.grpc.v1.cri"]
[plugins."io.containerd.grpc.v1.cri".containerd]
# Name of the runtime entry (defined below) to use by default.
default_runtime_name = "nvidia-container-runtime"
# Runtime entry: the runc v2 shim, with the binary overridden to the NVIDIA
# container runtime.
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
# The "untrusted" runtime entry is configured identically, so it also runs
# through the NVIDIA container runtime binary.
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options]
BinaryName = "/usr/bin/nvidia-container-runtime"

Просмотреть файл

@ -5,8 +5,11 @@ FROM mcr.microsoft.com/mirror/docker/library/ubuntu:${distro} as gpu
RUN apt update && apt install -y curl xz-utils gnupg2 ca-certificates gettext-base --no-install-recommends
ARG DRIVER_VERSION
ARG DRIVER_URL
ARG DRIVER_KIND="cuda"
WORKDIR /opt/gpu
COPY 10-nvidia-runtime.toml 10-nvidia-runtime.toml
COPY blacklist-nouveau.conf blacklist-nouveau.conf
COPY fm_run_package_installer.sh fm_run_package_installer.sh
COPY config.sh config.sh

Просмотреть файл

@ -1,4 +1,5 @@
DRIVER_VERSION="${DRIVER_VERSION}"
DRIVER_KIND="${DRIVER_KIND}"
NVIDIA_DOCKER_VERSION="2.8.0-1"
NVIDIA_CONTAINER_RUNTIME_VERSION="3.11.0"
NVIDIA_CONTAINER_TOOLKIT_VER="1.11.0"

Просмотреть файл

@ -7,21 +7,33 @@ source /opt/gpu/config.sh
workdir="$(mktemp -d)"
pushd "$workdir" || exit
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}"
curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${RUNFILE}.run
elif [[ "${DRIVER_KIND}" == "grid" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
curl -fsSLO "${DRIVER_URL}"
else
echo "Invalid driver kind: ${DRIVER_KIND}"
exit 1
fi
# download nvidia drivers, move to permanent cache
curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
mv NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
# TODO: reenable this, it saves like 30sec. but it pushes vhd to capacity and starts to fail image pulls :(
mv ${RUNFILE}.run /opt/gpu/${RUNFILE}.run
pushd /opt/gpu
# extract runfile, takes some time, so do ahead of time
sh /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run -x
rm /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run
sh /opt/gpu/${RUNFILE}.run -x
rm /opt/gpu/${RUNFILE}.run
popd
# download fabricmanager for nvlink based systems, e.g. multi instance gpu vms.
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
tar -xvf fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
mv fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}
mv /opt/gpu/fm_run_package_installer.sh /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
# download fabricmanager for nvlink based systems, e.g. multi instance gpu vms.
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
tar -xvf fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz
mv fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}
mv /opt/gpu/fm_run_package_installer.sh /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
fi
# configure nvidia apt repo to cache packages
curl -fsSLO https://nvidia.github.io/nvidia-docker/gpgkey

Просмотреть файл

@ -5,6 +5,8 @@ set -o nounset
set -x
sleep="${2:-}"
if [[ -z "${1}" ]]; then
echo "Must provide a non-empty action as first argument"
exit 1
@ -51,3 +53,11 @@ else
echo "Failed during nsenter command execution"
exit 1
fi
if [[ -z "${sleep}" ]]; then
exit 0
fi
echo "Sleeping forever"
sleep infinity

Просмотреть файл

@ -8,8 +8,17 @@ PS4='+ $(date -u -I"seconds" | cut -c1-19) '
KERNEL_NAME=$(uname -r)
LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log"
set +euo pipefail
open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
echo "Open devices: $open_devices"
open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
echo "Open gridd: $open_gridd"
set -euo pipefail
# host needs these tools to build and load kernel module, can remove ca-certificates, was only for testing
apt update && apt install -y kmod gcc make dkms initramfs-tools ca-certificates linux-headers-$(uname -r) --no-install-recommends
apt install -y kmod gcc make dkms initramfs-tools ca-certificates linux-headers-$(uname -r) --no-install-recommends
# install cached nvidia debian packages for container runtime compatibility
for apt_package in $NVIDIA_PACKAGES; do
@ -25,7 +34,7 @@ update-initramfs -u
set +e
umount -l /usr/lib/x86_64-linux-gnu || true
umount -l /tmp/overlay || true
rm -r /tmp/overlay
rm -r /tmp/overlay || true
set -e
# set up overlayfs to change install location of nvidia libs from /usr/lib/x86_64-linux-gnu to /usr/local/nvidia
@ -36,9 +45,18 @@ mkdir /tmp/overlay/{workdir,lib64}
mkdir -p ${GPU_DEST}/lib64
mount -t overlay overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/x86_64-linux-gnu
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}"
elif [[ "${DRIVER_KIND}" == "grid" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
else
echo "Invalid driver kind: ${DRIVER_KIND}"
exit 1
fi
# install nvidia drivers
pushd /opt/gpu
/opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}"
/opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}"
popd
# move nvidia libs to correct location from temporary overlayfs
@ -66,7 +84,11 @@ nvidia-smi
cp -r /opt/gpu/nvidia-docker2_${NVIDIA_DOCKER_VERSION}/* /usr/
# install fabricmanager for nvlink based systems
bash /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
bash /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
fi
mkdir -p /etc/containerd/config.d
cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
du -hs /opt/gpu
rm -r /opt/gpu

Просмотреть файл

@ -1,13 +1,26 @@
nv_470_driver := "470.57.02"
nv_510_driver := "510.47.03"
grid_470_url := "https://download.microsoft.com/download/a/3/c/a3c078a0-e182-4b61-ac9b-ac011dc6ccf4/NVIDIA-Linux-x86_64-470.82.01-grid-azure.run"
grid_510_url := "https://download.microsoft.com/download/6/2/5/625e22a0-34ea-4d03-8738-a639acebc15e/NVIDIA-Linux-x86_64-510.73.08-grid-azure.run"
grid_510_driver := "510.73.08"
grid_470_driver := "470.82.01"
cuda_510_driver := "510.47.03"
cuda_470_driver := "470.82.01"
cuda_515_driver := "515.65.01"
registry := "docker.io/alexeldeib"
default: push
default:
push: (build)
docker push {{ registry }}/aks-gpu:{{ nv_470_driver }}
docker push {{ registry }}/aks-gpu:{{ nv_510_driver }}
pushallcuda: (pushcuda cuda_515_driver) (pushcuda cuda_510_driver) (pushcuda cuda_470_driver)
build:
docker build --build-arg DRIVER_VERSION={{ nv_470_driver }} -f Dockerfile -t {{ registry }}/aks-gpu:{{ nv_470_driver }} .
docker build --build-arg DRIVER_VERSION={{ nv_510_driver }} -f Dockerfile -t {{ registry }}/aks-gpu:{{ nv_510_driver }} .
pushallgrid: (pushgrid grid_510_driver grid_510_url) #(pushgrid grid_470_driver grid_470_url)
pushcuda VERSION: (buildcuda VERSION)
docker push {{ registry }}/aks-gpu:{{VERSION}}-cuda
pushgrid VERSION URL: (buildgrid VERSION URL)
docker push {{ registry }}/aks-gpu:{{VERSION}}-grid
buildgrid VERSION URL:
docker build --build-arg DRIVER_URL={{URL}} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION={{VERSION}} -f Dockerfile -t {{ registry }}/aks-gpu:{{VERSION}}-grid .
buildcuda VERSION:
docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION={{VERSION}} -f Dockerfile -t {{ registry }}/aks-gpu:{{VERSION}}-cuda .

Просмотреть файл

@ -21,17 +21,15 @@ spec:
# - key: node.kubernetes.io/instance-type
# operator: In
# values:
# - Standard_NP10s
# - Standard_NP20s
# - Standard_NP40s
# - Standard_NV6ads_A10_v5
hostNetwork: true
hostPID: true
containers:
- image: docker.io/alexeldeib/aks-gpu:latest # requires an image with bash, curl, sleep, and nsenter (vanilla ubuntu works)
- image: mcr.microsoft.com/aks/aks-gpu:${TAG}
imagePullPolicy: Always
name: *name
command: ["/entrypoint.sh"]
args: ["install.sh"] # if you don't use my image or build one from Dockerfile, set this to "downloadandinstall"
args: ["install", "sleep"]
resources:
requests:
{}
@ -40,8 +38,6 @@ spec:
securityContext:
privileged: true
volumeMounts:
- name: actions
mountPath: "/opt/actions"
- name: hostmount
mountPath: "/mnt/actions"
- name: gpu
@ -55,7 +51,4 @@ spec:
hostPath:
path: /opt/actions
type: DirectoryOrCreate
- name: actions
configMap:
name: nsenter-actions
---