This commit is contained in:
Setu Chokshi 2022-05-27 16:43:03 +08:00
Родитель e68f2b5644
Коммит 2fffc67400
4 изменённых файлов: 117 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,46 @@
# Base image: NVIDIA's PyTorch container, pinned to the 22.04 release tag.
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3

##############################################################################
# NCCL TESTS
##############################################################################
# Git tag of NVIDIA/nccl-tests that gets built below.
ENV NCCL_TESTS_TAG=v2.11.0

# Clone, build and install the nccl-tests binaries into /usr/local/bin.
# - Everything happens in one RUN so the source tree is removed in the same
#   layer that created it and never ends up in the image.
# - The clone is shallow (--depth 1): only the tagged revision is needed and
#   the checkout is deleted right after the build.
# - `cd` is used instead of WORKDIR on purpose, so the image's working
#   directory (the destination of later relative COPY instructions) is untouched.
# NOTE: adding gencodes to support K80, M60, V100, A100
RUN mkdir -p /tmp/nccltests && \
    cd /tmp/nccltests && \
    git clone --depth 1 -b "${NCCL_TESTS_TAG}" https://github.com/NVIDIA/nccl-tests.git && \
    cd nccl-tests && \
    make \
        MPI=1 MPI_HOME=/opt/hpcx/ompi \
        NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
        CUDA_HOME=/usr/local/cuda && \
    cp ./build/* /usr/local/bin && \
    rm -rf /tmp/nccltests
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
# NOTE(review): requirements.txt also pins torch / torchvision; confirm that
# installing them here does not replace the PyTorch build shipped in the base image.
COPY requirements.txt ./
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
# Earlier inline install, kept for reference:
# RUN python -m pip install azureml-defaults==1.41.0 \
# mlflow==1.25.1 \
# azureml-mlflow==1.41.0 \
# transformers==4.18.0 \
# psutil==5.9.0
# add ndv4-topo.xml
# COPY (not ADD) is used because this is a plain local file. The trailing slash
# marks /opt/microsoft/ as a directory, which COPY creates when it is missing,
# so no separate `RUN mkdir` layer is required.
COPY ./ndv4-topo.xml /opt/microsoft/
# to use on A100, enable env var below in your job
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
# Runtime defaults baked into the image, set in a single ENV instruction:
#   NCCL_DEBUG / NCCL_DEBUG_SUBSYS : adjust the level of info from NCCL tests.
#   NCCL_IB_PCI_RELAXED_ORDERING   : Relaxed Ordering can greatly help the
#                                    performance of Infiniband networks in
#                                    virtualized environments.
#   CUDA_DEVICE_ORDER, NCCL_SOCKET_IFNAME : device ordering and socket interface.
ENV NCCL_DEBUG="INFO" \
    NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV" \
    NCCL_IB_PCI_RELAXED_ORDERING="1" \
    CUDA_DEVICE_ORDER="PCI_BUS_ID" \
    NCCL_SOCKET_IFNAME="eth0"

Просмотреть файл

@ -0,0 +1,22 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
# Azure ML environment definition; the image is built from the Docker build
# context given under `build.path`.
name: nvidia_pytorch
version: 22.04-py3
build:
  path: .
# Informational tags listing component versions (presumably mirroring the base
# image release notes and the pinned Python packages; verify when bumping).
# Values are quoted so YAML keeps them as strings: an unquoted 2.10 would be
# loaded as the float 2.1 and lose its trailing zero.
tags:
  os: ubuntu
  os_version: "20.04"
  hpcx: "2.10"
  mpi: openmpi
  mpi_version: "4.1.2rc4"
  ucx: "1.12.0"
  cuda: "11.6.2"
  cudnn: "8.4.0.27"
  nccl: "2.12.10"
  rdma_core: "36.0"
  nsight_compute: "2022.1.1.2"
  nsight_systems: "2022.2.1.31-5fe97ab"
  nccl_test: "2.11.0"
  azureml-defaults: "1.41.0"
  mlflow: "1.25.1"
  transformers: "4.18.0"

Просмотреть файл

@ -0,0 +1,35 @@
<!--
  NCCL topology description for NDv4 virtual machines.
  This topology file was copied from
  https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh

  Layout: four NUMA nodes (numaid 0 to 3). Each node has one bridge
  (PCI class 0x060400, PCI-to-PCI bridge) with four devices behind it:
  two of class 0x030200 (3D controller) and two of class 0x020700
  (InfiniBand controller), listed in alternating order.
-->
<system version="1">
  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
</system>

Просмотреть файл

@ -0,0 +1,14 @@
# Frameworks, pinned for local testing (cpu)
torch==1.11.0
torchvision==0.12.0
transformers==4.18.0

# Metrics reporting and plotting
azureml-mlflow==1.41.0
matplotlib==3.5.2
mlflow==1.25.1
psutil==5.9.0
tqdm==4.64.0

# Unit testing
pytest==7.1.2