Adding the environment files
This commit is contained in:
Родитель
e68f2b5644
Коммит
2fffc67400
|
@ -0,0 +1,46 @@
|
|||
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
|
||||
# Base image: NVIDIA's PyTorch container from nvcr.io, pinned to the 22.04-py3 tag.
FROM nvcr.io/nvidia/pytorch:22.04-py3
|
||||
|
||||
##############################################################################
|
||||
# NCCL TESTS
|
||||
##############################################################################
|
||||
# Git tag of https://github.com/NVIDIA/nccl-tests to build.
ENV NCCL_TESTS_TAG=v2.11.0

# Build nccl-tests at the pinned tag (MPI=1, using the MPI under /opt/hpcx/ompi)
# and copy the build output into /usr/local/bin.
# NOTE: adding gencodes to support K80, M60, V100, A100
# The clone is shallow (--depth 1) because only the tagged commit is needed.
# The scratch directory is removed in the same RUN so neither the sources nor
# the build tree persist in an image layer.
RUN mkdir -p /tmp/nccltests && \
    cd /tmp/nccltests && \
    git clone --depth 1 -b "${NCCL_TESTS_TAG}" https://github.com/NVIDIA/nccl-tests.git && \
    cd nccl-tests && \
    make \
        MPI=1 MPI_HOME=/opt/hpcx/ompi \
        NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
        CUDA_HOME=/usr/local/cuda && \
    cp ./build/* /usr/local/bin && \
    rm -rf /tmp/nccltests
|
||||
|
||||
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
# NOTE(review): requirements.txt also pins torch/torchvision; confirm that
# installing them here is intended and does not replace the PyTorch build
# shipped in the base image.
COPY requirements.txt ./
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# RUN python -m pip install azureml-defaults==1.41.0 \
|
||||
# mlflow==1.25.1 \
|
||||
# azureml-mlflow==1.41.0 \
|
||||
# transformers==4.18.0 \
|
||||
# psutil==5.9.0
|
||||
|
||||
# add ndv4-topo.xml
# COPY (rather than ADD) is the explicit instruction for a plain local file.
# The trailing slash makes the destination a directory, which Docker creates
# when it does not exist, so no separate `RUN mkdir` layer is needed.
COPY ./ndv4-topo.xml /opt/microsoft/
|
||||
|
||||
# to use on A100, enable env var below in your job
|
||||
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
|
||||
|
||||
# NCCL logging: sets the level of info reported by the NCCL tests and the
# subsystems it is reported for.
ENV NCCL_DEBUG="INFO" \
    NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"

# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1" \
    CUDA_DEVICE_ORDER="PCI_BUS_ID" \
    NCCL_SOCKET_IFNAME="eth0"
|
|
@ -0,0 +1,22 @@
|
|||
# AzureML environment specification (see $schema) for the image built from
# the Docker build context given under `build.path`.
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nvidia_pytorch
version: 22.04-py3
build:
  path: .
# Informational tags recording the component versions of the image.
# Version values are quoted so they stay strings: an unquoted value with a
# single dot (e.g. 2.10, 20.04, 36.0) is resolved as a YAML float, which
# would turn 2.10 into 2.1.
tags:
  os: ubuntu
  os_version: "20.04"
  hpcx: "2.10"
  mpi: openmpi
  mpi_version: "4.1.2rc4"
  ucx: "1.12.0"
  cuda: "11.6.2"
  cudnn: "8.4.0.27"
  nccl: "2.12.10"
  rdma_core: "36.0"
  nsight_compute: "2022.1.1.2"
  nsight_systems: "2022.2.1.31-5fe97ab"
  nccl_test: "2.11.0"
  azureml-defaults: "1.41.0"
  mlflow: "1.25.1"
  transformers: "4.18.0"
|
|
@ -0,0 +1,35 @@
|
|||
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
<system version="1">
  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
</system>
|
|
@ -0,0 +1,14 @@
|
|||
# Model stack, pinned for local (CPU) testing.
# NOTE(review): the Dockerfile in this commit also installs this file with
# `pip install -r requirements.txt`; confirm these torch/torchvision pins
# are intended inside the container as well.
torchvision==0.12.0
torch==1.11.0
transformers==4.18.0

# Metrics reporting and plotting.
mlflow==1.25.1
azureml-mlflow==1.41.0
matplotlib==3.5.2
tqdm==4.64.0
psutil==5.9.0

# Unit testing.
pytest==7.1.2
|
Загрузка…
Ссылка в новой задаче