Adding the environment files

2022-05-27 16:43:03 +08:00 · 2022-05-27 16:43:03 +08:00 · 2fffc67400
--- a/cv/aml-cli-v2/data-science/src/environment/Dockerfile
+++ b/cv/aml-cli-v2/data-science/src/environment/Dockerfile
@ -0,0 +1,46 @@
+# Base image: NVIDIA NGC PyTorch container. Check the release notes for what each tag ships: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
+FROM nvcr.io/nvidia/pytorch:22.04-py3
+
+##############################################################################
+# NCCL TESTS
+##############################################################################
+ENV NCCL_TESTS_TAG=v2.11.0
+# Build nccl-tests at the pinned tag (shallow clone: only that tag is needed), copy the build output to /usr/local/bin, and drop the sources in the same layer.
+# NOTE: adding gencodes to support K80, M60, V100, A100
+RUN mkdir /tmp/nccltests && \
+    cd /tmp/nccltests && \
+    git clone --depth 1 -b "${NCCL_TESTS_TAG}" https://github.com/NVIDIA/nccl-tests.git && \
+    cd nccl-tests && \
+    make \
+    MPI=1 MPI_HOME=/opt/hpcx/ompi \
+    NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
+    CUDA_HOME=/usr/local/cuda && \
+    cp ./build/* /usr/local/bin && \
+    rm -rf /tmp/nccltests
+
+# Install dependencies missing in this container (--no-cache-dir keeps pip's download cache out of the image layer)
+# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+# RUN python -m pip install   azureml-defaults==1.41.0 \
+#     mlflow==1.25.1 \
+#     azureml-mlflow==1.41.0 \
+#     transformers==4.18.0 \
+#     psutil==5.9.0
+
+# add ndv4-topo.xml (the trailing slash makes COPY create /opt/microsoft/ if it is missing,
+# so no separate mkdir layer is needed; COPY is preferred over ADD for plain local files)
+COPY ./ndv4-topo.xml /opt/microsoft/
+
+# to use on A100, enable env var below in your job
+# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
+
+# NCCL logging: verbosity level and the subsystems that are allowed to emit messages
+ENV NCCL_DEBUG="INFO" \
+    NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
+
+# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments; the other two pin CUDA device ordering and the NCCL socket interface.
+ENV NCCL_IB_PCI_RELAXED_ORDERING="1" \
+    CUDA_DEVICE_ORDER="PCI_BUS_ID" \
+    NCCL_SOCKET_IFNAME="eth0"
--- a/cv/aml-cli-v2/data-science/src/environment/env.yaml
+++ b/cv/aml-cli-v2/data-science/src/environment/env.yaml
@ -0,0 +1,22 @@
+$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
+name: nvidia_pytorch
+version: 22.04-py3
+build:
+  path: .
+tags:  # version values are quoted so YAML keeps them as strings (unquoted 2.10 would load as the float 2.1)
+  os: ubuntu
+  os_version: "20.04"
+  hpcx: "2.10"
+  mpi: openmpi
+  mpi_version: "4.1.2rc4"
+  ucx: "1.12.0"
+  cuda: "11.6.2"
+  cudnn: "8.4.0.27"
+  nccl: "2.12.10"
+  rdma_core: "36.0"
+  nsight_compute: "2022.1.1.2"
+  nsight_systems: "2022.2.1.31-5fe97ab"
+  nccl_test: "2.11.0"
+  azureml-defaults: "1.41.0"
+  mlflow: "1.25.1"
+  transformers: "4.18.0"
--- a/cv/aml-cli-v2/data-science/src/environment/ndv4-topo.xml
+++ b/cv/aml-cli-v2/data-science/src/environment/ndv4-topo.xml
@ -0,0 +1,35 @@
+<!-- NCCL topology file for Azure NDv4 VMs, copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh; the Dockerfile installs it under /opt/microsoft and jobs opt in by setting NCCL_TOPO_FILE. -->
+<system version="1">
+  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+</system>
--- a/cv/aml-cli-v2/data-science/src/environment/requirements.txt
+++ b/cv/aml-cli-v2/data-science/src/environment/requirements.txt
@ -0,0 +1,14 @@
+# for local testing (cpu). NOTE(review): the Dockerfile also pip-installs this file into the NGC PyTorch image, so the torch/torchvision pins may replace the builds shipped there; confirm this is intended
+torchvision==0.12.0
+torch==1.11.0
+transformers==4.18.0
+
+# metrics reporting and plotting
+mlflow==1.25.1
+azureml-mlflow==1.41.0
+matplotlib==3.5.2
+tqdm==4.64.0
+psutil==5.9.0
+
+# unit testing
+pytest==7.1.2