Remove unused orttraining amd dockerfiles and scripts (#12707)

2022-09-02 18:43:21 -07:00 · 2022-09-02 18:43:21 -07:00 · 9e47eb68e0
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.10.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.10.pytorch
@@ -1,195 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.10.pytorch --tag ort:rocm3.10-pytorch .
-
-FROM rocm/pytorch:rocm3.10_ubuntu18.04_py3.6_pytorch
-
-RUN apt-get -y install gpg-agent
-RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.9/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
-
-RUN apt-get -y update
-RUN apt-get -y install apt-utils
-RUN apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar git bash pbzip2 pv bzip2 unzip cabextract \
-    g++ gcc \
-    && apt-get autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-ENV OLD_PATH=${PATH}
-ENV PATH=/usr/bin:${PATH}
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf -
-RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \
-    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
-    cd .. && \
-    rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-
-ENV PATH=${OLD_PATH}
-ENV unset OLD_PATH
-
-# python env
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py (be sure to link existing /opt/openmpi-xxx)
-RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or Restart sshd service
-ENTRYPOINT service ssh restart && /bin/bash
-
-# Add model and scripts
-ADD model ${WORKSPACE_DIR}/model
-ADD script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8
-RUN apt-get install -y locales
-RUN locale-gen en_US.UTF-8
-
-# Workaround an issue in AMD compiler which generates poor GPU ISA
-# when the type of kernel parameter is a structure and “pass-by-value” is used
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.7
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.7
@@ -1,205 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.7 --tag ort:rocm3.7-ort-dev .
-
-FROM rocm/tensorflow:rocm3.7-tf2.1-dev
-
-RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN cat /dev/null > /etc/apt/sources.list.d/rocm.list
-RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.7/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
-
-RUN apt-get -y update
-RUN apt-get -y install apt-utils
-RUN apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar git bash pbzip2 pv bzip2 cabextract \
-    g++ gcc \
-    && apt-get autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-# WORKSPACE_DIR
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf -
-RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \
-    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
-    cd .. && \
-    rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-
-# install miniconda (comes with python 3.7 default)
-ARG CONDA_VERSION=4.7.10
-ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
-RUN curl -fSsL --insecure ${CONDA_URL} -o install-conda.sh &&\
-    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
-    /opt/conda/bin/conda clean -ya
-ENV PATH=/opt/conda/bin:${PATH}
-
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN conda install -y \
-        numpy=${NUMPY_VERSION} \
-        cmake \
-        ninja \
-        pyyaml \
-        cffi \
-        setuptools \
-    && pip install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar \
-        git+https://github.com/NVIDIA/dllogger \
-        onnx=="${ONNX_VERSION}"
-
-# GITHUB_DIR
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py (be sure to link existing /opt/openmpi-xxx)
-RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config Release \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-	--enable_training \
-  && test -f $ORT_DIR/build/Release/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/Release/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT/ ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-
-# OpenBLAS
-WORKDIR $GITHUB_DIR
-ARG OpenBLAS_VERSION=0.3.10
-ENV OpenBLAS_DIR=$WORKSPACE_DIR/OpenBLAS-${OpenBLAS_VERSION}
-RUN git clone https://github.com/xianyi/OpenBLAS.git \
-  && cd OpenBLAS \
-  && git checkout v$OpenBLAS_VERSION \
-  && make TARGET=ZEN \
-  && make install PREFIX=$OpenBLAS_DIR
-
-# PyTorch
-RUN pip install pyyaml
-RUN for fn in $(find /opt/rocm/ -name \*.cmake ); do sed --in-place='~' 's/find_dependency(hip)/find_dependency(HIP)/' $fn ; done
-WORKDIR $GITHUB_DIR
-# ARG PYTORCH_VERSION=1.6.0
-# RUN git clone --recursive https://github.com/pytorch/pytorch.git \
-#   && cd pytorch \
-#   && git checkout v$PYTORCH_VERSION \
-#   && git submodule update --recursive \
-#   && python3 tools/amd_build/build_amd.py \
-#   && OpenBLAS_HOME=$OpenBLAS_DIR BLAS="OpenBLAS" RCCL_DIR=/opt/rocm/rccl/lib/cmake/rccl/ hip_DIR=/opt/rocm/hip/cmake/ PYTORCH_ROCM_ARCH=gfx906 USE_ROCM=ON USE_CUDA=OFF BUILD_CAFFE2_OPS=0 BUILD_TEST=0 python3 setup.py install
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or Restart sshd service
-ENTRYPOINT service ssh restart && /bin/bash
-
-# Add model and scripts
-ADD model ${WORKSPACE_DIR}/model
-ADD script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8
-RUN apt-get install -y locales
-RUN locale-gen en_US.UTF-8
-
-# Workaround an issue in AMD compiler which generates poor GPU ISA
-# when the type of kernel parameter is a structure and “pass-by-value” is used
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.7.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.7.pytorch
@@ -1,197 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.7.pytorch --tag ort:rocm3.7-pytorch .
-
-FROM rocm/pytorch:rocm3.7_ubuntu18.04_py3.6_pytorch
-#FROM rocm/pytorch:rocm3.7_ubuntu18.04_py3.6_pytorch_gcc
-
-RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN cat /dev/null > /etc/apt/sources.list.d/rocm.list
-RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.7/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
-
-RUN apt-get -y update
-RUN apt-get -y install apt-utils
-RUN apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar git bash pbzip2 pv bzip2 cabextract \
-    g++ gcc \
-    && apt-get autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-RUN update-alternatives --remove-all python && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1
-
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf -
-RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \
-    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
-    cd .. && \
-    rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-
-RUN update-alternatives --remove-all python && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.6 1
-
-# python env
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py (be sure to link existing /opt/openmpi-xxx)
-RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or Restart sshd service
-ENTRYPOINT service ssh restart && /bin/bash
-
-# Add model and scripts
-ADD model ${WORKSPACE_DIR}/model
-ADD script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8
-RUN apt-get install -y locales
-RUN locale-gen en_US.UTF-8
-
-# Workaround an issue in AMD compiler which generates poor GPU ISA
-# when the type of kernel parameter is a structure and “pass-by-value” is used
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.8.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.8.pytorch
@@ -1,196 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.8.pytorch --tag ort:rocm3.8-pytorch .
-
-FROM rocm/pytorch:rocm3.8_ubuntu18.04_py3.6_pytorch
-
-RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN cat /dev/null > /etc/apt/sources.list.d/rocm.list
-RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.8/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
-
-RUN apt-get -y update
-RUN apt-get -y install apt-utils
-RUN apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar git bash pbzip2 pv bzip2 cabextract \
-    g++ gcc \
-    && apt-get autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-RUN update-alternatives --remove-all python && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1
-
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf -
-RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \
-    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
-    cd .. && \
-    rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-
-RUN update-alternatives --remove-all python && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.6 1
-
-# python env
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py (be sure to link existing /opt/openmpi-xxx)
-RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or Restart sshd service
-ENTRYPOINT service ssh restart && /bin/bash
-
-# Add model and scripts
-ADD model ${WORKSPACE_DIR}/model
-ADD script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8
-RUN apt-get install -y locales
-RUN locale-gen en_US.UTF-8
-
-# Workaround an issue in AMD compiler which generates poor GPU ISA
-# when the type of kernel parameter is a structure and “pass-by-value” is used
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.9
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.9
@ -1,207 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.9 --tag ort:rocm3.9-ort-dev .
-
-FROM rocm/tensorflow:rocm3.9-tf2.3-dev
-
-# ROCm apt repository (key and index over HTTPS), then the build toolchain; unzip is listed because the GPT2 dataset step calls it.
-# update+install share one layer so a stale cached index is never reused; autoremove gets -y so it cannot abort on a prompt.
-RUN wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/3.9/ xenial main' | tee /etc/apt/sources.list.d/rocm.list \
-    && apt-get -y update && apt-get -y install apt-utils \
-    && apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar bash pbzip2 pv bzip2 unzip cabextract \
-    g++ gcc \
-    && apt-get -y autoremove
-
-# Point /bin/sh at bash
-RUN ln -sf /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake: unpack the pinned release straight into /usr/local and put its bin directory on PATH
-ENV CMAKE_VERSION=3.18.2
-RUN wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz \
-    | tar -C /usr/local -zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-# WORKSPACE_DIR
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-# Download, install and delete the MOFED bundle in one layer so the unpacked installer never persists in the image
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
-    (cd ${MOFED_FILENAME} && \
-     ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc) && \
-    rm -r ${MOFED_FILENAME}
-
-# install miniconda ${CONDA_VERSION}; TLS verification stays enabled and the installer script is removed in the same layer
-ARG CONDA_VERSION=4.7.10
-ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
-RUN curl -fSsL ${CONDA_URL} -o install-conda.sh &&\
-    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
-    rm install-conda.sh && /opt/conda/bin/conda clean -ya
-ENV PATH=/opt/conda/bin:${PATH}
-
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN conda install -y \
-        numpy=${NUMPY_VERSION} \
-        cmake \
-        ninja \
-        pyyaml \
-        cffi \
-        setuptools \
-    && pip install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar \
-        git+https://github.com/NVIDIA/dllogger \
-        onnx=="${ONNX_VERSION}"
-
-# GITHUB_DIR
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default; exec replaces the wrapper shell with mpirun.real
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'exec mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py from source so it builds with the mpicc found on PATH ($OPENMPI_DIR/bin is prepended above)
-RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# Pull and install the nightly ROCm 3.9 PyTorch wheel package
-RUN pip3 install --pre torch -f https://download.pytorch.org/whl/nightly/rocm3.9/torch_nightly.html
-
-# ONNX Runtime Training Examples (pip caches and the downloaded dataset archive are not kept in the layer)
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install --no-cache-dir cerberus sympy packaging \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip && rm ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or restart sshd, then exec an interactive bash (exec form: no implicit /bin/sh -c wrapper left in front of the shell)
-ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
-
-# Add model and scripts (COPY: plain local copy; assumes neither source is a tar archive that ADD was expected to unpack)
-COPY model ${WORKSPACE_DIR}/model
-COPY script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8 (package index refreshed in the same layer that installs the package)
-RUN apt-get -y update && apt-get install -y --no-install-recommends locales \
-    && locale-gen en_US.UTF-8
-
-# Work around an AMD compiler issue that generates poor GPU ISA
-# when a kernel parameter is a structure passed by value
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm3.9.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm3.9.pytorch
@ -1,199 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm3.9.pytorch --tag ort:rocm3.9-pytorch .
-
-FROM rocm/pytorch:rocm3.9_ubuntu18.04_py3.6_pytorch
-
-# gpg-agent is installed ahead of apt-key, after an index refresh; the ROCm key and repository are fetched over HTTPS.
-# update+install share one layer so a stale cached index is never reused; autoremove gets -y so it cannot abort on a prompt.
-RUN apt-get -y update && apt-get -y install gpg-agent \
-    && wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/3.9/ xenial main' | tee /etc/apt/sources.list.d/rocm.list \
-    && apt-get -y update && apt-get -y install apt-utils \
-    && apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar bash pbzip2 pv bzip2 unzip cabextract \
-    g++ gcc \
-    && apt-get -y autoremove
-
-# Point /bin/sh at bash, then delete the cc/c++/gcc/g++ entries under /opt/cache/bin (presumably compiler wrappers from the base image)
-RUN ln -sf /bin/bash /bin/sh
-RUN rm /opt/cache/bin/c++ \
-       /opt/cache/bin/cc \
-       /opt/cache/bin/g++ \
-       /opt/cache/bin/gcc
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-ENV OLD_PATH=${PATH}
-ENV PATH=/usr/bin:${PATH}
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-# Download, install and delete the MOFED bundle in one layer so the unpacked installer never persists in the image
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
-    (cd ${MOFED_FILENAME} && \
-     ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc) && \
-    rm -r ${MOFED_FILENAME}
-
-# Restore PATH and blank the scratch variable: ENV cannot unset, and `ENV unset OLD_PATH` only defined a variable named "unset"
-ENV PATH=${OLD_PATH} OLD_PATH=""
-
-# python env
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default; exec replaces the wrapper shell with mpirun.real
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'exec mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py from source so it builds with the mpicc found on PATH ($OPENMPI_DIR/bin is prepended above)
-RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or restart sshd, then exec an interactive bash (exec form: no implicit /bin/sh -c wrapper left in front of the shell)
-ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
-
-# Add model and scripts (COPY: plain local copy; assumes neither source is a tar archive that ADD was expected to unpack)
-COPY model ${WORKSPACE_DIR}/model
-COPY script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8 (package index refreshed in the same layer that installs the package)
-RUN apt-get -y update && apt-get install -y --no-install-recommends locales \
-    && locale-gen en_US.UTF-8
-
-# Work around an AMD compiler issue that generates poor GPU ISA
-# when a kernel parameter is a structure passed by value
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch
@ -1,196 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm4.0.pytorch --tag ort:rocm4.0-pytorch .
-
-FROM rocm/pytorch:rocm4.0_ubuntu18.04_py3.6_pytorch
-
-# gpg-agent is installed ahead of apt-key, after an index refresh; the ROCm key and repository are fetched over HTTPS.
-# update+install share one layer so a stale cached index is never reused; autoremove gets -y so it cannot abort on a prompt.
-RUN apt-get -y update && apt-get -y install gpg-agent \
-    && wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.0/ xenial main' | tee /etc/apt/sources.list.d/rocm.list \
-    && apt-get -y update && apt-get -y install apt-utils \
-    && apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar bash pbzip2 pv bzip2 unzip cabextract \
-    g++ gcc \
-    && apt-get -y autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-ENV OLD_PATH=${PATH}
-ENV PATH=/usr/bin:${PATH}
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-# Download, install and delete the MOFED bundle in one layer so the unpacked installer never persists in the image
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
-    (cd ${MOFED_FILENAME} && \
-     ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc) && \
-    rm -r ${MOFED_FILENAME}
-
-# Restore PATH and blank the scratch variable: ENV cannot unset, and `ENV unset OLD_PATH` only defined a variable named "unset"
-ENV PATH=${OLD_PATH} OLD_PATH=""
-
-# python env
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default; exec replaces the wrapper shell with mpirun.real
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'exec mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py from source so it builds with the mpicc found on PATH ($OPENMPI_DIR/bin is prepended above)
-RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2020-12-06
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or restart sshd, then exec an interactive bash (exec form: no implicit /bin/sh -c wrapper left in front of the shell)
-ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
-
-# Add model and scripts (COPY: plain local copy; assumes neither source is a tar archive that ADD was expected to unpack)
-COPY model ${WORKSPACE_DIR}/model
-COPY script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8 (package index refreshed in the same layer that installs the package)
-RUN apt-get -y update && apt-get install -y --no-install-recommends locales \
-    && locale-gen en_US.UTF-8
-
-# Work around an AMD compiler issue that generates poor GPU ISA
-# when a kernel parameter is a structure passed by value
-ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-ENV NCCL_DEBUG=INFO
-ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm4.1.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.1.pytorch
@ -1,201 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm4.1.pytorch --tag ort:rocm4.1-pytorch .
-
-FROM rocm/pytorch:rocm4.1_ubuntu18.04_py3.6_pytorch
-
-# gpg-agent is installed ahead of apt-key, after an index refresh; the ROCm key and repository are fetched over HTTPS.
-# update+install share one layer so a stale cached index is never reused; autoremove gets -y so it cannot abort on a prompt.
-RUN apt-get -y update && apt-get -y install gpg-agent \
-    && wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.1/ xenial main' | tee /etc/apt/sources.list.d/rocm.list \
-    && apt-get -y update && apt-get -y install apt-utils \
-    && apt-get -y install build-essential autotools-dev \
-    make git curl vim wget rsync jq openssh-server openssh-client sudo \
-    iputils-ping net-tools ethtool libcap2 \
-    automake autoconf libtool flex doxygen \
-    perl lsb-release iproute2 pciutils graphviz \
-    bc tar bash pbzip2 pv bzip2 unzip cabextract \
-    g++ gcc \
-    && apt-get -y autoremove
-
-# sh
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
-# Labels for the docker
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-RUN cd /usr/local && \
-    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-ENV OLD_PATH=${PATH}
-ENV PATH=/usr/bin:${PATH}
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-# Download, install and delete the MOFED bundle in one layer so the unpacked installer never persists in the image
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
-    (cd ${MOFED_FILENAME} && \
-     ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc) && \
-    rm -r ${MOFED_FILENAME}
-
-# Restore PATH and blank the scratch variable: ENV cannot unset, and `ENV unset OLD_PATH` only defined a variable named "unset"
-ENV PATH=${OLD_PATH} OLD_PATH=""
-
-# python env
-RUN pip3 install --upgrade setuptools
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy==${NUMPY_VERSION} \
-        onnx=="${ONNX_VERSION}"
-
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install \
-  && cd .. \
-  && rm -rf build
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && cd .. \
-  && rm -rf build \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default; exec replaces the wrapper shell with mpirun.real
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'exec mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py from source so it builds with the mpicc found on PATH ($OPENMPI_DIR/bin is prepended above)
-RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
-
-ARG CACHE_DATA=2021-04-02
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config RelWithDebInfo \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install cerberus sympy \
-  && cd .. \
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip
-
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT
-ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
-ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
-ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Enable ssh access without password needed
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
-RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
-
-# Start or restart sshd, then exec an interactive bash (exec form: no implicit /bin/sh -c wrapper left in front of the shell)
-ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
-
-# Add model and scripts (COPY: plain local copy; assumes neither source is a tar archive that ADD was expected to unpack)
-COPY model ${WORKSPACE_DIR}/model
-COPY script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8 (package index refreshed in the same layer that installs the package)
-RUN apt-get -y update && apt-get install -y --no-install-recommends locales \
-    && locale-gen en_US.UTF-8
-
-# Work around an AMD compiler issue that generates poor GPU ISA
-# when a kernel parameter is a structure passed by value
-# ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-# ENV NCCL_DEBUG=INFO
-# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch
@ -1,207 +0,0 @@
-# docker build --network=host --file Dockerfile.rocm4.2.pytorch --tag ort:rocm4.2-pytorch .
-
-FROM rocm/pytorch:rocm4.2_ubuntu18.04_py3.6_pytorch_1.8.1
-
-# Register the ROCm 4.2 apt repository.
-# The signing key is fetched over https into a file rather than piped, so a
-# failed download fails the step instead of feeding apt-key an empty stream.
-RUN apt-get -y install gpg-agent \
-    && wget -q -O /tmp/rocm.gpg.key https://repo.radeon.com/rocm/rocm.gpg.key \
-    && apt-key add /tmp/rocm.gpg.key \
-    && rm /tmp/rocm.gpg.key \
-    && echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.2/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
-
-# Build tools and utilities.
-# `apt-get update` runs in the same layer as the installs so a cached, stale
-# package index is never reused. apt-utils is installed first, as before.
-# Packages are sorted alphabetically (the duplicate `git` entry is dropped), and
-# autoremove gets -y so it cannot stop at a confirmation prompt in a non-interactive build.
-RUN apt-get -y update \
-    && apt-get -y install apt-utils \
-    && apt-get -y install \
-        autoconf automake autotools-dev \
-        bash bc build-essential bzip2 \
-        cabextract curl \
-        doxygen \
-        ethtool \
-        flex \
-        g++ gcc git graphviz \
-        iproute2 iputils-ping \
-        jq \
-        libcap2 libtool lsb-release \
-        make \
-        net-tools \
-        openssh-client openssh-server \
-        pbzip2 pciutils perl pv \
-        rsync \
-        sudo \
-        tar \
-        unzip \
-        vim \
-        wget \
-    && apt-get -y autoremove
-
-# Point /bin/sh at bash (replaces the existing /bin/sh entry in one step)
-RUN ln -sf /bin/bash /bin/sh
-
-# Image metadata
-LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
-
-# CMake
-ENV CMAKE_VERSION=3.18.2
-# Unpack the binary release straight into /usr/local with `tar -C` (no `cd`).
-# /bin/sh is bash at this point in the file, so pipefail is available: a failed
-# download now fails the step instead of being masked by tar's exit status.
-RUN set -o pipefail \
-    && wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz \
-       | tar -xzf - -C /usr/local
-ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
-
-# Root directory for everything this image builds or downloads.
-# WORKDIR creates the directory when it does not exist yet.
-ENV WORKSPACE_DIR=/workspace
-WORKDIR ${WORKSPACE_DIR}
-
-# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
-ENV MOFED_VERSION=5.1-0.6.6.0
-ENV MOFED_OS=ubuntu18.04
-ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
-# /usr/bin is put at the front of PATH for this step only, via a shell export.
-# Dockerfile ENV cannot unset a variable: the former `ENV unset OLD_PATH`
-# defined a variable literally named `unset` and left OLD_PATH in the image.
-# Download, install and cleanup share one layer, so the extracted installer
-# does not remain in the image; pipefail makes a failed download fail the step.
-RUN export PATH=/usr/bin:${PATH} \
-    && set -o pipefail \
-    && curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - \
-    && cd ${MOFED_FILENAME} \
-    && ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc \
-    && cd .. \
-    && rm -r ${MOFED_FILENAME}
-
-# python env
-# Both pip invocations skip the download cache so it does not end up in the layer.
-RUN pip3 install --no-cache-dir --upgrade setuptools
-# numpy and onnx versions can be overridden with --build-arg
-ARG NUMPY_VERSION=1.18.5
-ARG ONNX_VERSION=1.7.0
-RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
-        git+https://github.com/NVIDIA/dllogger \
-        numpy=="${NUMPY_VERSION}" \
-        onnx=="${ONNX_VERSION}"
-
-# Directory that receives the git checkouts made by the steps below
-ENV GITHUB_DIR=$WORKSPACE_DIR/github
-RUN mkdir -p $GITHUB_DIR
-
-# UCX
-WORKDIR $GITHUB_DIR
-# NOTE(review): libnuma-dev is presumably a UCX build dependency — confirm
-RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
-ARG UCX_VERSION=1.9.0-rc3
-# Install prefix for UCX; the OpenMPI configure step receives it via --with-ucx
-ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
-# Check out tag v$UCX_VERSION, configure it without ROCm, KNEM and CUDA support,
-# and install into $UCX_DIR. Only the build directory is removed afterwards; the
-# cloned source tree stays under $GITHUB_DIR/ucx.
-RUN git clone https://github.com/openucx/ucx.git \
-  && cd ucx \
-  && git checkout v$UCX_VERSION \
-  && ./autogen.sh \
-  && mkdir build \
-  && cd build \
-  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
-  && make -j"$(nproc)" \
-  && make install \
-  && cd .. \
-  && rm -rf build
-
-# OpenMPI
-# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
-# note: disable verbs as we use ucx middleware and don't want btl openib warnings
-WORKDIR $GITHUB_DIR
-ARG OPENMPI_BASEVERSION=4.0
-ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
-# Install prefix for OpenMPI; its bin/ and lib/ are added to the search paths below
-ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
-# Check out tag v$OPENMPI_VERSION and build it against the UCX install in $UCX_DIR,
-# with the Fortran bindings disabled and the btl-uct component excluded.
-# The closing `test -f` fails the step if the mpic++ wrapper was not installed.
-# Only the build directory is removed; the cloned source tree stays in the image.
-RUN git clone --recursive https://github.com/open-mpi/ompi.git \
-  && cd ompi \
-  && git checkout v$OPENMPI_VERSION \
-  && ./autogen.pl \
-  && mkdir build \
-  && cd build \
-  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
-                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
-                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
-  && make -j"$(nproc)" \
-  && make install \
-  && cd .. \
-  && rm -rf build \
-  && ldconfig \
-  && test -f ${OPENMPI_DIR}/bin/mpic++
-
-# Make the OpenMPI build above visible to later build steps and to running containers
-ENV PATH=$OPENMPI_DIR/bin:${PATH}
-ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}
-
-# Create a wrapper for OpenMPI to allow running as root by default.
-# The original launcher is renamed to mpirun.real and a two-line bash script
-# takes its place. The script uses `exec`, so mpirun.real replaces the wrapper
-# shell: its exit status and any signals sent to `mpirun` apply to the real
-# launcher directly.
-RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
-    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
-    echo 'exec mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
-    chmod a+x $OPENMPI_DIR/bin/mpirun
-
-# install mpi4py from source (--no-binary) with mpicc as the compiler; mpicc is
-# found through PATH, which has $OPENMPI_DIR/bin at the front at this point
-RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
-
-# NOTE(review): CACHE_DATA is not referenced by any later instruction in this file;
-# presumably it is overridden with --build-arg to invalidate the cached layers
-# that follow — confirm
-ARG CACHE_DATA=2021-05-18
-
-# ONNX Runtime
-WORKDIR $GITHUB_DIR
-ENV ORT_DIR=$GITHUB_DIR/onnxruntime
-# Clone the repository (no tag or commit is pinned) and run a Release build with
-# ROCm, MPI and training enabled, producing a Python wheel; tests are skipped.
-# The `test -f` guard fails the step if the onnxruntime_training_bert binary is
-# missing; the wheel from build/Release/dist is then installed with pip.
-RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
-  && cd onnxruntime \
-  && python3 tools/ci_build/build.py \
-    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-    --build_dir build \
-    --config Release \
-    --parallel \
-    --skip_tests \
-    --build_wheel \
-    --use_rocm --rocm_home /opt/rocm \
-    --mpi_home $OPENMPI_DIR \
-    --nccl_home /opt/rocm \
-    --enable_training \
-  && test -f $ORT_DIR/build/Release/onnxruntime_training_bert \
-  && pip install $ORT_DIR/build/Release/dist/*.whl \
-  && ldconfig
-
-# ONNX Runtime Training Examples
-# Sets up two workloads under ${WORKSPACE_DIR}:
-#   BERT - NVIDIA's BERT example at commit cf54b787, overlaid with the nvidia-bert ort_addon files
-#   GPT2 - huggingface transformers at a pinned commit, patched with the
-#          huggingface-gpt2 ort_addon changes and installed in editable mode,
-#          plus the ${GPT2_DATASET} dataset extracted next to it
-WORKDIR $GITHUB_DIR
-ARG GPT2_DATASET=wikitext-103
-RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
-  && cd onnxruntime-training-examples \
-  # Nvidia BERT
-  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
-  && cd DeepLearningExamples \
-  && git checkout cf54b787 \
-  && cd .. \
-  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
-  && rm -rf DeepLearningExamples \
-  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
-  # GPT2 fine-tuning
-  && cd huggingface-gpt2 \
-  && git clone https://github.com/huggingface/transformers.git \
-  && cd transformers \
-  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
-  && cd .. \
-  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
-  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
-  && cd ${WORKSPACE_DIR}/GPT2/transformers \
-  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
-  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
-  && python3 -m pip install --no-cache-dir -e . \
-  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
-  && python3 -m pip install --no-cache-dir cerberus sympy \
-  && cd .. \
-  # Dataset: the archive is deleted in the same layer once it has been extracted
-  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
-  && unzip ${GPT2_DATASET}-v1.zip \
-  && rm ${GPT2_DATASET}-v1.zip
-
-# Locations of the BERT and GPT2 workloads and of the GPT2 train/test token files
-ENV BERT_DIR=${WORKSPACE_DIR}/BERT \
-    GPT2_DIR=${WORKSPACE_DIR}/GPT2 \
-    TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens \
-    TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens
-
-# Additional Python packages; no versions are pinned
-RUN pip3 install --no-cache-dir GPUtil azureml azureml-core datasets tokenizers ninja cerberus sympy sacremoses sacrebleu
-
-# Huggingface Examples
-WORKDIR $GITHUB_DIR
-# Clones into $GITHUB_DIR/huggingface-transformers; no branch or commit is pinned
-RUN git clone https://github.com/microsoft/huggingface-transformers.git
-
-# Enable ssh access without password needed.
-# All four sshd_config substitutions are applied by a single sed invocation.
-RUN sed -i \
-        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
-        -e 's/#StrictModes yes/StrictModes no/g' \
-        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
-        -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
-        /etc/ssh/sshd_config
-
-# Start or Restart sshd service, then hand the container over to an interactive bash.
-# Exec form plus `exec` lets the final bash replace the launcher shell instead of
-# running as its child, so signals sent to the container reach it directly.
-ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
-
-# Add model and scripts
-# This ADD line is kept verbatim: model/readme.txt tells users to comment out
-# exactly this line when they do not want a model baked into the image.
-ADD model ${WORKSPACE_DIR}/model
-# Plain directory copy from the build context, so COPY is sufficient.
-COPY script ${WORKSPACE_DIR}/script
-RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
-
-# add locale en_US.UTF-8
-# The package index is refreshed in the same layer as the install so the step
-# does not depend on an index cached by an earlier layer, and the index is
-# dropped again afterwards to keep the layer small.
-RUN apt-get update \
-    && apt-get install -y locales \
-    && locale-gen en_US.UTF-8 \
-    && rm -rf /var/lib/apt/lists/*
-
-# Work around an issue in the AMD compiler, which generates poor GPU ISA
-# when the type of a kernel parameter is a structure and "pass-by-value" is used
-# ENV HSA_NO_SCRATCH_RECLAIM=1
-
-# Distributed training related environment variables
-ENV HSA_FORCE_FINE_GRAIN_PCIE=1
-# The ENV lines below are commented out, so these variables are not set in the
-# image; uncomment a line to bake that setting in.
-# ENV NCCL_DEBUG=INFO
-# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
-# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
-
-# Default working directory: the script directory populated from the build context
-WORKDIR ${WORKSPACE_DIR}/script
--- a/orttraining/tools/amdgpu/model/readme.txt
+++ b/orttraining/tools/amdgpu/model/readme.txt
@ -1 +0,0 @@
-The BERT-L ONNX file is very large, so it is not uploaded to GitHub. If you want to include the BERT-L ONNX model in your Docker image, copy it into this folder. Otherwise, comment out the line "ADD model ${WORKSPACE_DIR}/model" in the Dockerfile so that the model is not added to the Docker image.
				`@ -1 +0,0 @@`
				`Since BERT-L ONNX file is very big, it is not uploaded to github. So, if you want to build BERT-L ONNX model in your docker image, then you need to copy it to this folder. Otherwise, you need to comment out the line "ADD model ${WORKSPACE_DIR}/model" in the Dockerfile not to add model to Docker image.`