Merge pull request #390 from Microsoft/xiongyf/mpi-dev

Support OpenMPI
Yifan Xiong 2018-03-30 14:41:35 +08:00 committed by GitHub
Parent b228ebf424 ae159acfe7
Commit ed3d49a269
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 179 additions and 35 deletions

View file

@@ -0,0 +1,48 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# tag: pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
#
# Mpi image to build for the system.
# Before building this image you need to build the base image first:
#
# docker build -f Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
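#
# Then build this image on top of it. A sketch of the build command, assuming this
# file is named Dockerfile.build.mpi (name inferred from the naming pattern above)
# and tagged as in the header comment:
#
# docker build -f Dockerfile.build.mpi -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .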
FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
ENV OPENMPI_VERSION=1.10.4
WORKDIR /
# Install Open MPI
RUN OPENMPI_SHA1="84d035e7ab1572e5ebc086049f05b694d2158844" && \
wget -q https://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-${OPENMPI_VERSION}.tar.gz && \
echo "$OPENMPI_SHA1 openmpi-${OPENMPI_VERSION}.tar.gz" | sha1sum --check --strict - && \
tar -xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/mpi --enable-mpirun-prefix-by-default && \
make -j $(nproc) install && \
cd .. && \
rm -rf openmpi-${OPENMPI_VERSION} && \
rm -rf openmpi-${OPENMPI_VERSION}.tar.gz
ENV PATH=/usr/local/mpi/bin:$PATH \
LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
WORKDIR /root

View file

@@ -23,7 +23,7 @@
# docker build -f Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
FROM pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
ENV CNTK_VERSION=2.0.beta11.0
@@ -49,19 +49,6 @@ RUN git clone --recursive https://github.com/Microsoft/hdfs-mount.git && \
cd .. && \
rm -rf hdfs-mount
# Install Open MPI
RUN OPENMPI_VERSION=1.10.4 && \
OPENMPI_SHA1="84d035e7ab1572e5ebc086049f05b694d2158844" && \
wget -q https://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-${OPENMPI_VERSION}.tar.gz && \
echo "$OPENMPI_SHA1 openmpi-${OPENMPI_VERSION}.tar.gz" | sha1sum --check --strict - && \
tar -xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/mpi && \
make -j $(nproc) install && \
cd .. && \
rm -rf openmpi-${OPENMPI_VERSION} && \
rm -rf openmpi-${OPENMPI_VERSION}.tar.gz
# Install Anaconda
RUN ANACONDA_PREFIX="/root/anaconda3" && \
ANACONDA_VERSION="3-4.1.1" && \
@@ -83,7 +70,7 @@ RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict - && \
tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh
wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh \
-O /cntk/Scripts/install/linux/install-cntk-docker.sh && \
/bin/bash /cntk/Scripts/install/linux/install-cntk-docker.sh && \
/root/anaconda3/bin/conda clean --all --yes && \

View file

@@ -0,0 +1,32 @@
{
"jobName": "cntk-distributed-jobguid",
"image": "pai.run.cntk",
// prepare cmudict corpus in CNTK format https://git.io/vbT5A and upload to hdfs
"dataDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/data",
// make a new dir for output on hdfs
"outputDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/output",
// prepare g2p distributed training script cntk-distributed-example.sh and upload to hdfs
"codeDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/code",
"taskRoles": [
{
"name": "mpi",
"taskNumber": 1,
"cpuNumber": 8,
"memoryMB": 16384,
"gpuNumber": 0,
"command": "cd code && mpirun --allow-run-as-root -np 2 --host worker-0,worker-1 /bin/bash cntk-distributed-example.sh"
},
{
"name": "worker",
"taskNumber": 2,
"cpuNumber": 8,
"memoryMB": 16384,
"gpuNumber": 2,
"command": "/bin/bash"
}
],
"killAllOnCompletedTaskNumber": 1,
"retryCount": 0
}
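The comments in the config above assume the data, code, and output locations already exist on HDFS. A minimal preparation sketch, assuming a working hdfs client and the placeholder paths from this config; the local file names are illustrative, with the corpus and BrainScript config coming from the CNTK CMUDict example referenced in the training script below:

hdfs dfs -mkdir -p /path/cntk-distributed-jobguid/data /path/cntk-distributed-jobguid/code /path/cntk-distributed-jobguid/output
hdfs dfs -put cmudict-*.ctf /path/cntk-distributed-jobguid/data/                      # cmudict corpus converted to CNTK text format
hdfs dfs -put cntk-distributed-example.sh g2p-distributed.cntk /path/cntk-distributed-jobguid/code/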

View file

@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Example script for distributed CNTK job
trap "kill 0" EXIT
# hdfs address in IP:PORT format
hdfs_addr=$(sed -e "s@hdfs://@@g" <<< $PAI_DEFAULT_FS_URI)
# hdfs mount point
mnt_point=/mnt/hdfs
# mount hdfs as a local file system
mkdir -p $mnt_point
hdfs-mount $hdfs_addr $mnt_point &
export DATA_DIR=$(sed -e "s@$PAI_DEFAULT_FS_URI@$mnt_point@g" <<< $PAI_DATA_DIR)
export OUTPUT_DIR=$(sed -e "s@$PAI_DEFAULT_FS_URI@$mnt_point@g" <<< $PAI_OUTPUT_DIR)
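# To make the rewriting above concrete (illustrative values only; the namenode
# address is hypothetical): with
#   PAI_DEFAULT_FS_URI=hdfs://10.0.0.1:9000
#   PAI_DATA_DIR=hdfs://10.0.0.1:9000/path/cntk-distributed-jobguid/data
# the substitutions yield
#   hdfs_addr=10.0.0.1:9000
#   DATA_DIR=/mnt/hdfs/path/cntk-distributed-jobguid/data
#   OUTPUT_DIR=/mnt/hdfs/path/cntk-distributed-jobguid/output
# so CNTK reads the corpus and writes its output through the mounted file system
# instead of talking to HDFS directly.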
# prepare CNTK distributed BrainScript and upload to hdfs
# please refer to CNTK G2P example and brainscript parallel training docs for details
# https://github.com/Microsoft/CNTK/tree/master/Examples/SequenceToSequence/CMUDict/BrainScript
# https://docs.microsoft.com/en-us/cognitive-toolkit/Multiple-GPUs-and-machines#3-configuring-parallel-training-in-cntk-in-brainscript
cntk configFile=g2p-distributed.cntk parallelTrain=true DataDir=$DATA_DIR OutDir=$OUTPUT_DIR

View file

@@ -26,7 +26,7 @@ BASH_XTRACEFD=17
function exit_handler()
{
printf "%s %s\n" \
"[ERROR]" "EXIT signal received in docker container, exiting ..."
"[DEBUG]" "EXIT signal received in docker container, exiting ..."
set +x
exec 17>&-
hdfs dfs -put /tmp/pai_dockercontainer_$PAI_CONTAINER_ID.log \
@@ -130,15 +130,15 @@ export PAI_CURRENT_CONTAINER_PORT=$PAI_CONTAINER_HOST_PORT
function prepare_ssh()
{
mkdir /root/.ssh
sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
}
function start_ssh_service()
{
printf "%s %s\n" \
"[INFO]" "start ssh service"
"[INFO]" "start ssh service"
cat /root/.ssh/$APP_ID.pub >> /root/.ssh/authorized_keys
sed -i 's/Port.*/Port '$PAI_CONTAINER_SSH_PORT'/' /etc/ssh/sshd_config
echo "sshd:ALL" >> /etc/hosts.allow
@@ -147,18 +147,18 @@ function start_ssh_service()
function hdfs_upload_atomically()
{
printf "%s %s\n%s %s\n%s %s\n" \
printf "%s %s\n%s %s\n%s %s\n" \
"[INFO]" "upload ssh key to hdfs" \
"[INFO]" "destination path is ${2}" \
"[INFO]" "source path is ${1}"
tempFolder=${2}"_temp"
if hdfs dfs -test -d $tempFolder ; then
printf "%s %s\n" \
"[WARNING]" "$tempFolder already exists, overwriting..."
hdfs dfs -rm -r $tempFolder || exit 1
fi
hdfs dfs -put ${1} $tempFolder || exit 1
hdfs dfs -mv $tempFolder ${2} || exit 1
tempFolder=${2}"_temp"
if hdfs dfs -test -d $tempFolder ; then
printf "%s %s\n" \
"[WARNING]" "$tempFolder already exists, overwriting..."
hdfs dfs -rm -r $tempFolder || exit 1
fi
hdfs dfs -put ${1} $tempFolder || exit 1
hdfs dfs -mv $tempFolder ${2} || exit 1
}
# Start sshd in docker container
@@ -183,18 +183,53 @@ else
sleep 10
done
printf "%s %s\n%s %s\n" \
"[INFO]" "ssh key pair ready ..." \
"[INFO]" "begin to download ssh key pair from hdfs ..."
"[INFO]" "ssh key pair ready ..." \
"[INFO]" "begin to download ssh key pair from hdfs ..."
hdfs dfs -get "${hdfs_ssh_folder}/.ssh/" "/root/" || exit 1
fi
# Start ssh service
start_ssh_service
chmod 400 ~/.ssh/$APP_ID
# Generate ssh connect info file in "PAI_CONTAINER_ID-PAI_CURRENT_CONTAINER_IP-PAI_CONTAINER_SSH_PORT" format on hdfs
hdfs dfs -touchz ${hdfs_ssh_folder}/$PAI_CONTAINER_ID-$PAI_CONTAINER_HOST_IP-$PAI_CONTAINER_SSH_PORT || exit 1
# Write env to bashrc
env | sed "s/^/export /" >> ~/.bashrc
# Generate ssh config
ssh_config_path=${HDFS_LAUNCHER_PREFIX}/${PAI_USER_NAME}/${PAI_JOB_NAME}/ssh/config
hdfs dfs -mkdir -p ${ssh_config_path} || exit 1
hdfs dfs -touchz ${ssh_config_path}/$APP_ID+$PAI_CURRENT_TASK_ROLE_NAME+$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX+$PAI_CONTAINER_HOST_IP+$PAI_CONTAINER_SSH_PORT || exit 1
while [ `hdfs dfs -ls $ssh_config_path | grep "/$PAI_JOB_NAME/ssh/config/$APP_ID+" | wc -l` -lt $PAI_JOB_TASK_COUNT ]; do
printf "%s %s\n" "[INFO]" "Waiting for ssh service in other containers ..."
sleep 10
done
NodeList=($(hdfs dfs -ls ${ssh_config_path} \
| grep "/$PAI_JOB_NAME/ssh/config/$APP_ID+" \
| grep -oE "[^/]+$" \
| sed -e "s/^$APP_ID+//g" \
| sort -n))
if [ "${#NodeList[@]}" -ne $PAI_JOB_TASK_COUNT ]; then
printf "%s %s\n%s\n%s\n\n" \
"[ERROR]" "NodeList" \
"${NodeList[@]}" \
"ssh services in ${#NodeList[@]} containers are available, not equal to $PAI_JOB_TASK_COUNT, exit ..."
exit 2
fi
for line in "${NodeList[@]}"; do
node=(${line//+/ });
printf "%s\n %s\n %s\n %s\n %s\n %s\n %s\n" \
"Host ${node[0]}-${node[1]}" \
"HostName ${node[2]}" \
"Port ${node[3]}" \
"User root" \
"StrictHostKeyChecking no" \
"UserKnownHostsFile /dev/null" \
"IdentityFile /root/.ssh/$APP_ID" >> /root/.ssh/config
done
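# As an illustration, for the worker task with index 0 the loop above appends an
# entry of roughly this shape (IP address and port are hypothetical):
#   Host worker-0
#     HostName 192.168.1.10
#     Port 10022
#     User root
#     StrictHostKeyChecking no
#     UserKnownHostsFile /dev/null
#     IdentityFile /root/.ssh/$APP_ID
# which is what lets "mpirun --host worker-0,worker-1" in the mpi task reach each
# container by name.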
# Start ssh service
start_ssh_service
# Write env to system-wide environment
env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment
sleep 10
printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
{{{ taskData.command }}} || exit $?
printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"