Mirror of https://github.com/microsoft/pai.git
Commit
ed3d49a269
@@ -0,0 +1,48 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# tag: pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+#
+# MPI image to build for the system.
+# Before building this image you need to build the base image first:
+#
+# docker build -f Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ENV OPENMPI_VERSION=1.10.4
+
+WORKDIR /
+
+# Install Open MPI
+RUN OPENMPI_SHA1="84d035e7ab1572e5ebc086049f05b694d2158844" && \
+    wget -q https://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-${OPENMPI_VERSION}.tar.gz && \
+    echo "$OPENMPI_SHA1 openmpi-${OPENMPI_VERSION}.tar.gz" | sha1sum --check --strict - && \
+    tar -xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
+    cd openmpi-${OPENMPI_VERSION} && \
+    ./configure --prefix=/usr/local/mpi --enable-mpirun-prefix-by-default && \
+    make -j $(nproc) install && \
+    cd .. && \
+    rm -rf openmpi-${OPENMPI_VERSION} && \
+    rm -rf openmpi-${OPENMPI_VERSION}.tar.gz
+
+ENV PATH=/usr/local/mpi/bin:$PATH \
+    LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
+
+WORKDIR /root
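
Note: a minimal build sketch for the image above, assuming this file is saved as Dockerfile.build.mpi (the filename is not shown in this view) and that pai.build.base has already been built as the header comment describes:

    # hypothetical invocation; the tag is taken from the "tag:" comment above
    docker build -f Dockerfile.build.mpi \
        -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .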

@@ -23,7 +23,7 @@
 # docker build -f Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
 
 
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+FROM pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
 
 ENV CNTK_VERSION=2.0.beta11.0
 
@@ -49,19 +49,6 @@ RUN git clone --recursive https://github.com/Microsoft/hdfs-mount.git && \
     cd .. && \
     rm -rf hdfs-mount
 
-# Install Open MPI
-RUN OPENMPI_VERSION=1.10.4 && \
-    OPENMPI_SHA1="84d035e7ab1572e5ebc086049f05b694d2158844" && \
-    wget -q https://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-${OPENMPI_VERSION}.tar.gz && \
-    echo "$OPENMPI_SHA1 openmpi-${OPENMPI_VERSION}.tar.gz" | sha1sum --check --strict - && \
-    tar -xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
-    cd openmpi-${OPENMPI_VERSION} && \
-    ./configure --prefix=/usr/local/mpi && \
-    make -j $(nproc) install && \
-    cd .. && \
-    rm -rf openmpi-${OPENMPI_VERSION} && \
-    rm -rf openmpi-${OPENMPI_VERSION}.tar.gz
-
 # Install Anaconda
 RUN ANACONDA_PREFIX="/root/anaconda3" && \
     ANACONDA_VERSION="3-4.1.1" && \
@@ -83,7 +70,7 @@ RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
     echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict - && \
     tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
     rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
-    wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh
+    wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh \
         -O /cntk/Scripts/install/linux/install-cntk-docker.sh && \
     /bin/bash /cntk/Scripts/install/linux/install-cntk-docker.sh && \
     /root/anaconda3/bin/conda clean --all --yes && \
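
Note: with this change the CNTK build image derives from pai.build.mpi instead of pai.build.base, so the MPI image must now be built in between. A build-order sketch, assuming the CNTK file is named Dockerfile.build.cntk (a hypothetical name; it is not shown in this view):

    docker build -f Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
    docker build -f Dockerfile.build.mpi -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
    docker build -f Dockerfile.build.cntk -t pai.build.cntk .   # hypothetical file and tag names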
@@ -0,0 +1,32 @@
+{
+  "jobName": "cntk-distributed-jobguid",
+  "image": "pai.run.cntk",
+
+  // prepare cmudict corpus in CNTK format https://git.io/vbT5A and upload to hdfs
+  "dataDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/data",
+  // make a new dir for output on hdfs
+  "outputDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/output",
+  // prepare g2p distributed training script cntk-distributed-example.sh and upload to hdfs
+  "codeDir": "$PAI_DEFAULT_FS_URI/path/cntk-distributed-jobguid/code",
+
+  "taskRoles": [
+    {
+      "name": "mpi",
+      "taskNumber": 1,
+      "cpuNumber": 8,
+      "memoryMB": 16384,
+      "gpuNumber": 0,
+      "command": "cd code && mpirun --allow-run-as-root -np 2 --host worker-0,worker-1 /bin/bash cntk-distributed-example.sh"
+    },
+    {
+      "name": "worker",
+      "taskNumber": 2,
+      "cpuNumber": 8,
+      "memoryMB": 16384,
+      "gpuNumber": 2,
+      "command": "/bin/bash"
+    }
+  ],
+  "killAllOnCompletedTaskNumber": 1,
+  "retryCount": 0
+}
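
Note: in this job the single mpi task is only a launcher (gpuNumber 0) while the two worker tasks hold the GPUs; mpirun -np 2 --host worker-0,worker-1 starts one process per worker over ssh, and those host names resolve through the ssh client config generated by the bootstrap script changed later in this commit. A hedged submission sketch, assuming the config above is saved as cntk-distributed-example.json and that the REST server exposes POST /api/v1/jobs on port 9186 (endpoint, port, and auth are assumptions, not shown in this view):

    # hypothetical: adjust master address, port, and token to your deployment
    curl -X POST http://<pai-master>:9186/api/v1/jobs \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer <token>" \
        -d @cntk-distributed-example.json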
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# Example script for distributed CNTK job
+
+trap "kill 0" EXIT
+
+# hdfs address in IP:PORT format
+hdfs_addr=$(sed -e "s@hdfs://@@g" <<< $PAI_DEFAULT_FS_URI)
+
+# hdfs mount point
+mnt_point=/mnt/hdfs
+
+# mount hdfs as a local file system
+mkdir -p $mnt_point
+hdfs-mount $hdfs_addr $mnt_point &
+export DATA_DIR=$(sed -e "s@$PAI_DEFAULT_FS_URI@$mnt_point@g" <<< $PAI_DATA_DIR)
+export OUTPUT_DIR=$(sed -e "s@$PAI_DEFAULT_FS_URI@$mnt_point@g" <<< $PAI_OUTPUT_DIR)
+
+
+# prepare CNTK distributed BrainScript and upload to hdfs
+# please refer to CNTK G2P example and brainscript parallel training docs for details
+# https://github.com/Microsoft/CNTK/tree/master/Examples/SequenceToSequence/CMUDict/BrainScript
+# https://docs.microsoft.com/en-us/cognitive-toolkit/Multiple-GPUs-and-machines#3-configuring-parallel-training-in-cntk-in-brainscript
+cntk configFile=g2p-distributed.cntk parallelTrain=true DataDir=$DATA_DIR OutDir=$OUTPUT_DIR
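
Note: a worked example of the path rewriting above, using illustrative values for the launcher-provided variables (both values are hypothetical):

    # suppose the launcher exported:
    #   PAI_DEFAULT_FS_URI=hdfs://10.0.0.1:9000
    #   PAI_DATA_DIR=hdfs://10.0.0.1:9000/path/cntk-distributed-jobguid/data
    # then hdfs_addr becomes 10.0.0.1:9000 and DATA_DIR becomes
    #   /mnt/hdfs/path/cntk-distributed-jobguid/data
    # i.e. the same HDFS directory, now readable as a local path through the hdfs-mount FUSE mount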
@@ -26,7 +26,7 @@ BASH_XTRACEFD=17
 function exit_handler()
 {
   printf "%s %s\n" \
-    "[ERROR]" "EXIT signal received in docker container, exiting ..."
+    "[DEBUG]" "EXIT signal received in docker container, exiting ..."
   set +x
   exec 17>&-
   hdfs dfs -put /tmp/pai_dockercontainer_$PAI_CONTAINER_ID.log \
@@ -130,15 +130,15 @@ export PAI_CURRENT_CONTAINER_PORT=$PAI_CONTAINER_HOST_PORT
 
 function prepare_ssh()
 {
-mkdir /root/.ssh
-sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
-sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+  mkdir /root/.ssh
+  sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
+  sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
 }
 
 function start_ssh_service()
 {
   printf "%s %s\n" \
-"[INFO]" "start ssh service"
+    "[INFO]" "start ssh service"
   cat /root/.ssh/$APP_ID.pub >> /root/.ssh/authorized_keys
   sed -i 's/Port.*/Port '$PAI_CONTAINER_SSH_PORT'/' /etc/ssh/sshd_config
   echo "sshd:ALL" >> /etc/hosts.allow
@@ -147,18 +147,18 @@ function start_ssh_service()
 
 function hdfs_upload_atomically()
 {
-printf "%s %s\n%s %s\n%s %s\n" \
+  printf "%s %s\n%s %s\n%s %s\n" \
     "[INFO]" "upload ssh key to hdfs" \
     "[INFO]" "destination path is ${2}" \
     "[INFO]" "source path is ${1}"
-tempFolder=${2}"_temp"
-if hdfs dfs -test -d $tempFolder ; then
-printf "%s %s\n" \
-"[WARNING]" "$tempFolder already exists, overwriting..."
-hdfs dfs -rm -r $tempFolder || exit 1
-fi
-hdfs dfs -put ${1} $tempFolder || exit 1
-hdfs dfs -mv $tempFolder ${2} || exit 1
+  tempFolder=${2}"_temp"
+  if hdfs dfs -test -d $tempFolder ; then
+    printf "%s %s\n" \
+      "[WARNING]" "$tempFolder already exists, overwriting..."
+    hdfs dfs -rm -r $tempFolder || exit 1
+  fi
+  hdfs dfs -put ${1} $tempFolder || exit 1
+  hdfs dfs -mv $tempFolder ${2} || exit 1
 }
 
 # Start sshd in docker container
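
Note: the helper above is atomic from the reader's point of view because the upload lands in a temporary folder and only the final hdfs dfs -mv exposes it at the destination, so a container polling the destination never observes a half-uploaded folder. A hedged usage sketch (the actual call site is not part of this hunk):

    # hypothetical invocation: publish this container's ssh key folder
    hdfs_upload_atomically "/root/.ssh" "${hdfs_ssh_folder}/.ssh"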
@@ -183,18 +183,53 @@ else
       sleep 10
     done
     printf "%s %s\n%s %s\n" \
-"[INFO]" "ssh key pair ready ..." \
-"[INFO]" "begin to download ssh key pair from hdfs ..."
+      "[INFO]" "ssh key pair ready ..." \
+      "[INFO]" "begin to download ssh key pair from hdfs ..."
     hdfs dfs -get "${hdfs_ssh_folder}/.ssh/" "/root/" || exit 1
 fi
-# Start ssh service
-start_ssh_service
+chmod 400 ~/.ssh/$APP_ID
+# Generate ssh connect info file in "PAI_CONTAINER_ID-PAI_CURRENT_CONTAINER_IP-PAI_CONTAINER_SSH_PORT" format on hdfs
+hdfs dfs -touchz ${hdfs_ssh_folder}/$PAI_CONTAINER_ID-$PAI_CONTAINER_HOST_IP-$PAI_CONTAINER_SSH_PORT || exit 1
+
+# Write env to bashrc
+env | sed "s/^/export /" >> ~/.bashrc
+# Generate ssh config
+ssh_config_path=${HDFS_LAUNCHER_PREFIX}/${PAI_USER_NAME}/${PAI_JOB_NAME}/ssh/config
+hdfs dfs -mkdir -p ${ssh_config_path} || exit 1
+hdfs dfs -touchz ${ssh_config_path}/$APP_ID+$PAI_CURRENT_TASK_ROLE_NAME+$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX+$PAI_CONTAINER_HOST_IP+$PAI_CONTAINER_SSH_PORT || exit 1
+while [ `hdfs dfs -ls $ssh_config_path | grep "/$PAI_JOB_NAME/ssh/config/$APP_ID+" | wc -l` -lt $PAI_JOB_TASK_COUNT ]; do
+  printf "%s %s\n" "[INFO]" "Waiting for ssh service in other containers ..."
+  sleep 10
+done
+NodeList=($(hdfs dfs -ls ${ssh_config_path} \
+  | grep "/$PAI_JOB_NAME/ssh/config/$APP_ID+" \
+  | grep -oE "[^/]+$" \
+  | sed -e "s/^$APP_ID+//g" \
+  | sort -n))
+if [ "${#NodeList[@]}" -ne $PAI_JOB_TASK_COUNT ]; then
+  printf "%s %s\n%s\n%s\n\n" \
+    "[ERROR]" "NodeList" \
+    "${NodeList[@]}" \
+    "ssh services in ${#NodeList[@]} containers are available, not equal to $PAI_JOB_TASK_COUNT, exit ..."
+  exit 2
+fi
+for line in "${NodeList[@]}"; do
+  node=(${line//+/ });
+  printf "%s\n %s\n %s\n %s\n %s\n %s\n %s\n" \
+    "Host ${node[0]}-${node[1]}" \
+    "HostName ${node[2]}" \
+    "Port ${node[3]}" \
+    "User root" \
+    "StrictHostKeyChecking no" \
+    "UserKnownHostsFile /dev/null" \
+    "IdentityFile /root/.ssh/$APP_ID" >> /root/.ssh/config
+done
+
+# Start ssh service
+start_ssh_service
 
 # Write env to system-wide environment
 env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment
 
 sleep 10
 printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
 {{{ taskData.command }}} || exit $?
 printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"
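
Note: for the example job in this commit, node[0] is the task role name and node[1] the task index, so the loop above writes one alias per task into /root/.ssh/config. A sketch of the entry for the first worker (IP, port, and APP_ID are illustrative):

    Host worker-0
     HostName 10.0.0.2
     Port 2201
     User root
     StrictHostKeyChecking no
     UserKnownHostsFile /dev/null
     IdentityFile /root/.ssh/<APP_ID>

A matching worker-1 entry follows, which is what lets the mpi task run mpirun --host worker-0,worker-1 without any /etc/hosts changes.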