merge with master
This commit is contained in:
Коммит
94937857cf
|
@ -1,74 +1,6 @@
|
|||
# Ubuntu 16.04, CUDA 9.0
|
||||
FROM nvidia/cuda:9.0-runtime-ubuntu16.04
|
||||
FROM microsoft/cntk:2.5.1-gpu-python3.5-cuda9.0-cudnn7.0
|
||||
|
||||
# Version pins used by the instructions below: the Python interpreter packages,
# the cuDNN apt package and the CNTK wheel URL are all derived from these values.
ENV CNTK_VERSION=2.5.1 \
    CUDNN_VERSION=7.0.5.15-1+cuda9.0 \
    PYTHON_VERSION=3.5 \
    PY_VERSION=35 \
    NCCL_VERSION=2.1.15-1+cuda9.0
|
||||
|
||||
# Register NVIDIA's machine-learning apt repository (Ubuntu 16.04, x86_64) as an
# additional package source; presumably this is where the cuDNN/NCCL packages
# installed later come from -- confirm against the repository contents.
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
|
||||
|
||||
# Install the build toolchain, download tools, MPI/cuDNN/NCCL libraries, image
# codecs, SSH and the Python interpreter with its headers.
# cuDNN and NCCL are pinned through CUDNN_VERSION / NCCL_VERSION so rebuilds get
# the same library versions (libnccl2 and libnccl-dev must stay in lock-step).
# The apt package lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        sudo \
        git \
        curl \
        wget \
        ca-certificates \
        libopenmpi-dev \
        libcudnn7=$CUDNN_VERSION \
        libnccl2=$NCCL_VERSION \
        libnccl-dev=$NCCL_VERSION \
        libjpeg-dev \
        libpng-dev \
        ssh \
        python$PYTHON_VERSION \
        python$PYTHON_VERSION-dev \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Expose the versioned interpreter as plain `python`, which the following
# instructions invoke by that name.
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
|
||||
|
||||
# Bootstrap pip with the official get-pip.py script, then delete the script.
# -f makes curl fail on an HTTP error instead of saving the error page as
# get-pip.py, -L follows redirects, -sS keeps the log quiet but still shows errors.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN curl -fsSLO https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py --no-cache-dir && \
    rm get-pip.py
|
||||
|
||||
# Install Open MPI
|
||||
# Download and build Open MPI 3.0.0 from source in a scratch directory, install
# it, refresh the dynamic linker cache, and remove the build tree -- all in one
# layer so the sources and object files never persist in the image.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz \
 && tar -xzf openmpi-3.0.0.tar.gz \
 && cd openmpi-3.0.0 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
|
||||
|
||||
# Create a wrapper for OpenMPI to allow running as root by default.
# The real launcher is renamed to mpirun.real and a small bash script takes its
# place, forwarding all arguments with --allow-run-as-root prepended.
# The script exec's the real binary so no intermediate bash process remains:
# signals are delivered straight to mpirun and its exit status is returned as-is.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'exec mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun
|
||||
|
||||
# Configure OpenMPI with good defaults, corresponding to the command-line flags:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
# The three settings are appended, one per line, to the system-wide MCA parameter file.
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
        "btl_tcp_if_exclude = lo,docker0" \
    >> /usr/local/etc/openmpi-mca-params.conf
|
||||
|
||||
# Set default NCCL parameters by appending them to /etc/nccl.conf:
# NCCL_DEBUG=INFO and NCCL_SOCKET_IFNAME=^docker0.
RUN { \
        echo NCCL_DEBUG=INFO && \
        echo NCCL_SOCKET_IFNAME=^docker0; \
    } >> /etc/nccl.conf
|
||||
|
||||
# Install CNTK
|
||||
# Install the CNTK GPU wheel selected by CNTK_VERSION and PY_VERSION, together
# with the scientific-Python and Jupyter packages listed after it.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir https://cntk.ai/PythonWheel/GPU/cntk_gpu-$CNTK_VERSION-cp$PY_VERSION-cp${PY_VERSION}m-linux_x86_64.whl h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn
|
||||
|
||||
# Allow OpenSSH to talk to containers without asking for confirmation:
# copy the client config minus any existing StrictHostKeyChecking lines,
# append an explicit "no" setting, then swap the new file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
|
||||
|
||||
WORKDIR /root
|
||||
# Put the CNTK binaries and the cntk-py35 environment's bin directory ahead of
# the existing PATH. Uses the key=value form; the space-separated form is deprecated.
ENV PATH=/cntk/cntk/bin:/root/anaconda3/envs/cntk-py35/bin:$PATH
|
||||
# Prepend CNTK's library directories to the dynamic loader search path.
# Uses the key=value form; the space-separated form is deprecated.
ENV LD_LIBRARY_PATH=/cntk/cntk/lib:/cntk/cntk/dependencies/lib:$LD_LIBRARY_PATH
|
||||
|
||||
|
||||
|
|
|
@ -1,46 +1,9 @@
|
|||
define PROJECT_HELP_MSG
|
||||
Usage:
|
||||
make help show this message
|
||||
make build make CNTK image with Open MPI
|
||||
make run-mpi run training using Open MPI image
|
||||
make push push CNTK image with Open MPI
|
||||
endef
|
||||
export PROJECT_HELP_MSG
|
||||
|
||||
DATA_DIR:=/mnt/imagenet
|
||||
PWD:=$(shell pwd)
|
||||
FAKE:='False'
|
||||
FAKE_DATA_LENGTH:=1281167
|
||||
image-open:=hoaphumanoid/cntk:distributed
|
||||
open-path:=$(PWD)/Docker
|
||||
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py
|
||||
include ../include/build.mk
|
||||
|
||||
setup_volumes:=-v $(PWD)/src/execution:/mnt/script \
|
||||
-v $(DATA_DIR):/mnt/input \
|
||||
-v $(DATA_DIR)/temp/model:/mnt/model \
|
||||
-v $(DATA_DIR)/temp/output:/mnt/output
|
||||
|
||||
|
||||
setup_environment:=--env AZ_BATCHAI_INPUT_TRAIN='/mnt/input' \
|
||||
--env AZ_BATCHAI_INPUT_TEST='/mnt/input' \
|
||||
--env AZ_BATCHAI_OUTPUT_MODEL='/mnt/model' \
|
||||
--env AZ_BATCHAI_JOB_TEMP_DIR='/mnt/output'
|
||||
|
||||
name_prefix:=hoaphumanoid
|
||||
|
||||
define execute_mpi
|
||||
nvidia-docker run -it \
|
||||
$(setup_volumes) \
|
||||
$(setup_environment) \
|
||||
$(1) bash -c "mpirun -np 2 -H localhost:2 python /mnt/script/ImagenetEstimatorCNTK.py"
|
||||
endef
|
||||
|
||||
|
||||
help:
|
||||
echo "$$PROJECT_HELP_MSG" | less
|
||||
|
||||
build:
|
||||
docker build -t $(name_prefix)/cntk Docker
|
||||
|
||||
run-mpi:
|
||||
$(call execute_mpi, $(name_prefix)/cntk)
|
||||
|
||||
push:
|
||||
docker push $(name_prefix)/cntk
|
||||
|
||||
# None of these targets produce a file named after themselves; declare them all
# phony (run-mpi included) so a stray file with the same name cannot make a target look up to date.
.PHONY: help build run-mpi push
|
||||
|
|
|
@ -69,6 +69,13 @@ define execute
|
|||
$(1) bash -c "python $(2)"
|
||||
endef
|
||||
|
||||
# execute_jupyter: start a Jupyter notebook server inside the image given as $(1).
# Publishes container port 8888, sets the shared-memory size to 8g and applies the
# volume and environment settings from setup_volumes / setup_environment.
# The server binds to 0.0.0.0 explicitly: an unquoted "*" inside the bash -c string
# is subject to shell globbing and is not accepted as an address by every notebook
# release, whereas 0.0.0.0 listens on all interfaces unambiguously.
define execute_jupyter
 nvidia-docker run -p 8888:8888 -it \
 --shm-size="8g" \
 $(setup_volumes) \
 $(setup_environment) \
 $(1) bash -c "jupyter notebook --ip=0.0.0.0 --no-browser --allow-root"
endef
|
||||
|
||||
help:
|
||||
echo "$$PROJECT_HELP_MSG" | less
|
||||
|
@ -88,6 +95,9 @@ run-mpi-intel:
|
|||
run:
|
||||
$(call execute, $(image-open), $(script))
|
||||
|
||||
run-jupyter:
|
||||
$(call execute_jupyter, $(image-open))
|
||||
|
||||
push:
|
||||
docker push $(image-open)
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче