Feature: nvBLAS and OpenBLAS plugin (#539)

* add openblas plugin, update gpu docker images with netlib-lgpl

* update images and plugins

* add nvblas plugin

* revert gpu docker image change, add -Pnetlib-lgpl to base images

* change configurations to functions, add plugins to cluster.yaml
This commit is contained in:
Jacob Freck 2018-05-15 17:47:41 -07:00 коммит произвёл GitHub
Родитель 94e551c8c3
Коммит 603a413d12
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
17 изменённых файлов: 164 добавлений и 43 удалений

Просмотреть файл

@ -23,6 +23,8 @@ class PluginManager:
hdfs=plugins.HDFSPlugin,
simple=plugins.SimplePlugin,
spark_ui_proxy=plugins.SparkUIProxyPlugin,
openblas=plugins.OpenBLASPlugin,
nvblas=plugins.NvBLASPlugin,
)
def __init__(self):

Просмотреть файл

@ -5,3 +5,5 @@ from .resource_monitor import ResourceMonitorPlugin
from .rstudio_server import RStudioServerPlugin
from .simple import SimplePlugin
from .spark_ui_proxy import SparkUIProxyPlugin
from .openblas import OpenBLASPlugin
from .nvblas import NvBLASPlugin

Просмотреть файл

@ -0,0 +1 @@
from .configuration import *

Просмотреть файл

@ -0,0 +1,18 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
def NvBLASPlugin():
    """Build the plugin configuration for NVIDIA nvBLAS.

    Returns a ``PluginConfiguration`` that runs ``nvblas.sh`` on every node
    (``PluginTargetRole.All``); the script installs a CPU BLAS fallback and
    configures nvBLAS so BLAS calls can be offloaded to the GPU.

    Returns:
        PluginConfiguration: configuration named ``"nvblas"`` with no
        forwarded ports and ``nvblas.sh`` as its only plugin file.
    """
    return PluginConfiguration(
        name="nvblas",
        ports=[],
        target_role=PluginTargetRole.All,
        execute="nvblas.sh",
        files=[
            # Resolve the script relative to this module so the plugin works
            # regardless of the caller's working directory.
            PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh")),
        ],  # trailing comma added for consistency with OpenBLASPlugin
    )

Просмотреть файл

@ -0,0 +1,65 @@
#!/bin/bash
# Provision NVIDIA nvBLAS on a node: install a CPU BLAS/LAPACK fallback,
# write the nvblas.conf configuration file, and preload libnvblas so that
# dynamically linked BLAS calls are intercepted and offloaded to the GPU.
#
# NOTE(review): `update-alternatives --config` is an interactive prompt that
# reads from stdin; in unattended provisioning confirm it selects the intended
# alternative (consider `--auto` or `--set`). Also note the `&&` chain ends at
# the first `update-alternatives` line, so the second one runs unconditionally.
apt-get update &&
apt-get install -y libblas-dev liblapack-dev &&
update-alternatives --config libblas.so.3
update-alternatives --config liblapack.so.3
# Point nvBLAS at its config file for this shell, and persist for future
# logins via ~/.bashrc.
export NVBLAS_CONFIG_FILE=/usr/local/cuda/lib64/nvblas.conf
echo "export NVBLAS_CONFIG_FILE=/usr/local/cuda/lib64/nvblas.conf" >> ~/.bashrc
# Write the nvBLAS configuration file. The quoted text below is the file's
# own content (including its internal comments) and must not be edited here.
echo '# This is the configuration file to use NVBLAS Library
# Setup the environment variable NVBLAS_CONFIG_FILE to specify your own config file.
# By default, if NVBLAS_CONFIG_FILE is not defined,
# NVBLAS Library will try to open the file "nvblas.conf" in its current directory
# Example : NVBLAS_CONFIG_FILE  /home/cuda_user/my_nvblas.conf
# The config file should have restricted write permissions accesses
# Specify which output log file (default is stderr)
NVBLAS_LOGFILE /root/nvblas.log
# Enable trace log of every intercepted BLAS calls
NVBLAS_TRACE_LOG_ENABLED
#Put here the CPU BLAS fallback Library of your choice
#It is strongly advised to use full path to describe the location of the CPU Library
NVBLAS_CPU_BLAS_LIB /usr/lib/libblas.so
# List of GPU devices Id to participate to the computation
# Use ALL if you want all your GPUs to contribute
# Use ALL0, if you want all your GPUs of the same type as device 0 to contribute
# However, NVBLAS consider that all GPU have the same performance and PCI bandwidth
# By default if no GPU are listed, only device 0 will be used
#NVBLAS_GPU_LIST 0 2 4
#NVBLAS_GPU_LIST ALL
NVBLAS_GPU_LIST ALL0
# Tile Dimension
NVBLAS_TILE_DIM 2048
# Autopin Memory
NVBLAS_AUTOPIN_MEM_ENABLED
#List of BLAS routines that are prevented from running on GPU (use for debugging purpose
# The current list of BLAS routines supported by NVBLAS are
# GEMM, SYRK, HERK, TRSM, TRMM, SYMM, HEMM, SYR2K, HER2K
#NVBLAS_GPU_DISABLED_SGEMM
#NVBLAS_GPU_DISABLED_DGEMM
#NVBLAS_GPU_DISABLED_CGEMM
#NVBLAS_GPU_DISABLED_ZGEMM
# Computation can be optionally hybridized between CPU and GPU
# By default, GPU-supported BLAS routines are ran fully on GPU
# The option NVBLAS_CPU_RATIO_<BLAS_ROUTINE> give the ratio [0,1]
# of the amount of computation that should be done on CPU
# CAUTION : this option should be used wisely because it can actually
# significantly reduced the overall performance if too much work is given to CPU
#NVBLAS_CPU_RATIO_CGEMM 0.07' > $NVBLAS_CONFIG_FILE
# Make the CPU BLAS and CUDA libraries resolvable at runtime, and preload
# libnvblas so it shims the BLAS symbols; both are persisted in ~/.bashrc so
# later interactive shells inherit the same environment.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/libblas:/usr/local/cuda/lib64
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/libblas:/usr/local/cuda/lib64" >> ~/.bashrc
export LD_PRELOAD=/usr/local/cuda/lib64/libnvblas.so
echo "export LD_PRELOAD=/usr/local/cuda/lib64/libnvblas.so" >> ~/.bashrc

Просмотреть файл

@ -0,0 +1 @@
from .configuration import *

Просмотреть файл

@ -0,0 +1,18 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
def OpenBLASPlugin():
    """Build the plugin configuration for OpenBLAS.

    Returns:
        PluginConfiguration: configuration named ``"openblas"`` that runs
        ``openblas.sh`` on all nodes and exposes no ports.
    """
    script_name = "openblas.sh"
    # The install script ships alongside this module, so locate it via the
    # module's own directory rather than the current working directory.
    script_file = PluginFile(script_name, os.path.join(dir_path, script_name))
    return PluginConfiguration(
        name="openblas",
        ports=[],
        target_role=PluginTargetRole.All,
        execute=script_name,
        files=[script_file],
    )

Просмотреть файл

@ -0,0 +1,4 @@
#!/bin/bash
# Install OpenBLAS and register it with the system's BLAS alternatives.
# NOTE(review): `update-alternatives --config` is an interactive prompt that
# reads from stdin; for unattended provisioning confirm this selects OpenBLAS
# as intended (non-interactive `--set` or `--auto` may be what is wanted).
apt-get update &&
apt-get install -y libopenblas-base &&
update-alternatives --config libblas.so.3

Просмотреть файл

@ -5,7 +5,7 @@
# Toolkit configuration [Required] You can use `aztk toolkit` command to find which are the available toolkits
toolkit:
software: spark
version: 2.2.0
version: 2.3.0
    # Which environment is needed for spark anaconda, r, miniconda
environment: {environment}
# Optional version for the environment
@ -16,7 +16,7 @@ toolkit:
# vm_size: <vm-size, see available options here: https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/>
vm_size: standard_a2
vm_size: standard_f2
# size: <number of dedicated nodes in the cluster, note that clusters must contain all dedicated or all low priority nodes>
size: 2
@ -39,11 +39,13 @@ username: spark
# Enable plugins
plugins:
# - name: spark_ui_proxy
# - name: jupyterlab
# - name: jupyter
# - name: hdfs
# - name: rstudio_server
# - name: spark_ui_proxy
# - name: openblas
# - name: nvblas
# Allow master node to also be a worker <true/false> (Default: true)
# worker_on_master: true

Просмотреть файл

@ -58,7 +58,7 @@ RUN apt-get clean \
&& cd spark \
&& git checkout tags/v${SPARK_VERSION_KEY} \
&& export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
&& ./make-distribution.sh --name custom-spark --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -Phadoop-2.6 -DskipTests \
&& ./make-distribution.sh --name custom-spark --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -Phadoop-2.6 -DskipTests \
&& tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
&& ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
&& rm -rf /spark \

Просмотреть файл

@ -58,7 +58,7 @@ RUN apt-get clean \
&& cd spark \
&& git checkout tags/v${SPARK_VERSION_KEY} \
&& export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
&& ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
&& rm -rf /spark \

Просмотреть файл

@ -57,7 +57,7 @@ RUN apt-get clean \
&& cd spark \
&& git checkout tags/v${SPARK_VERSION_KEY} \
&& export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
&& ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
&& rm -rf /spark \

Просмотреть файл

@ -58,7 +58,7 @@ RUN apt-get clean \
&& cd spark \
&& git checkout tags/v${SPARK_VERSION_KEY} \
&& export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
&& tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
&& ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
&& rm -rf /spark \

Просмотреть файл

@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark1.6.3-base
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
rm -rf /var/lib/apt/lists/* && \
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
# CUDA
ENV CUDA_VERSION 8.0.61
@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0"
# cuDNN
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
ENV CUDNN_VERSION 6.0.21
@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/
ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so
ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/
# # Tensorflow
# RUN pip install --upgrade tensorflow-gpu
WORKDIR $SPARK_HOME
CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
CMD ["/bin/bash"]

Просмотреть файл

@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.1.0-base
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
rm -rf /var/lib/apt/lists/* && \
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
# CUDA
ENV CUDA_VERSION 8.0.61
@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0"
# cuDNN
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
ENV CUDNN_VERSION 6.0.21
@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/
ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so
ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/
# # Tensorflow
# RUN pip install --upgrade tensorflow-gpu
WORKDIR $SPARK_HOME
CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
CMD ["/bin/bash"]

Просмотреть файл

@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.2.0-base
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
rm -rf /var/lib/apt/lists/* && \
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
# CUDA
ENV CUDA_VERSION 8.0.61
@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0"
# cuDNN
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
ENV CUDNN_VERSION 6.0.21
@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/
ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so
ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/
# # Tensorflow
# RUN pip install --upgrade tensorflow-gpu
WORKDIR $SPARK_HOME
CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
CMD ["/bin/bash"]

Просмотреть файл

@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.3.0-base
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
rm -rf /var/lib/apt/lists/* && \
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
# CUDA
ENV CUDA_VERSION 8.0.61
@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0"
# cuDNN
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
ENV CUDNN_VERSION 6.0.21
@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/
ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so
ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/
# # Tensorflow
# RUN pip install --upgrade tensorflow-gpu
WORKDIR $SPARK_HOME
CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
CMD ["/bin/bash"]