From 603a413d128e3a2e5e467a49cb493206c3e113fd Mon Sep 17 00:00:00 2001 From: Jacob Freck Date: Tue, 15 May 2018 17:47:41 -0700 Subject: [PATCH] Feature: nvBLAS and OpenBLAS plugin (#539) * add openblas plugin, update gpu docker images with netlib-lgpl * update images and plugins * add nvblas plugin * revert gpu docker image change, add -Pnetlib-lgpl to base images * change configurations to functions, add plugins to cluster.yaml --- .../models/plugins/internal/plugin_manager.py | 2 + aztk/spark/models/plugins/__init__.py | 2 + aztk/spark/models/plugins/nvblas/__init__.py | 1 + .../models/plugins/nvblas/configuration.py | 18 +++++ aztk/spark/models/plugins/nvblas/nvblas.sh | 65 +++++++++++++++++++ .../spark/models/plugins/openblas/__init__.py | 1 + .../models/plugins/openblas/configuration.py | 18 +++++ .../spark/models/plugins/openblas/openblas.sh | 4 ++ aztk_cli/config/cluster.yaml | 8 ++- docker-image/base/spark1.6.3/Dockerfile | 2 +- docker-image/base/spark2.1.0/Dockerfile | 2 +- docker-image/base/spark2.2.0/Dockerfile | 2 +- docker-image/base/spark2.3.0/Dockerfile | 2 +- docker-image/gpu/spark1.6.3/Dockerfile | 20 +++--- docker-image/gpu/spark2.1.0/Dockerfile | 20 +++--- docker-image/gpu/spark2.2.0/Dockerfile | 20 +++--- docker-image/gpu/spark2.3.0/Dockerfile | 20 +++--- 17 files changed, 164 insertions(+), 43 deletions(-) create mode 100644 aztk/spark/models/plugins/nvblas/__init__.py create mode 100644 aztk/spark/models/plugins/nvblas/configuration.py create mode 100644 aztk/spark/models/plugins/nvblas/nvblas.sh create mode 100644 aztk/spark/models/plugins/openblas/__init__.py create mode 100644 aztk/spark/models/plugins/openblas/configuration.py create mode 100644 aztk/spark/models/plugins/openblas/openblas.sh diff --git a/aztk/models/plugins/internal/plugin_manager.py b/aztk/models/plugins/internal/plugin_manager.py index f014ab19..8789dab1 100644 --- a/aztk/models/plugins/internal/plugin_manager.py +++ b/aztk/models/plugins/internal/plugin_manager.py @@ 
-23,6 +23,8 @@ class PluginManager: hdfs=plugins.HDFSPlugin, simple=plugins.SimplePlugin, spark_ui_proxy=plugins.SparkUIProxyPlugin, + openblas=plugins.OpenBLASPlugin, + nvblas=plugins.NvBLASPlugin, ) def __init__(self): diff --git a/aztk/spark/models/plugins/__init__.py b/aztk/spark/models/plugins/__init__.py index b574985c..a67db47f 100644 --- a/aztk/spark/models/plugins/__init__.py +++ b/aztk/spark/models/plugins/__init__.py @@ -5,3 +5,5 @@ from .rstudio_server import RStudioServerPlugin from .simple import SimplePlugin from .spark_ui_proxy import SparkUIProxyPlugin +from .openblas import OpenBLASPlugin +from .nvblas import NvBLASPlugin diff --git a/aztk/spark/models/plugins/nvblas/__init__.py b/aztk/spark/models/plugins/nvblas/__init__.py new file mode 100644 index 00000000..2ec26f31 --- /dev/null +++ b/aztk/spark/models/plugins/nvblas/__init__.py @@ -0,0 +1 @@ +from .configuration import * diff --git a/aztk/spark/models/plugins/nvblas/configuration.py b/aztk/spark/models/plugins/nvblas/configuration.py new file mode 100644 index 00000000..40af5003 --- /dev/null +++ b/aztk/spark/models/plugins/nvblas/configuration.py @@ -0,0 +1,18 @@ +import os +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_file import PluginFile +from aztk.utils import constants + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def NvBLASPlugin(): + return PluginConfiguration( + name="nvblas", + ports=[], + target_role=PluginTargetRole.All, + execute="nvblas.sh", + files=[ + PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh")), + ] + ) diff --git a/aztk/spark/models/plugins/nvblas/nvblas.sh b/aztk/spark/models/plugins/nvblas/nvblas.sh new file mode 100644 index 00000000..c4f4f908 --- /dev/null +++ b/aztk/spark/models/plugins/nvblas/nvblas.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +apt-get update && +apt-get install -y libblas-dev liblapack-dev && +update-alternatives --config libblas.so.3 
+update-alternatives --config liblapack.so.3 + +export NVBLAS_CONFIG_FILE=/usr/local/cuda/lib64/nvblas.conf +echo "export NVBLAS_CONFIG_FILE=/usr/local/cuda/lib64/nvblas.conf" >> ~/.bashrc + +echo '# This is the configuration file to use NVBLAS Library +# Setup the environment variable NVBLAS_CONFIG_FILE to specify your own config file. +# By default, if NVBLAS_CONFIG_FILE is not defined, +# NVBLAS Library will try to open the file "nvblas.conf" in its current directory +# Example : NVBLAS_CONFIG_FILE /home/cuda_user/my_nvblas.conf +# The config file should have restricted write permissions accesses + +# Specify which output log file (default is stderr) +NVBLAS_LOGFILE /root/nvblas.log + +# Enable trace log of every intercepted BLAS calls +NVBLAS_TRACE_LOG_ENABLED + +#Put here the CPU BLAS fallback Library of your choice +#It is strongly advised to use full path to describe the location of the CPU Library +NVBLAS_CPU_BLAS_LIB /usr/lib/libblas.so + +# List of GPU devices Id to participate to the computation +# Use ALL if you want all your GPUs to contribute +# Use ALL0, if you want all your GPUs of the same type as device 0 to contribute +# However, NVBLAS consider that all GPU have the same performance and PCI bandwidth +# By default if no GPU are listed, only device 0 will be used + +#NVBLAS_GPU_LIST 0 2 4 +#NVBLAS_GPU_LIST ALL +NVBLAS_GPU_LIST ALL0 + +# Tile Dimension +NVBLAS_TILE_DIM 2048 + +# Autopin Memory +NVBLAS_AUTOPIN_MEM_ENABLED + +#List of BLAS routines that are prevented from running on GPU (use for debugging purpose +# The current list of BLAS routines supported by NVBLAS are +# GEMM, SYRK, HERK, TRSM, TRMM, SYMM, HEMM, SYR2K, HER2K + +#NVBLAS_GPU_DISABLED_SGEMM +#NVBLAS_GPU_DISABLED_DGEMM +#NVBLAS_GPU_DISABLED_CGEMM +#NVBLAS_GPU_DISABLED_ZGEMM + +# Computation can be optionally hybridized between CPU and GPU +# By default, GPU-supported BLAS routines are ran fully on GPU +# The option NVBLAS_CPU_RATIO_ give the ratio [0,1] +# of the amount of 
computation that should be done on CPU +# CAUTION : this option should be used wisely because it can actually +# significantly reduced the overall performance if too much work is given to CPU + +#NVBLAS_CPU_RATIO_CGEMM 0.07' > $NVBLAS_CONFIG_FILE + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/libblas:/usr/local/cuda/lib64 +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/libblas:/usr/local/cuda/lib64" >> ~/.bashrc +export LD_PRELOAD=/usr/local/cuda/lib64/libnvblas.so +echo "export LD_PRELOAD=/usr/local/cuda/lib64/libnvblas.so" >> ~/.bashrc diff --git a/aztk/spark/models/plugins/openblas/__init__.py b/aztk/spark/models/plugins/openblas/__init__.py new file mode 100644 index 00000000..2ec26f31 --- /dev/null +++ b/aztk/spark/models/plugins/openblas/__init__.py @@ -0,0 +1 @@ +from .configuration import * diff --git a/aztk/spark/models/plugins/openblas/configuration.py b/aztk/spark/models/plugins/openblas/configuration.py new file mode 100644 index 00000000..c83c96be --- /dev/null +++ b/aztk/spark/models/plugins/openblas/configuration.py @@ -0,0 +1,18 @@ +import os +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_file import PluginFile +from aztk.utils import constants + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def OpenBLASPlugin(): + return PluginConfiguration( + name="openblas", + ports=[], + target_role=PluginTargetRole.All, + execute="openblas.sh", + files=[ + PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh")), + ], + ) diff --git a/aztk/spark/models/plugins/openblas/openblas.sh b/aztk/spark/models/plugins/openblas/openblas.sh new file mode 100644 index 00000000..2b23840f --- /dev/null +++ b/aztk/spark/models/plugins/openblas/openblas.sh @@ -0,0 +1,4 @@ +#!/bin/bash +apt-get update && +apt-get install -y libopenblas-base && +update-alternatives --config libblas.so.3 diff --git a/aztk_cli/config/cluster.yaml 
b/aztk_cli/config/cluster.yaml index 0c1432a1..929f3688 100644 --- a/aztk_cli/config/cluster.yaml +++ b/aztk_cli/config/cluster.yaml @@ -5,7 +5,7 @@ # Toolkit configuration [Required] You can use `aztk toolkit` command to find which are the available tookits toolkit: software: spark - version: 2.2.0 + version: 2.3.0 # Which environemnt is needed for spark anaconda, r, miniconda environment: {environment} # Optional version for the environment @@ -16,7 +16,7 @@ toolkit: # vm_size: -vm_size: standard_a2 +vm_size: standard_f2 # size: size: 2 @@ -39,11 +39,13 @@ username: spark # Enable plugins plugins: - # - name: spark_ui_proxy # - name: jupyterlab # - name: jupyter # - name: hdfs # - name: rstudio_server + # - name: spark_ui_proxy + # - name: openblas + # - name: nvblas # Allow master node to also be a worker (Default: true) # worker_on_master: true diff --git a/docker-image/base/spark1.6.3/Dockerfile b/docker-image/base/spark1.6.3/Dockerfile index 9f92c4c1..53cdff0a 100644 --- a/docker-image/base/spark1.6.3/Dockerfile +++ b/docker-image/base/spark1.6.3/Dockerfile @@ -58,7 +58,7 @@ RUN apt-get clean \ && cd spark \ && git checkout tags/v${SPARK_VERSION_KEY} \ && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \ - && ./make-distribution.sh --name custom-spark --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -Phadoop-2.6 -DskipTests \ + && ./make-distribution.sh --name custom-spark --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -Phadoop-2.6 -DskipTests \ && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \ && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \ && rm -rf /spark \ diff --git a/docker-image/base/spark2.1.0/Dockerfile b/docker-image/base/spark2.1.0/Dockerfile index 9c93dcbc..1d20fc9c 100644 --- a/docker-image/base/spark2.1.0/Dockerfile +++ b/docker-image/base/spark2.1.0/Dockerfile @@ -58,7 +58,7 @@ RUN apt-get clean \ && cd spark 
\ && git checkout tags/v${SPARK_VERSION_KEY} \ && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \ - && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ + && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \ && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \ && rm -rf /spark \ diff --git a/docker-image/base/spark2.2.0/Dockerfile b/docker-image/base/spark2.2.0/Dockerfile index 56c397ac..eb8bbd24 100644 --- a/docker-image/base/spark2.2.0/Dockerfile +++ b/docker-image/base/spark2.2.0/Dockerfile @@ -57,7 +57,7 @@ RUN apt-get clean \ && cd spark \ && git checkout tags/v${SPARK_VERSION_KEY} \ && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \ - && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ + && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \ && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \ && rm -rf /spark \ diff --git a/docker-image/base/spark2.3.0/Dockerfile b/docker-image/base/spark2.3.0/Dockerfile index 9dd72d92..6b695521 100644 --- a/docker-image/base/spark2.3.0/Dockerfile +++ b/docker-image/base/spark2.3.0/Dockerfile @@ -58,7 +58,7 @@ RUN apt-get clean \ && cd spark \ && git checkout tags/v${SPARK_VERSION_KEY} \ && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \ - && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ + && 
./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \ && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \ && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \ && rm -rf /spark \ diff --git a/docker-image/gpu/spark1.6.3/Dockerfile b/docker-image/gpu/spark1.6.3/Dockerfile index 6fb73e39..540df37e 100644 --- a/docker-image/gpu/spark1.6.3/Dockerfile +++ b/docker-image/gpu/spark1.6.3/Dockerfile @@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark1.6.3-base LABEL com.nvidia.volumes.needed="nvidia_driver" -RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list # CUDA ENV CUDA_VERSION 8.0.61 @@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH} +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV 
NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" # cuDNN -RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list +RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list ENV CUDNN_VERSION 6.0.21 @@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/ ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/ -# # Tensorflow -# RUN pip install --upgrade tensorflow-gpu - -WORKDIR $SPARK_HOME -CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"] +CMD ["/bin/bash"] diff --git a/docker-image/gpu/spark2.1.0/Dockerfile b/docker-image/gpu/spark2.1.0/Dockerfile index a641fe89..3a27135e 100644 --- a/docker-image/gpu/spark2.1.0/Dockerfile +++ b/docker-image/gpu/spark2.1.0/Dockerfile @@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.1.0-base LABEL com.nvidia.volumes.needed="nvidia_driver" -RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 
/" > /etc/apt/sources.list.d/cuda.list + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list # CUDA ENV CUDA_VERSION 8.0.61 @@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH} +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" # cuDNN -RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list +RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list ENV CUDNN_VERSION 6.0.21 @@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/ ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/ -# # Tensorflow -# RUN pip install --upgrade tensorflow-gpu - -WORKDIR $SPARK_HOME -CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/docker-image/gpu/spark2.2.0/Dockerfile b/docker-image/gpu/spark2.2.0/Dockerfile index 2c8db231..bfa78ffb 100644 --- a/docker-image/gpu/spark2.2.0/Dockerfile +++ b/docker-image/gpu/spark2.2.0/Dockerfile @@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.2.0-base LABEL com.nvidia.volumes.needed="nvidia_driver" -RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys 
http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list # CUDA ENV CUDA_VERSION 8.0.61 @@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH} +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" # cuDNN -RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list +RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list ENV CUDNN_VERSION 6.0.21 @@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/ ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/ -# # Tensorflow -# RUN pip install --upgrade tensorflow-gpu - -WORKDIR $SPARK_HOME -CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"] +CMD ["/bin/bash"] diff --git a/docker-image/gpu/spark2.3.0/Dockerfile b/docker-image/gpu/spark2.3.0/Dockerfile index 3852404c..937a5d87 100644 --- a/docker-image/gpu/spark2.3.0/Dockerfile +++ b/docker-image/gpu/spark2.3.0/Dockerfile @@ -2,12 +2,14 @@ FROM aztk/spark:v0.1.0-spark2.3.0-base LABEL com.nvidia.volumes.needed="nvidia_driver" -RUN 
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list # CUDA ENV CUDA_VERSION 8.0.61 @@ -45,9 +47,13 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH} +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" # cuDNN -RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list +RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list ENV CUDNN_VERSION 6.0.21 @@ -72,8 +78,4 @@ ENV NUMBAPRO_LIBDEVICE /usr/local/cuda/nvvm/libdevice/ ENV NUMBAPRO_NVVM /usr/local/cuda-8.0/nvvm/lib64/libnvvm.so ENV NUMBAPRO_CUDALIB /usr/local/cuda-8.0/targets/x86_64-linux/lib/ -# # Tensorflow -# RUN pip install --upgrade tensorflow-gpu - -WORKDIR $SPARK_HOME 
-CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"] +CMD ["/bin/bash"]