Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Add BLAS3 and LAPACK routines (#6538)
Browse files Browse the repository at this point in the history
* Added linear algebra operators

* more comments about style of wrapper interface

* more appropriate fatal exit when lapack does not exist

* more comments on row/col-major ordering

* added config switch for lapack usage

* switched lapack usage off by default
  • Loading branch information
asmushetzel authored and piiswrong committed Jun 13, 2017
1 parent c43c901 commit e852036
Show file tree
Hide file tree
Showing 19 changed files with 1,602 additions and 13 deletions.
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,16 @@ if(USE_OPENMP)
endif()
endif()

# LAPACK support: defining MXNET_USE_LAPACK=1 compiles in the linalg operators.
if(USE_LAPACK)
add_definitions(-DMXNET_USE_LAPACK=1)
else(USE_LAPACK)
# Workaround for Windows until using new Jenkinsfile.
# OpenBLAS ("open") ships its own LAPACK implementation, so LAPACK support
# can be enabled even when USE_LAPACK was not requested explicitly.
if(USE_BLAS STREQUAL "open")
add_definitions(-DMXNET_USE_LAPACK=1)
endif()
endif()


if(UNIX)
find_library(RTLIB rt)
if(RTLIB)
Expand Down
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ USE_CPP_PACKAGE=1 \
init_git_win()
bat """mkdir build_vc14_cpu
cd build_vc14_cpu
cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
bat 'C:\\mxnet\\build_vc14_cpu.bat'

bat '''rmdir /s/q pkg_vc14_gpu
Expand Down Expand Up @@ -188,7 +188,7 @@ del /Q *.7z
bat """mkdir build_vc14_gpu
call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
cd build_vc14_gpu
cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
bat 'C:\\mxnet\\build_vc14_gpu.bat'
bat '''rmdir /s/q pkg_vc14_gpu
mkdir pkg_vc14_gpu\\lib
Expand Down
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,16 @@ else
endif
endif

# lapack settings.
ifeq ($(USE_LAPACK), 1)
# Only enable the compile-time flag for BLAS flavors known to provide
# (or coexist with) a LAPACK implementation.
ifeq ($(USE_BLAS),$(filter $(USE_BLAS),openblas apple atlas mkl))
CFLAGS += -DMXNET_USE_LAPACK
endif
# Link the standalone liblapack; "apple" is excluded here, presumably because
# the Accelerate framework already bundles LAPACK — TODO confirm.
ifeq ($(USE_BLAS),$(filter $(USE_BLAS),openblas atlas mkl))
LDFLAGS += -llapack
endif
endif

ifeq ($(USE_CUDNN), 1)
CFLAGS += -DMSHADOW_USE_CUDNN=1
LDFLAGS += -lcudnn
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ before_build:
set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
build_script:
- cmd: >-
Expand Down
2 changes: 1 addition & 1 deletion docs/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ MAINTAINER Mu Li <[email protected]>
# First, build MXNet binaries (ref mxnet/docker/cpu/Dockerfile)
#

RUN apt-get update && apt-get install -y build-essential git libopenblas-dev libopencv-dev
RUN apt-get update && apt-get install -y build-essential git libopenblas-dev liblapack-dev libopencv-dev
RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
cp make/config.mk . && \
echo "USE_BLAS=openblas" >>config.mk && \
Expand Down
15 changes: 15 additions & 0 deletions docs/api/python/symbol.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,21 @@ Composite multiple symbols into a new one by an operator.
argmin
```

### Linear Algebra

```eval_rst
.. autosummary::
:nosignatures:
linalg_gemm
linalg_gemm2
linalg_potrf
linalg_potri
linalg_trmm
linalg_trsm
linalg_sumlogdiag
```

### Miscellaneous

```eval_rst
Expand Down
16 changes: 8 additions & 8 deletions docs/get_started/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,9 @@ $ sudo apt-get install -y build-essential git

**Step 2** Install OpenBLAS.

*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) library for accelerated numerical computations on CPU machine. There are several flavors of BLAS libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machines. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
```bash
$ sudo apt-get install -y libopenblas-dev
$ sudo apt-get install -y libopenblas-dev liblapack-dev
```

**Step 3** Install OpenCV.
Expand Down Expand Up @@ -429,9 +429,9 @@ $ sudo apt-get install -y build-essential git
```
**Step 2** Install OpenBLAS.

*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) library for accelerated numerical computations. There are several flavors of BLAS libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machines. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
```bash
$ sudo apt-get install -y libopenblas-dev
$ sudo apt-get install -y libopenblas-dev liblapack-dev
```

**Step 3** Install OpenCV.
Expand Down Expand Up @@ -751,9 +751,9 @@ $ sudo apt-get install -y build-essential git

**Step 2** Install OpenBLAS.

*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) library for accelerated numerical computations on CPU machine. There are several flavors of BLAS libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machines. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
```bash
$ sudo apt-get install -y libopenblas-dev
$ sudo apt-get install -y libopenblas-dev liblapack-dev
```

**Step 3** Install OpenCV.
Expand Down Expand Up @@ -823,9 +823,9 @@ $ sudo apt-get install -y build-essential git
```
**Step 2** Install OpenBLAS.

*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) library for accelerated numerical computations. There are several flavors of BLAS libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machines. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
```bash
$ sudo apt-get install -y libopenblas-dev
$ sudo apt-get install -y libopenblas-dev liblapack-dev
```

**Step 3** Install OpenCV.
Expand Down
91 changes: 91 additions & 0 deletions include/mxnet/c_lapack_api.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*!
* Copyright (c) 2017 by Contributors
* \file c_lapack_api.h
* \brief Unified interface for LAPACK calls from within mxnet.
* Purpose is to hide the platform specific differences.
*/
#ifndef MXNET_C_LAPACK_API_H_
#define MXNET_C_LAPACK_API_H_

// Manually maintained list of LAPACK interfaces that can be used
// within MXNET. Conventions:
// - Interfaces must be compliant with lapacke.h in terms of signature and
// naming conventions so wrapping a function "foo" which has the
// signature
// lapack_int LAPACKE_foo(int, char, lapack_int, float* , lapack_int)
// within lapacke.h should result in a wrapper with the following signature
// int MXNET_LAPACK_foo(int, char, int, float* , int)
// Note that function signatures in lapacke.h will always have as first
// argument the storage order (row/col-major). All wrappers have to support
// that argument. The underlying fortran functions will always assume a
// column-major layout. It is the responsibility of the wrapper function
// to handle the (usual) case that it is called with data in row-major
// format, either by doing appropriate transpositions explicitly or using
// transposition options of the underlying fortran function.
// - It is ok to assume that matrices are stored in contiguous memory
// (which removes the need to do special handling for lda/ldb parameters
// and enables us to save additional matrix transpositions around
// the fortran calls).
// - It is desired to add some basic checking in the C++-wrappers in order
// to catch simple mistakes when calling these wrappers.
// - Must support compilation without lapack-package but issue runtime error in this case.

#include <dmlc/logging.h>

extern "C" {
// Fortran signatures
#define MXNET_LAPACK_FSIGNATURE1(func, dtype) \
void func##_(char* uplo, int* n, dtype* a, int* lda, int *info);

MXNET_LAPACK_FSIGNATURE1(spotrf, float)
MXNET_LAPACK_FSIGNATURE1(dpotrf, double)
MXNET_LAPACK_FSIGNATURE1(spotri, float)
MXNET_LAPACK_FSIGNATURE1(dpotri, double)
}

// Storage-order tags, numerically identical to LAPACK_ROW_MAJOR /
// LAPACK_COL_MAJOR from lapacke.h so the two interfaces stay compatible.
#define MXNET_LAPACK_ROW_MAJOR 101
#define MXNET_LAPACK_COL_MAJOR 102

// Basic sanity check that the matrix occupies contiguous memory, i.e. the
// leading dimension equals the matrix extent (see the header comment above:
// wrappers are allowed to assume contiguous storage).
#define CHECK_LAPACK_CONTIGUOUS(a, b) \
CHECK_EQ(a, b) << "non contiguous memory for array in lapack call";

// Basic sanity check that the triangle selector is one of the two legal values.
#define CHECK_LAPACK_UPLO(a) \
CHECK(a == 'U' || a == 'L') << "neither L nor U specified as triangle in lapack call";

// Selects the triangle argument for the Fortran call: when `invert` is set
// (the caller's data is row-major, i.e. the transpose of what column-major
// Fortran will see), swap 'U' <-> 'L'; otherwise pass `uplo` through.
inline char loup(char uplo, bool invert) {
  if (!invert) {
    return uplo;
  }
  return (uplo == 'U') ? 'L' : 'U';
}

#if MXNET_USE_LAPACK

// C wrapper with a LAPACKE-style signature around the Fortran potrf/potri
// routines. Performs basic argument checking, then maps the requested storage
// order onto the column-major Fortran call: a row-major matrix is the
// transpose of the same memory read column-major, so for these routines it
// suffices to flip the triangle selector (see loup above).
// Returns the LAPACK "info" value (0 on success).
#define MXNET_LAPACK_CWRAPPER1(func, dtype) \
inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda ) { \
CHECK_LAPACK_CONTIGUOUS(n, lda); \
CHECK_LAPACK_UPLO(uplo); \
char o(loup(uplo, (matrix_layout == MXNET_LAPACK_ROW_MAJOR))); \
int ret(0); \
func##_(&o, &n, a, &lda, &ret); \
return ret; \
}
MXNET_LAPACK_CWRAPPER1(spotrf, float)
MXNET_LAPACK_CWRAPPER1(dpotrf, double)
MXNET_LAPACK_CWRAPPER1(spotri, float)
MXNET_LAPACK_CWRAPPER1(dpotri, double)

#else
// use pragma message instead of warning
#pragma message("Warning: lapack usage not enabled, linalg-operators will not be available." \
" Build with USE_LAPACK=1 to get lapack functionalities.")

// Define compilable stubs so the code base links without a LAPACK
// installation; calling any of them aborts with a fatal runtime error.
#define MXNET_LAPACK_CWRAPPER1(func, dtype) \
inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda ) { \
LOG(FATAL) << "MXNet was built without LAPACK. Function " << #func << " is not available."; \
return 1; \
}
MXNET_LAPACK_CWRAPPER1(spotrf, float)
MXNET_LAPACK_CWRAPPER1(dpotrf, double)
MXNET_LAPACK_CWRAPPER1(spotri, float)
MXNET_LAPACK_CWRAPPER1(dpotri, double)

#endif

#endif // MXNET_C_LAPACK_API_H_
28 changes: 28 additions & 0 deletions include/mxnet/tensor_blob.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,34 @@ class TBlob {
return this->get_with_shape<Device, 3, DType>(
this->shape_.FlatTo3D(axis_begin, axis_end), stream);
}
/*!
 * \brief flatten the tensor to a specified number of dimensions
 *
 * If the tensor has more than \p dim dimensions, the leading dimensions
 * (index 0 upward) are collapsed into the first output dimension; if it
 * has fewer, the output is padded with leading dimensions of size 1.
 * The trailing dimensions are always preserved unchanged, so the total
 * element count is invariant.
 * \param stream the possible stream target tensor should reside on
 * \tparam Device which device the tensor is on
 * \tparam dim desired number of dimensions of returned tensor
 * \tparam DType the type of elements in the tensor
 * \return tensor after flatten
 */
template<typename Device, int dim, typename DType>
inline mshadow::Tensor<Device, dim, DType> FlatToKD(
mshadow::Stream<Device> *stream = NULL) const {
mshadow::Shape<dim> shape;
shape[0] = 1;
// Pad higher dimensions in case dim > ndim(): output dims
// [0, dim - ndim()) are set to 1 (loop is a no-op when dim <= ndim()).
for (int i = 0; i < dim - ndim(); ++i) {
shape[i] = 1;
}
// Collapse higher dimensions in case dim < ndim(): the first
// ndim() - dim + 1 input dims are multiplied into shape[0]
// (when dim == ndim() this just copies shape_[0]; no-op when dim > ndim()).
for (int i = 0; i < ndim() - dim + 1; ++i) {
shape[0] *= shape_[i];
}
// Preserve lower dimensions: copy the remaining input dims into the
// tail of the output shape, aligned at the right.
for (int i = std::max(0, ndim() - dim + 1); i < ndim(); ++i) {
shape[i - ndim() + dim] = shape_[i];
}
return this->get_with_shape<Device, dim, DType>(shape, stream);
}

private:
static DLDataType DTypeTransform(int type_flag) {
Expand Down
3 changes: 3 additions & 0 deletions make/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ USE_OPENCV = 1
# use openmp for parallelization
USE_OPENMP = 1

# whether to use LAPACK during compilation
# only effective when compiled with BLAS versions openblas/apple/atlas/mkl
USE_LAPACK = 0

# MKL ML Library for Intel CPU/Xeon Phi
# Please refer to MKL_README.md for details
Expand Down
4 changes: 4 additions & 0 deletions make/osx.mk
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ USE_OPENCV = 1
# use openmp for parallelization
USE_OPENMP = 0

# whether use lapack during compilation
# only effective when compiled with blas versions openblas/apple/atlas/mkl
USE_LAPACK = 0

# choose the version of blas you want to use
# can be: mkl, blas, atlas, openblas
USE_BLAS = apple
Expand Down
5 changes: 5 additions & 0 deletions make/pip_linux_cpu.mk
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ ADD_CFLAGS += -Ldeps/lib -Ideps/include
# matrix computation libraries for CPU/GPU
#---------------------------------------------

# whether to use LAPACK during compilation
# only effective when compiled with BLAS versions openblas/apple/atlas/mkl
# if you disable it, the linalg operators will not be available
USE_LAPACK = 0

# choose the version of blas you want to use
# can be: mkl, blas, atlas, openblas
# in default use atlas for linux while apple for osx
Expand Down
2 changes: 2 additions & 0 deletions make/readthedocs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ USE_OPENMP = 0
# can be: mkl, blas, atlas, openblas
USE_STATIC_MKL = NONE
USE_BLAS = NONE
USE_LAPACK = 0

#
# add path to intel library, you may need it
# for MKL, if you did not add the path to environment variable
Expand Down
17 changes: 17 additions & 0 deletions src/operator/elemwise_op_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,23 @@ struct ElemwiseGradUseOut {
}
};

// Builds the FGradient head list from the output gradients followed by all
// of the node's inputs and then all of its outputs, and forwards it to
// MakeGradNode together with the node's attribute dictionary.
struct ElemwiseGradUseInOut {
  const char *op_name;
  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
                                          const std::vector<nnvm::NodeEntry>& ograds) {
    // Order matters: gradients first, then inputs, then outputs.
    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
    heads.insert(heads.end(), n->inputs.begin(), n->inputs.end());
    const index_t num_outputs = n->num_outputs();
    for (index_t out_idx = 0; out_idx < num_outputs; ++out_idx) {
      heads.emplace_back(nnvm::NodeEntry{n, out_idx, 0});
    }
    return MakeGradNode(op_name, n, heads, n->attrs.dict);
  }
};

// Transfer only gradient to FGradient function
struct ElemwiseGradUseNone {
const char *op_name;
Expand Down
Loading

0 comments on commit e852036

Please sign in to comment.