From bcfa364ca773f0919033d0a49a5f5b6c336d5ab5 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 8 Apr 2020 21:59:50 +0000 Subject: [PATCH 01/14] Update edge toolchain --- CMakeLists.txt | 2 +- ci/build.py | 11 +- ci/docker/Dockerfile.build.android_armv7 | 94 +++----- ci/docker/Dockerfile.build.android_armv8 | 92 +++----- ci/docker/Dockerfile.build.armv6 | 45 ++-- ci/docker/Dockerfile.build.armv7 | 54 +++-- ci/docker/Dockerfile.build.armv8 | 56 +++-- ci/docker/Dockerfile.build.jetson | 96 ++++---- ci/docker/install/android_armv7_openblas.sh | 31 --- ci/docker/install/android_ndk.sh | 38 --- ci/docker/install/arm64_openblas.sh | 35 --- ci/docker/install/ubuntu_arm.sh | 28 --- ci/docker/runtime_functions.sh | 86 ++++--- .../aarch64-linux-gnu-toolchain.cmake} | 22 +- .../arm-linux-gnueabihf-toolchain.cmake} | 23 +- cmake/upstream/FindCUDAToolkit.cmake | 205 ++++++++++++----- make/crosscompile.jetson.mk | 216 ------------------ src/operator/random/shuffle_op.cc | 6 +- 18 files changed, 435 insertions(+), 705 deletions(-) delete mode 100755 ci/docker/install/android_armv7_openblas.sh delete mode 100755 ci/docker/install/android_ndk.sh delete mode 100755 ci/docker/install/arm64_openblas.sh delete mode 100755 ci/docker/install/ubuntu_arm.sh rename ci/docker/{install/arm_openblas.sh => toolchains/aarch64-linux-gnu-toolchain.cmake} (64%) mode change 100755 => 100644 rename ci/docker/{install/android_arm64_openblas.sh => toolchains/arm-linux-gnueabihf-toolchain.cmake} (65%) mode change 100755 => 100644 delete mode 100644 make/crosscompile.jetson.mk diff --git a/CMakeLists.txt b/CMakeLists.txt index 437d01668246..e630730115a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,7 +202,7 @@ endif(MSVC) if(NOT mxnet_LINKER_LIBS) set(mxnet_LINKER_LIBS "") -endif(NOT mxnet_LINKER_LIBS) +endif() if(USE_GPROF) message(STATUS "Using GPROF") diff --git a/ci/build.py b/ci/build.py index a21ec44942a8..cbc41218f042 100755 --- a/ci/build.py +++ b/ci/build.py @@ -70,7 +70,8 @@ 
def get_docker_binary(use_nvidia_docker: bool) -> str: return "nvidia-docker" if use_nvidia_docker else "docker" -def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool) -> str: +def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool, + cache_intermediate: bool) -> str: """ Build a container for the given platform :param platform: Platform @@ -104,6 +105,8 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: "--build-arg", "GROUP_ID={}".format(os.getgid())] if no_cache: cmd.append("--no-cache") + if cache_intermediate: + cmd.append("--rm=false") elif registry: cmd.extend(["--cache-from", tag]) cmd.extend(["-t", tag, get_dockerfiles_path()]) @@ -330,6 +333,9 @@ def main() -> int: parser.add_argument("--no-cache", action="store_true", help="passes --no-cache to docker build") + parser.add_argument("--cache-intermediate", action="store_true", + help="passes --rm=false to docker build") + parser.add_argument("-e", "--environment", nargs="*", default=[], help="Environment variables for the docker container. 
" "Specify with a list containing either names or name=value") @@ -361,7 +367,8 @@ def main() -> int: load_docker_cache(tag=tag, docker_registry=args.docker_registry) if not args.run_only: build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) else: logging.info("Skipping docker build step.") diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 index 2c923a015b63..96ca04e9f5e6 100644 --- a/ci/docker/Dockerfile.build.android_armv7 +++ b/ci/docker/Dockerfile.build.android_armv7 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARMv7 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -# The cross-compiling emulator -RUN apt-get update && apt-get install -y \ - unzip - -ENV CROSS_TRIPLE=arm-linux-androideabi -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -WORKDIR /work/deps - -COPY 
install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh -WORKDIR /work -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -COPY install/android_armv7_openblas.sh /work/deps -RUN /work/deps/android_armv7_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -WORKDIR /work +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 ARM_SOFTFP_ABI=1 \ + LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/armv7a-linux-androideabi16-clang AR=$TOOLCHAIN/bin/arm-linux-androideabi-ar && \ + make PREFIX=/usr/local/openblas-android install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet - +WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 index ca62288129bb..81adc80edf14 100644 --- 
a/ci/docker/Dockerfile.build.android_armv8 +++ b/ci/docker/Dockerfile.build.android_armv8 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARM64/ARMv8 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -RUN apt-get update && apt-get install -y \ - unzip - -WORKDIR /work/deps - -# Build x86 dependencies. -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -# Setup Android cross-compilation environment. -ENV CROSS_TRIPLE=aarch64-linux-android -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - -ENV ARCH aarch64 -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm64 -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - - -WORKDIR /work/deps -COPY install/android_ndk.sh /work/ -RUN /work/android_ndk.sh - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -# Build ARM dependencies. 
-COPY install/android_arm64_openblas.sh /work/ -RUN /work/android_arm64_openblas.sh -ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 \ + LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang AR=$TOOLCHAIN/bin/aarch64-linux-android-ar && \ + make PREFIX=/usr/local/openblas-android install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 index e6a7ffe758b9..02e16da11616 100644 --- a/ci/docker/Dockerfile.build.armv6 +++ b/ci/docker/Dockerfile.build.armv6 @@ -18,25 +18,42 @@ # # Dockerfile to build MXNet for ARMv6 -FROM dockcross/linux-armv6 +FROM ubuntu:20.04 -ENV ARCH armv6l -ENV HOSTCC gcc -ENV TARGET ARMV6 +ENV ARCH=armv6l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV6 -WORKDIR /work/deps 
+WORKDIR /usr/local -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh +# We use a toolchain from toolchains.bootlin.com instead of Debian / Ubuntu +# crossbuild-essential-armel toolchain, as the latter targets ARM architecture +# versions 4T, 5T, and 6, whereas we only wish to target ARMV6 and like to use +# ARMV6 specific features. https://wiki.debian.org/ArmEabiPort +RUN curl -o armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 -L https://toolchains.bootlin.com/downloads/releases/toolchains/armv6-eabihf/tarballs/armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + tar xf armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + rm armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/share/buildroot/toolchainfile.cmake -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \ + make PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 index bad9ab214050..a9cc6d1e83a4 100644 --- a/ci/docker/Dockerfile.build.armv7 +++ b/ci/docker/Dockerfile.build.armv7 @@ -16,27 +16,39 @@ # specific language governing permissions and limitations # under the License. 
# -# Dockerfile to build MXNet for Android ARMv7 - -FROM dockcross/linux-armv7 - -ENV ARCH armv7l -ENV HOSTCC gcc -ENV TARGET ARMV7 - -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +# Dockerfile to build MXNet for ARMv7 + +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-armhf \ + && rm -rf /var/lib/apt/lists/* + +COPY toolchains/arm-linux-gnueabihf-toolchain.cmake /usr/local +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/arm-linux-gnueabihf-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=arm-linux-gnueabihf-gcc && \ + make PREFIX=/usr/local/arm-linux-gnueabihf install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 index bd2373180f0b..adf6873fb40c 100644 --- a/ci/docker/Dockerfile.build.armv8 +++ b/ci/docker/Dockerfile.build.armv8 @@ -18,29 +18,37 @@ # # Dockerfile to build MXNet for ARM64/ARMv8 -FROM dockcross/linux-arm64 - -ENV ARCH aarch64 -ENV HOSTCC gcc -ENV TARGET ARMV8 - -WORKDIR /work/deps - -# gh issue #11567 https://github.com/apache/incubator-mxnet/issues/11567 -#RUN sed -i '\#deb http://cdn-fastly.deb.debian.org/debian-security jessie/updates main#d' /etc/apt/sources.list -#RUN sed -i 's/cdn-fastly.//' /etc/apt/sources.list - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh 
- -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-arm64 \ + && rm -rf /var/lib/apt/lists/* + +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 @@ -48,4 +56,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/build +WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index e31ee43a93d8..93fe5e0a5b0d 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -20,68 +20,58 @@ # This script assumes /work/mxnet exists and contains the mxnet code you wish to compile and # that /work/build exists and is the target for your output. 
-FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 -FROM dockcross/linux-arm64 +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + TARGET=ARMV8 -ENV ARCH aarch64 -ENV HOSTCC gcc -ENV TARGET ARMV8 +WORKDIR /usr/local -# gh issue #11567 https://github.com/apache/incubator-mxnet/issues/11567 -#RUN sed -i '\#deb http://cdn-fastly.deb.debian.org/debian-security jessie/updates main#d' /etc/apt/sources.list -#RUN sed -i 's/cdn-fastly.//' /etc/apt/sources.list +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + git \ + curl \ + zip \ + unzip \ + python3 \ + python3-pip \ + awscli \ + crossbuild-essential-arm64 \ + && rm -rf /var/lib/apt/lists/* +# cmake on Ubuntu 18.04 is too old +RUN python3 -m pip install cmake -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - +# ccache on Ubuntu 18.04 is too old to support Cuda correctly COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -# Setup CUDA build env (including configuring and copying nvcc) -COPY --from=cudabuilder /usr/local/cuda /usr/local/cuda -ENV TARGET_ARCH aarch64 -ENV TARGET_OS linux +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu install && \ + cd /usr/local && \ + rm -rf OpenBLAS -# Install ARM depedencies based on Jetpack 3.3 -RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcenter/mobile/jetpack_l4t/3.3/lw.xd42/JetPackL4T_33_b39 && \ - CUDA_REPO_PREFIX=/var/cuda-repo-9-0-local && \ - 
ARM_CUDA_INSTALLER_PACKAGE=cuda-repo-l4t-9-0-local_9.0.252-1_arm64.deb && \ - ARM_CUDNN_INSTALLER_PACKAGE=libcudnn7_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_CUDNN_DEV_INSTALLER_PACKAGE=libcudnn7-dev_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_LICENSE_INSTALLER=cuda-license-9-0_9.0.252-1_arm64.deb && \ - ARM_CUBLAS_INSTALLER=cuda-cublas-9-0_9.0.252-1_arm64.deb && \ - ARM_NVINFER_INSTALLER_PACKAGE=libnvinfer4_4.1.3-1+cuda9.0_arm64.deb && \ - ARM_NVINFER_DEV_INSTALLER_PACKAGE=libnvinfer-dev_4.1.3-1+cuda9.0_arm64.deb && \ - dpkg --add-architecture arm64 && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDA_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDA_INSTALLER_PACKAGE && \ - apt-key add $CUDA_REPO_PREFIX/7fa2af80.pub && \ - dpkg -i --force-architecture $ARM_CUDNN_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_LICENSE_INSTALLER && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_CUBLAS_INSTALLER && \ - dpkg -i --force-architecture $ARM_NVINFER_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev -RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h -ENV PATH $PATH:/usr/local/cuda/bin -ENV NVCCFLAGS "-m64" -ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" -ENV NVCC /usr/local/cuda/bin/nvcc +# Install aarch64 cross dependencies based on Jetpack 4.3 +# Manually downloaded using SDK Manager tool and placed in a private S3 bucket. 
+# We're not allowed to redistribute these files and there is no public version. +RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb . && \ + dpkg -i cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + rm cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + apt-key add /var/cuda-repo-10-0-local-10.0.326-410.108/7fa2af80.pub && \ + aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb . && \ + dpkg -i cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + rm cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + apt-get update && \ + apt-get install -y -f && \ + apt-get install -y cuda-cross-aarch64 cuda-cross-aarch64-10-0 && \ + rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/install/android_armv7_openblas.sh b/ci/docker/install/android_armv7_openblas.sh deleted file mode 100755 index 55c098909654..000000000000 --- a/ci/docker/install/android_armv7_openblas.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make TARGET=ARMV7 HOSTCC=gcc NOFORTRAN=1 ARM_SOFTFP_ABI=1 -j$(nproc) libs -#make PREFIX=${CROSS_ROOT} TARGET=ARMV7 HOSTCC=gcc NOFORTRAN=1 ARM_SOFTFP_ABI=1 install -cp *.h ${CROSS_ROOT}/include -cp libopenblas*.a ${CROSS_ROOT}/lib -popd diff --git a/ci/docker/install/android_ndk.sh b/ci/docker/install/android_ndk.sh deleted file mode 100755 index cb83aa65639a..000000000000 --- a/ci/docker/install/android_ndk.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . 
-# This environment variable comes from the docker file -echo "Downloading android SDK rev ${ANDROID_NDK_REVISION}" -curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -cd android-ndk-r${ANDROID_NDK_REVISION} && \ -./build/tools/make_standalone_toolchain.py \ - --stl=libc++ \ - --arch ${ANDROID_NDK_ARCH}\ - --api ${ANDROID_NDK_API}\ - --install-dir=${CROSS_ROOT} && \ - -find ${CROSS_ROOT} -exec chmod a+r '{}' \; && \ -find ${CROSS_ROOT} -executable -exec chmod a+x '{}' \; -popd diff --git a/ci/docker/install/arm64_openblas.sh b/ci/docker/install/arm64_openblas.sh deleted file mode 100755 index 88f2e98cd65b..000000000000 --- a/ci/docker/install/arm64_openblas.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . 
-wget -nv https://api.github.com/repos/xianyi/OpenBLAS/git/refs/heads/master -O openblas_version.json -echo "Using openblas:" -cat openblas_version.json -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 -make install -ln -s /opt/OpenBLAS/lib/libopenblas.so /usr/lib/libopenblas.so -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/libopenblas.a -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/liblapack.a -popd diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/ubuntu_arm.sh deleted file mode 100755 index 608d0362f138..000000000000 --- a/ci/docker/install/ubuntu_arm.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -apt update || true -apt install -y \ - unzip \ - python3 \ - python3-pip - -pip3 install setuptools diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 0c7630f24015..ae55c12fb5ac 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -248,15 +248,22 @@ build_dynamic_libmxnet() { build_jetson() { set -ex - pushd . 
- - #build_ccache_wrappers - - cp make/crosscompile.jetson.mk ./config.mk - make -j$(nproc) - - build_wheel /work/mxnet/python /work/mxnet/lib - popd + cd /work/build + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="5.2" \ + -DENABLE_CUDA_RTC=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=ON \ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -G Ninja /work/mxnet + ninja + build_wheel } # @@ -286,7 +293,7 @@ build_armv6() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ + -Dmxnet_LINKER_LIBS=-latomic \ -G Ninja /work/mxnet ninja @@ -316,7 +323,6 @@ build_armv7() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ -G Ninja /work/mxnet ninja @@ -327,14 +333,15 @@ build_armv7() { build_armv8() { build_ccache_wrappers cmake \ - -DUSE_CUDA=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_OPENCV=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ -DUSE_OPENMP=ON \ - -DUSE_LAPACK=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=Release\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja build_wheel @@ -350,16 +357,18 @@ build_android_armv7() { cd /work/build build_ccache_wrappers cmake \ - -DANDROID=ON\ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=RelWithDebInfo\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_STL="c++_shared" \ + -DANDROID=ON \ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + 
-DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } @@ -367,17 +376,18 @@ build_android_armv7() { build_android_armv8() { set -ex cd /work/build - build_ccache_wrappers - cmake\ + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="arm64-v8a" \ + -DANDROID_STL="c++_shared" \ -DANDROID=ON \ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=RelWithDebInfo\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake old mode 100755 new mode 100644 similarity index 64% rename from ci/docker/install/arm_openblas.sh rename to ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake index fa2e5cae9cba..3780415c4b15 --- a/ci/docker/install/arm_openblas.sh +++ b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,14 +15,14 @@ # specific language governing permissions and limitations # under the License. -set -ex - -git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git - -cd OpenBLAS -make -j$(nproc) -PREFIX=${CROSS_ROOT} make install - -cd .. 
+set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "aarch64") +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) +set(CMAKE_CUDA_HOST_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu") -rm -rf OpenBLAS +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/ci/docker/install/android_arm64_openblas.sh b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake old mode 100755 new mode 100644 similarity index 65% rename from ci/docker/install/android_arm64_openblas.sh rename to ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake index 1c3014f6cca9..62038ecee16a --- a/ci/docker/install/android_arm64_openblas.sh +++ b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,16 +15,13 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "armv7l") +set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) +set(CMAKE_FIND_ROOT_PATH "/usr/arm-linux-gnueabihf" "/usr/local/arm-linux-gnueabihf") -set -ex -pushd . 
-git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 ARM_SOFTFP_ABI=1 HOSTCC=gcc NOFORTRAN=1 libs -# Can't be run (utility not compiled for the target platform) -#make install -cp *.h /usr/include -cp libopenblas.a /usr/local/lib -popd +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/cmake/upstream/FindCUDAToolkit.cmake b/cmake/upstream/FindCUDAToolkit.cmake index d37c44d9c782..fee4f3f4f698 100644 --- a/cmake/upstream/FindCUDAToolkit.cmake +++ b/cmake/upstream/FindCUDAToolkit.cmake @@ -132,6 +132,7 @@ of the following libraries that are part of the CUDAToolkit: - :ref:`cuRAND` - :ref:`cuSOLVER` - :ref:`cuSPARSE` +- :ref:`cuPTI` - :ref:`NPP` - :ref:`nvBLAS` - :ref:`nvGRAPH` @@ -149,7 +150,6 @@ CUDA Runtime Library The CUDA Runtime library (cudart) are what most applications will typically need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. -They are an explicit dependency of almost every library. Targets Created: @@ -230,6 +230,18 @@ Targets Created: - ``CUDA::cusparse`` - ``CUDA::cusparse_static`` +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + .. _`cuda_toolkit_NPP`: NPP @@ -361,8 +373,6 @@ Targets Created: - ``CUDA::nvml`` -.. _`cuda_toolkit_opencl`: - .. _`cuda_toolkit_nvToolsExt`: nvToolsExt @@ -375,6 +385,8 @@ Targets Created: - ``CUDA::nvToolsExt`` +.. _`cuda_toolkit_opencl`: + OpenCL """""" @@ -436,6 +448,11 @@ Result variables The path to the CUDA Toolkit library directory that contains the CUDA Runtime library ``cudart``. +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalant to + ``CUDAToolkit_ROOT_DIR``. 
+ ``CUDAToolkit_NVCC_EXECUTABLE`` The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may **not** be the same as @@ -487,6 +504,7 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) # use the already detected cuda compiler set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -641,6 +659,7 @@ endif() if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -669,8 +688,47 @@ endif() get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) -# Now that we have the real ROOT_DIR, find components inside it. -list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) +# Handle cross compilation +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH 
"${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +else() + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. + set(_CUDAToolkit_Pop_Prefix True) +endif() + # Find the include/ directory find_path(CUDAToolkit_INCLUDE_DIR @@ -680,14 +738,17 @@ find_path(CUDAToolkit_INCLUDE_DIR # And find the CUDA Runtime Library libcudart find_library(CUDA_CUDART NAMES cudart - PATH_SUFFIXES lib64 lib/x64 + PATH_SUFFIXES lib64 lib64/stubs lib/x64 ) if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() unset(CUDAToolkit_ROOT_DIR) -list(REMOVE_AT CMAKE_PREFIX_PATH -1) +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. 
@@ -702,6 +763,10 @@ find_package_handle_standard_args(CUDAToolkit VERSION_VAR CUDAToolkit_VERSION ) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_INCLUDE_DIR + CUDAToolkit_NVCC_EXECUTABLE + ) #----------------------------------------------------------------------------- # Construct result variables @@ -714,78 +779,103 @@ endif() # Construct import targets if(CUDAToolkit_FOUND) - function(find_and_add_cuda_import_lib lib_name) + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN}) - if(ARGC GREATER 1) - set(search_names ${ARGN}) - else() - set(search_names ${lib_name}) - endif() + set(search_names ${lib_name} ${arg_ALT}) find_library(CUDA_${lib_name}_LIBRARY NAMES ${search_names} - PATHS ${CUDAToolkit_LIBRARY_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + PATH_SUFFIXES nvidia/current lib64 lib64/stubs lib/x64 lib lib/stubs stubs + ${arg_EXTRA_PATH_SUFFIXES} ) + mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() endif() endfunction() - function(add_cuda_link_dependency lib_name) - foreach(dependency IN LISTS ${ARGN}) - target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency}) - endforeach() - endfunction() + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE 
"${CUDAToolkit_LIBRARY_DIR}") + endif() - add_library(CUDA::toolkit IMPORTED INTERFACE) - target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) - find_and_add_cuda_import_lib(cuda_driver cuda) + # setup dependencies that are required for cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) - find_and_add_cuda_import_lib(cudart) - find_and_add_cuda_import_lib(cudart_static) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) - foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg) - find_and_add_cuda_import_lib(${cuda_lib}) - add_cuda_link_dependency(${cuda_lib} cudart) + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib}_static cudart_static) + if(UNIX AND NOT APPLE) + # On Linux, you must link against librt when using the static cuda runtime. 
+ find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + # cuSOLVER depends on cuBLAS, and cuSPARSE - add_cuda_link_dependency(cusolver cublas cusparse) - add_cuda_link_dependency(cusolver_static cublas_static cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. - add_cuda_link_dependency(nvgraph curand cusolver) - add_cuda_link_dependency(nvgraph_static curand_static cusolver_static) - - find_and_add_cuda_import_lib(nppc) - find_and_add_cuda_import_lib(nppc_static) - - add_cuda_link_dependency(nppc cudart) - add_cuda_link_dependency(nppc_static cudart_static culibos) + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. 
foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - find_and_add_cuda_import_lib(${cuda_lib}) - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib} nppc) - add_cuda_link_dependency(${cuda_lib}_static nppc_static) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - find_and_add_cuda_import_lib(nvrtc) - add_cuda_link_dependency(nvrtc cuda_driver) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - find_and_add_cuda_import_lib(nvml nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory @@ -798,17 +888,12 @@ if(CUDAToolkit_FOUND) PATH_SUFFIXES lib/x64 lib ) endif() - find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64) + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - add_cuda_link_dependency(nvToolsExt cudart) - - find_and_add_cuda_import_lib(OpenCL) - - find_and_add_cuda_import_lib(culibos) - if(TARGET CUDA::culibos) - foreach (cuda_lib cublas cufft cusparse curand nvjpeg) - add_cuda_link_dependency(${cuda_lib}_static culibos) - endforeach() - endif() + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) endif() diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk deleted file mode 100644 index 880e2cf5b466..000000000000 --- a/make/crosscompile.jetson.mk +++ /dev/null @@ -1,216 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet -# -# If you want to change the configuration, please use the following -# steps. Assume you are on the root directory of mxnet. First copy the this -# file so that any local changes will be ignored by git -# -# $ cp make/config.mk . -# -# Next modify the according entries, and then compile by -# -# $ make -# -# or build in parallel with 8 threads -# -# $ make -j8 -#------------------------------------------------------------------------------- - -#--------------------- -# For cross compilation we only explictily set a compiler when one is not already present. 
-#-------------------- - -ifndef CC -export CC = gcc -endif -ifndef CXX -export CXX = g++ -endif -ifndef NVCC -export NVCC = nvcc -endif - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on segfault signal handler to log the stack trace -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/ - -# the additional compile flags you want to add -ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/ - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -USE_CUDA_PATH = /usr/local/cuda-9.0/targets/aarch64-linux - -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 0 - -# whether use CuDNN R3 library -USE_CUDNN = 1 - -#whether to use NCCL library -USE_NCCL = 0 -#add the path to NCCL library -USE_NCCL_PATH = NONE - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 0 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -#whether use libjpeg-turbo for image decode without OpenCV wrapper -USE_LIBJPEG_TURBO = 0 -#add the path to libjpeg-turbo library -USE_LIBJPEG_TURBO_PATH = NONE - -# use openmp for parallelization -USE_OPENMP = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 0 - -# whether use NNPACK library -USE_NNPACK = 0 - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -UNAME_S 
:= $(shell uname -s) -USE_BLAS = openblas - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL only for BLAS, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -USE_SSE=0 - -# Turn off F16C instruction set support -USE_F16C=0 - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 0 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. 
If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 0 - -#---------------------------- -# performance settings -#---------------------------- -# Use operator tuning -USE_OPERATOR_TUNING = 1 - -# Use gperftools if found -# Disable because of #8968 -USE_GPERFTOOLS = 0 - -# path to gperftools (tcmalloc) library in case of a non-standard installation -USE_GPERFTOOLS_PATH = - -# Use JEMalloc if found, and not using gperftools -USE_JEMALLOC = 1 - -# path to jemalloc library in case of a non-standard installation -USE_JEMALLOC_PATH = - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - -#---------------------------- -# other features -#---------------------------- - -# Create C++ interface package -USE_CPP_PACKAGE = 0 - -# Use int64_t type to represent the total number of elements in the tensor -# This will cause performance degradation reported in issue #14496 -# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 -# Note: the size of each dimension is still bounded by INT32_MAX -USE_INT64_TENSOR_SIZE = 0 - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. 
This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 0f64fbc51449..fed3215f965d 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -22,9 +22,9 @@ * \file shuffle_op.cc * \brief Operator to shuffle elements of an NDArray */ -#if !defined (__ANDROID__) && ((__GNUC__ > 4 &&\ - !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)) - #define USE_GNU_PARALLEL_SHUFFLE +#if ((__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)) && \ + defined(_OPENMP) && !defined(__ANDROID__) +#define USE_GNU_PARALLEL_SHUFFLE #endif #include From bd7f8cf191620bdda551aba9e80118c57b860d38 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 10 Apr 2020 19:01:06 +0000 Subject: [PATCH 02/14] Support platforms without rand_r --- .../multi_threaded_inference.cc | 5 +- src/operator/contrib/dgl_graph.cc | 73 +++++++++++++------ src/operator/nn/mkldnn/mkldnn_rnn.cc | 4 +- src/operator/rnn-inl.h | 20 +++-- src/operator/rnn.cc | 1 + src/operator/rnn_impl.h | 43 +++++------ tests/cpp/engine/threaded_engine_test.cc | 14 ++-- tests/cpp/thread_safety/thread_safety_test.cc | 16 ++-- 8 files changed, 113 insertions(+), 63 deletions(-) diff --git a/example/multi_threaded_inference/multi_threaded_inference.cc b/example/multi_threaded_inference/multi_threaded_inference.cc index e90d55307e53..8b1864feea93 100644 --- a/example/multi_threaded_inference/multi_threaded_inference.cc +++ b/example/multi_threaded_inference/multi_threaded_inference.cc @@ -34,6 +34,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" +#include const float DEFAULT_MEAN = 117.0; @@ -248,7 +249,9 @@ void run_inference(const std::string& model_name, const std::vector distribution(0, 5); + int sleep_time = distribution(generator); 
std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc index 428899791a5d..89bee8abf655 100644 --- a/src/operator/contrib/dgl_graph.cc +++ b/src/operator/contrib/dgl_graph.cc @@ -24,6 +24,9 @@ #include #include #include +#include +#include + #include "../elemwise_op_common.h" #include "../../imperative/imperative_utils.h" #include "../subgraph_op_common.h" @@ -41,7 +44,9 @@ typedef int64_t dgl_id_t; */ class ArrayHeap { public: - explicit ArrayHeap(const std::vector& prob) { + explicit ArrayHeap(const std::vector& prob, unsigned int seed) { + generator_ = std::mt19937(seed); + distribution_ = std::uniform_real_distribution(0.0, 1.0); vec_size_ = prob.size(); bit_len_ = ceil(log2(vec_size_)); limit_ = 1 << bit_len_; @@ -86,8 +91,8 @@ class ArrayHeap { /* * Sample from arrayHeap */ - size_t Sample(unsigned int* seed) { - float xi = heap_[1] * (rand_r(seed)%100/101.0); + size_t Sample() { + float xi = heap_[1] * distribution_(generator_); int i = 1; while (i < limit_) { i = i << 1; @@ -102,10 +107,10 @@ class ArrayHeap { /* * Sample a vector by given the size n */ - void SampleWithoutReplacement(size_t n, std::vector* samples, unsigned int* seed) { + void SampleWithoutReplacement(size_t n, std::vector* samples) { // sample n elements for (size_t i = 0; i < n; ++i) { - samples->at(i) = this->Sample(seed); + samples->at(i) = this->Sample(); this->Delete(samples->at(i)); } } @@ -115,6 +120,8 @@ class ArrayHeap { int bit_len_; // bit size int limit_; std::vector heap_; + std::mt19937 generator_; + std::uniform_real_distribution distribution_; }; struct NeighborSampleParam : public dmlc::Parameter { @@ -402,10 +409,12 @@ static bool CSRNeighborNonUniformSampleType(const nnvm::NodeAttrs& attrs, static void RandomSample(size_t set_size, size_t num, std::vector* out, - unsigned int* seed) { + unsigned int seed) { + std::mt19937 generator(seed); std::unordered_set 
sampled_idxs; + std::uniform_int_distribution distribution(0, set_size - 1); while (sampled_idxs.size() < num) { - sampled_idxs.insert(rand_r(seed) % set_size); + sampled_idxs.insert(distribution(generator)); } out->clear(); for (auto it = sampled_idxs.begin(); it != sampled_idxs.end(); it++) { @@ -441,7 +450,7 @@ static void GetUniformSample(const dgl_id_t* val_list, const size_t max_num_neighbor, std::vector* out_ver, std::vector* out_edge, - unsigned int* seed) { + unsigned int seed) { // Copy ver_list to output if (ver_len <= max_num_neighbor) { for (size_t i = 0; i < ver_len; ++i) { @@ -485,7 +494,7 @@ static void GetNonUniformSample(const float* probability, const size_t max_num_neighbor, std::vector* out_ver, std::vector* out_edge, - unsigned int* seed) { + unsigned int seed) { // Copy ver_list to output if (ver_len <= max_num_neighbor) { for (size_t i = 0; i < ver_len; ++i) { @@ -500,8 +509,8 @@ static void GetNonUniformSample(const float* probability, for (size_t i = 0; i < ver_len; ++i) { sp_prob[i] = probability[col_list[i]]; } - ArrayHeap arrayHeap(sp_prob); - arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index, seed); + ArrayHeap arrayHeap(sp_prob, seed); + arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index); out_ver->resize(max_num_neighbor); out_edge->resize(max_num_neighbor); for (size_t i = 0; i < max_num_neighbor; ++i) { @@ -536,8 +545,8 @@ static void SampleSubgraph(const NDArray &csr, const float* probability, int num_hops, size_t num_neighbor, - size_t max_num_vertices) { - unsigned int time_seed = time(nullptr); + size_t max_num_vertices, + unsigned int random_seed) { size_t num_seeds = seed_arr.shape().Size(); CHECK_GE(max_num_vertices, num_seeds); @@ -594,7 +603,7 @@ static void SampleSubgraph(const NDArray &csr, num_neighbor, &tmp_sampled_src_list, &tmp_sampled_edge_list, - &time_seed); + random_seed); } else { // non-uniform-sample GetNonUniformSample(probability, val_list + *(indptr + dst_id), @@ -603,7 +612,7 @@ 
static void SampleSubgraph(const NDArray &csr, num_neighbor, &tmp_sampled_src_list, &tmp_sampled_edge_list, - &time_seed); + random_seed); } CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size()); size_t pos = neighbor_list.size(); @@ -720,12 +729,15 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const NeighborSampleParam& params = - nnvm::get(attrs.parsed); + const NeighborSampleParam& params = nnvm::get(attrs.parsed); int num_subgraphs = inputs.size() - 1; CHECK_EQ(outputs.size(), 3 * num_subgraphs); + mshadow::Stream *s = ctx.get_stream(); + mshadow::Random *prnd = ctx.requested[0].get_random(s); + unsigned int seed = prnd->GetRandInt(); + #pragma omp parallel for for (int i = 0; i < num_subgraphs; i++) { SampleSubgraph(inputs[0], // graph_csr @@ -737,7 +749,12 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs, nullptr, // probability params.num_hops, params.num_neighbor, - params.max_num_vertices); + params.max_num_vertices, +#if defined(_OPENMP) + seed + omp_get_thread_num()); +#else + seed); +#endif } } @@ -798,6 +815,9 @@ of max_num_vertices, and the valid number of vertices is the same as the ones in .set_attr("FInferShape", CSRNeighborUniformSampleShape) .set_attr("FInferType", CSRNeighborUniformSampleType) .set_attr("FComputeEx", CSRNeighborUniformSampleComputeExCPU) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kRandom}; +}) .add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix") .add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices") .set_attr("key_var_num_args", "num_args") @@ -811,14 +831,17 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const NeighborSampleParam& params = - nnvm::get(attrs.parsed); + 
const NeighborSampleParam& params = nnvm::get(attrs.parsed); int num_subgraphs = inputs.size() - 2; CHECK_EQ(outputs.size(), 4 * num_subgraphs); const float* probability = inputs[1].data().dptr(); + mshadow::Stream *s = ctx.get_stream(); + mshadow::Random *prnd = ctx.requested[0].get_random(s); + unsigned int seed = prnd->GetRandInt(); + #pragma omp parallel for for (int i = 0; i < num_subgraphs; i++) { float* sub_prob = outputs[i+2*num_subgraphs].data().dptr(); @@ -831,7 +854,12 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs probability, params.num_hops, params.num_neighbor, - params.max_num_vertices); + params.max_num_vertices, +#if defined(_OPENMP) + seed + omp_get_thread_num()); +#else + seed); +#endif } } @@ -897,6 +925,9 @@ of max_num_vertices, and the valid number of vertices is the same as the ones in .set_attr("FInferShape", CSRNeighborNonUniformSampleShape) .set_attr("FInferType", CSRNeighborNonUniformSampleType) .set_attr("FComputeEx", CSRNeighborNonUniformSampleComputeExCPU) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kRandom}; +}) .add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix") .add_argument("probability", "NDArray-or-Symbol", "probability vector") .add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices") diff --git a/src/operator/nn/mkldnn/mkldnn_rnn.cc b/src/operator/nn/mkldnn/mkldnn_rnn.cc index 5d3857e1c578..c8f1d45814f5 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn.cc +++ b/src/operator/nn/mkldnn/mkldnn_rnn.cc @@ -953,7 +953,7 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - TmpMemMgr::Get()->Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[1]); // In the `autograd.record()` context, RNNOp is required to run into // forward_training mode. 
const bool is_training = (ctx.is_train || ctx.need_grad); @@ -1076,7 +1076,7 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, const std::vector& req, const std::vector& outputs) { using tag = mkldnn::memory::format_tag; - TmpMemMgr::Get()->Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[1]); const RNNParam& default_param = full_param_.default_param; const int data_dtype = inputs[rnn_enum::kData].dtype(); const int w_dtype = inputs[rnn_enum::kParams].dtype(); diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 557c1117739a..1bd351fe0a9c 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -292,23 +293,24 @@ void RNNForwardTraining(DType* ws, DType* hy_ptr, DType* cy_ptr, const float dropout, - int mode) { + int mode, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) switch (mode) { case rnn_enum::kLstm: LstmForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr, - w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, dropout); + w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, dropout, rnd_engine); break; case rnn_enum::kGru: GruForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, - w_ptr, y_ptr, hy_ptr, dropout); + w_ptr, y_ptr, hy_ptr, dropout, rnd_engine); break; case rnn_enum::kRnnTanh: case rnn_enum::kRnnRelu: VanillaRNNForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, - w_ptr, y_ptr, hy_ptr, dropout, mode); + w_ptr, y_ptr, hy_ptr, dropout, mode, rnd_engine); break; default: LOG(FATAL) << "unknown RNN mode " << mode; @@ -841,7 +843,8 @@ class RNNOp { } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) - if (ctx_.dev_type == kCPU) { +#if !defined(__CUDACC__) // cuda doesn't support C++17 + if constexpr (std::is_same::value) { int 
projection_size = 0; if (param_.projection_size.has_value()) { projection_size = param_.projection_size.value(); @@ -859,6 +862,9 @@ class RNNOp { DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); if (ctx.is_train || ctx.need_grad) { + mshadow::Random *prnd = ctx.requested[0].get_random(s); + std::mt19937 &rnd_engine = prnd->GetRndEngine(); + // allocate reserve space if (param_.projection_size.has_value()) { LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; @@ -893,7 +899,8 @@ class RNNOp { hy_ptr, cy_ptr, param_.p, - param_.mode); + param_.mode, + rnd_engine); } else { RNNForwardInference(work_cpu_space, param_.state_outputs, @@ -915,6 +922,7 @@ class RNNOp { param_.mode); } } +#endif } void Backward(const OpContext &ctx, diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index ac5e17d49133..efebc915a0e7 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -183,6 +183,7 @@ static std::vector RNNResourceEx(const NodeAttrs& attrs, const } #endif } else { + request.emplace_back(ResourceRequest::kRandom); #if MXNET_USE_MKLDNN == 1 request.emplace_back(ResourceRequest::kTempSpace); #endif diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 008ba7d315c6..08d069801079 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -139,17 +140,17 @@ void LstmForwardTraining(DType* ws, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - const float dropout) { + const float dropout, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* dropout_random = rs; DType* rs2 = dropout_random + (L - 1) * D * T * N * H; const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int cell_size = N * H; - unsigned int seed_ = 17 + 
rand() % 4096; // NOLINT(runtime/threadsafe_fn) + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t cell_size = N * H; int idx = 0; // state & cell state's idx; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { @@ -174,10 +175,9 @@ void LstmForwardTraining(DType* ws, w_ptr += w_size; b_ptr += b_size; if (dropout > 0.0f) { - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * H * D; j++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + std::uniform_real_distribution distribution(0, 1); + for (index_t j = 0; j < T * N * H * D; j++) { + if (distribution(rnd_engine) < dropout) { dropout_random[i * T * N * H * D + j] = 0; y.dptr_[j] = 0; } else { @@ -995,7 +995,8 @@ void GruForwardTraining(DType* ws, DType* w_ptr, DType* y_ptr, DType* hy_ptr, - const float dropout) { + const float dropout, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* wx = w_ptr; DType* wh = wx + I * H * 3; DType* bx = wh + H * H * 3 + (D - 1) * (H * H * 3 + I * H * 3) @@ -1016,18 +1017,15 @@ void GruForwardTraining(DType* ws, DType* bx_l = bx; DType* bh_l = bh; DType* y_tmp = x_ptr; - unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) for (int l = 0; l < L; l++) { if (l != 0) { y_tmp = y_l; y_l = y_l + T * N * H * D; } if (dropout > 0.0f && l > 0) { - const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + std::uniform_real_distribution distribution(0, 1); + for (index_t i = 0; i < T * N * I; i++) { + if (distribution(rnd_engine) < dropout) { dropout_random[(l - 1) * T * N * I + i] = 0; y_tmp[i] = 0; } 
else { @@ -1884,7 +1882,8 @@ void VanillaRNNForwardTraining(DType* ws, DType* y_ptr, DType* hy_ptr, const float dropout, - int mode) { + int mode, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* wx = w_ptr; DType* wh = wx + I * H; DType* bx = wh + H * H + (D - 1) * (H * H + I * H) @@ -1903,17 +1902,15 @@ void VanillaRNNForwardTraining(DType* ws, DType* bh_l = bh; DType* y_tmp = x_ptr; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) for (int l = 0; l < L; l++) { if (l != 0) { y_tmp = y_l; y_l = y_l + T * N * H * D; } if (dropout > 0.0f && l > 0) { - #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + std::uniform_real_distribution distribution(0, 1); + for (index_t i = 0; i < T * N * I; i++) { + if (distribution(rnd_engine) < dropout) { dropout_random[(l - 1) * T * N * I + i] = 0; y_tmp[i] = 0; } else { diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc index cea92a01e799..e1e3a53e656c 100644 --- a/tests/cpp/engine/threaded_engine_test.cc +++ b/tests/cpp/engine/threaded_engine_test.cc @@ -35,6 +35,7 @@ #include #include #include +#include #include "../src/engine/engine_impl.h" #include "../include/test_util.h" @@ -62,15 +63,18 @@ void GenerateWorkload(int num_workloads, int num_var, std::vector* workloads) { workloads->clear(); workloads->resize(num_workloads); + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution_var(0, num_var - 1); + std::uniform_int_distribution distribution_time(min_time, max_time - 1); + std::uniform_int_distribution distribution_read(min_read, max_read - 1); for (int i = 0; i < num_workloads; ++i) { auto& wl = workloads->at(i); - wl.write = rand_r(&seed_) % num_var; - int r = 
rand_r(&seed_); - int num_read = min_read + (r % (max_read - min_read)); + wl.write = distribution_var(generator); + int num_read = distribution_read(generator); for (int j = 0; j < num_read; ++j) { - wl.reads.push_back(rand_r(&seed_) % num_var); + wl.reads.push_back(distribution_var(generator)); } - wl.time = min_time + rand_r(&seed_) % (max_time - min_time); + wl.time = distribution_time(generator); } } diff --git a/tests/cpp/thread_safety/thread_safety_test.cc b/tests/cpp/thread_safety/thread_safety_test.cc index 1f811d8c3fd7..9566adfd9d13 100644 --- a/tests/cpp/thread_safety/thread_safety_test.cc +++ b/tests/cpp/thread_safety/thread_safety_test.cc @@ -25,15 +25,17 @@ #if MXNET_USE_CPP_PACKAGE == 1 #include #include -#include #include -#include +#include #include #include +#include +#include #include "../src/engine/engine_impl.h" #include "../src/imperative/imperative_utils.h" #include "../include/test_util.h" #include "mxnet-cpp/MxNetCpp.h" + /* * Prepares input data for the ops/models used in this file */ @@ -298,8 +300,10 @@ void run_inference(const std::string& model, unsigned next = num; for (size_t i = 0; i < num_inf_per_thread; ++i) { if (random_sleep) { - int sleep_time = rand_r(&next) % 5; - std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution(0, 5); + int sleep_time = distribution(generator); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; const int *stypes; @@ -479,7 +483,9 @@ void run_inference_unsupported(const std::string& model, unsigned next = num; for (size_t i = 0; i < num_inf_per_thread; ++i) { if (random_sleep) { - int sleep_time = rand_r(&next) % 5; + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution(0, 5); + int sleep_time = distribution(generator); std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; From 
dcdbe3b23e4c64f5d22dccec497a423d96dc0b01 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Thu, 21 May 2020 01:07:55 +0300 Subject: [PATCH 03/14] Fix the URL to the IUS repository --- ci/docker/install/centos7_python.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/install/centos7_python.sh b/ci/docker/install/centos7_python.sh index 06c53bea48c1..796387e1b2ee 100755 --- a/ci/docker/install/centos7_python.sh +++ b/ci/docker/install/centos7_python.sh @@ -23,7 +23,7 @@ set -ex # Python 2.7 is installed by default, install 3.6 on top -yum -y install https://centos7.iuscommunity.org/ius-release.rpm +yum -y install https://repo.ius.io/ius-release-el7.rpm yum -y install python36u # Install PIP From ef0f14366d459db322ed73c1853ef37f9d93b6b8 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 7 Apr 2020 01:36:27 +0000 Subject: [PATCH 04/14] compiler warnings --- 3rdparty/mshadow/mshadow/logging.h | 5 +++++ 3rdparty/mshadow/mshadow/packet-inl.h | 4 ++++ Makefile | 4 +++- ci/docker/runtime_functions.sh | 2 -- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/3rdparty/mshadow/mshadow/logging.h b/3rdparty/mshadow/mshadow/logging.h index 666ca587b3bc..b639308f1c72 100644 --- a/3rdparty/mshadow/mshadow/logging.h +++ b/3rdparty/mshadow/mshadow/logging.h @@ -223,7 +223,12 @@ class LogMessageFatal { ~LogMessageFatal() MSHADOW_THROW_EXCEPTION { // throwing out of destructor is evil // hopefully we can do it here +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wterminate" +#endif throw Error(log_stream_.str()); +#pragma GCC diagnostic pop } private: diff --git a/3rdparty/mshadow/mshadow/packet-inl.h b/3rdparty/mshadow/mshadow/packet-inl.h index 1b3d11a34114..e517c8facf6e 100644 --- a/3rdparty/mshadow/mshadow/packet-inl.h +++ b/3rdparty/mshadow/mshadow/packet-inl.h @@ -93,7 +93,11 @@ inline void* AlignedMallocPitch(size_t *out_pitch, if (res == NULL) { LOG(FATAL) << "AlignedMallocPitch failed"; } +#if 
__GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif return res; +#pragma GCC diagnostic pop } /*! diff --git a/Makefile b/Makefile index c050dae5e45a..ad17675ce583 100644 --- a/Makefile +++ b/Makefile @@ -99,7 +99,9 @@ CFLAGS += -DDMLC_LOG_STACK_TRACE_SIZE=0 CFLAGS += -DDMLC_LOG_FATAL_THROW=1 ifeq ($(DEV), 1) - CFLAGS += -g -Werror + # Excluded from Werror: + # 1) variables used in '#pragma omp parallel' are considered unused + CFLAGS += -g -Werror -Wno-error=unused-variable -Wno-error=maybe-uninitialized -Wno-error=unused-function NVCCFLAGS += -Werror cross-execution-space-call endif diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index ae55c12fb5ac..587a4088fd7d 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -782,7 +782,6 @@ build_ubuntu_gpu_cuda101_cudnn7() { set -ex build_ccache_wrappers make \ - DEV=1 \ USE_BLAS=openblas \ USE_MKLDNN=0 \ USE_CUDA=1 \ @@ -801,7 +800,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() { set -ex build_ccache_wrappers make \ - DEV=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ USE_CUDA=1 \ From 62158acd106af6c7c1ebe8cac53ee182b0636aee Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Thu, 21 May 2020 02:25:35 +0300 Subject: [PATCH 05/14] Use a pre-c++17 way of distinguishing between device types --- src/operator/rnn-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 1bd351fe0a9c..180e945000e4 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -844,7 +844,7 @@ class RNNOp { #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) #if !defined(__CUDACC__) // cuda doesn't support C++17 - if constexpr (std::is_same::value) { + if (ctx_.dev_type == kCPU) { int projection_size = 0; if (param_.projection_size.has_value()) { projection_size = param_.projection_size.value(); From b9bb0195a7870f1926ec9ad12c5ab89d8a6acecd Mon Sep 17 00:00:00 2001 From: Leonard Lausen 
Date: Sat, 11 Apr 2020 22:59:51 +0000 Subject: [PATCH 06/14] Greatly simplify qemu setup --- ci/README.md | 95 +---- ci/dev_menu.py | 2 +- ci/docker/Dockerfile.build.android_armv7 | 4 +- ci/docker/Dockerfile.build.android_armv8 | 4 +- ci/docker/Dockerfile.build.armv6 | 4 +- ci/docker/Dockerfile.build.armv7 | 4 +- ci/docker/Dockerfile.build.armv8 | 4 +- ...t.arm_qemu => Dockerfile.build.test.armv7} | 32 +- ...rm_qemu.sh => Dockerfile.build.test.armv8} | 38 +- ci/docker/install/ubuntu_arm_qemu_bin.sh | 40 -- ci/docker/qemu/README.md | 18 - ci/docker/qemu/runtime_functions.py | 134 ------- ci/docker/qemu/vmcontrol.py | 360 ------------------ ci/docker/runtime_functions.sh | 12 + ci/jenkins/Jenkins_steps.groovy | 40 +- ci/jenkins/Jenkinsfile_edge | 7 +- ci/qemu/README.md | 92 ----- ci/qemu/copy.sh | 23 -- ci/qemu/init.sh | 23 -- ci/qemu/initrd_modif/inittab | 38 -- ci/qemu/install.sh | 32 -- ci/qemu/mxnet_requirements.txt | 7 - ci/qemu/preseed.cfg | 68 ---- ci/qemu/preseed.sh | 29 -- ci/qemu/run.sh | 33 -- ci/qemu/test_requirements.txt | 3 - 26 files changed, 108 insertions(+), 1038 deletions(-) rename ci/docker/{Dockerfile.build.test.arm_qemu => Dockerfile.build.test.armv7} (67%) rename ci/docker/{install/ubuntu_arm_qemu.sh => Dockerfile.build.test.armv8} (60%) mode change 100755 => 100644 delete mode 100755 ci/docker/install/ubuntu_arm_qemu_bin.sh delete mode 100644 ci/docker/qemu/README.md delete mode 100755 ci/docker/qemu/runtime_functions.py delete mode 100644 ci/docker/qemu/vmcontrol.py delete mode 100644 ci/qemu/README.md delete mode 100755 ci/qemu/copy.sh delete mode 100755 ci/qemu/init.sh delete mode 100644 ci/qemu/initrd_modif/inittab delete mode 100755 ci/qemu/install.sh delete mode 100644 ci/qemu/mxnet_requirements.txt delete mode 100644 ci/qemu/preseed.cfg delete mode 100755 ci/qemu/preseed.sh delete mode 100755 ci/qemu/run.sh delete mode 100644 ci/qemu/test_requirements.txt diff --git a/ci/README.md b/ci/README.md index 155a0104a125..7172bd955491 100644 
--- a/ci/README.md +++ b/ci/README.md @@ -111,90 +111,37 @@ significantly. You can set this directory explicitly by setting CCACHE_DIR envir variable. All ccache instances are currently set to be 10 Gigabytes max in size. -## Testing with QEMU -To run the unit tests under qemu: -``` -./build.py -p armv7 && ./build.py -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu -``` - -To get a shell on the container and debug issues with the emulator itself, we build the container -and then execute it interactively. We can afterwards use port 2222 on the host to connect with SSH. - - -``` -ci/build.py -p test.arm_qemu -b && docker run -p2222:2222 -ti mxnetci/build.test.arm_qemu -``` +## Testing with ARM / Edge devices with QEMU -Then from another terminal: +We build on [QEMU](https://www.qemu.org/) and Linux [Kernel Support for +miscellaneous Binary +Formats](https://www.kernel.org/doc/html/v5.6/admin-guide/binfmt-misc.html) for +testing MXNet on edge devices. Test can be invoked with the same syntax as for +non-virtualized platforms: ``` -ssh -o StrictHostKeyChecking=no -p 2222 qemu@localhost +./build.py -p armv7 +./build.py -p test.armv7 /work/runtime_functions.sh unittest_ubuntu_python3_armv7 ``` -There are two pre-configured users: `root` and `qemu` both without passwords. - - -### Example of reproducing a test result with QEMU on ARM - - -You might want to enable a debug build first: - -``` -$ git diff -diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh -index 39631f9..666ceea 100755 ---- a/ci/docker/runtime_functions.sh -+++ b/ci/docker/runtime_functions.sh -@@ -172,6 +172,7 @@ build_armv7() { - -DUSE_LAPACK=OFF \ - -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ -+ -DCMAKE_BUILD_TYPE=Debug \ - -G Ninja /work/mxnet - - ninja -v +For the test step to succeed, you must run Linux kernel 4.8 or later and have qemu installed. 
+On Debian and Ubuntu systems, run the following command to install the dependencies: ``` +sudo apt install binfmt-support qemu-user-static -Then we build the project for armv7, the test container and start QEMU inside docker: - -``` -ci/build.py -p armv7 -ci/build.py -p test.arm_qemu -b && docker run -p2222:2222 -ti mxnetci/build.test.arm_qemu +# Use qemu-binfmt-conf.sh to register all binary types with the kernel +wget https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh +chmod +x qemu-binfmt-conf.sh +sudo ./qemu-binfmt-conf.sh --persistent yes --qemu-suffix "-static" --qemu-path "/usr/bin" --systemd ALL ``` - - -At this point we copy artifacts and sources to the VM, in another terminal (host) do the following: +If you run into segmentation faults at the beginning of the emulated tests, you +probably have a ancient version of Qemu on your system (or found a bug in +upstream Qemu). In that situation, you can rely on the +`multiarch/qemu-user-static` Docker project to register a set of up-to-date Qemu +binaries from their Docker image with your kernel: ``` -# Copy mxnet sources to the VM -rsync --delete -e 'ssh -p2222' --exclude='.git/' -zvaP ./ qemu@localhost:mxnet - - -# Ssh into the vm -ssh -p2222 qemu@localhost - -cd mxnet - -# Execute a single failing C++ test -build/tests/mxnet_unit_tests --gtest_filter="ACTIVATION_PERF.ExecuteBidirectional" - -# To install MXNet: -sudo pip3 install --upgrade --force-reinstall build/mxnet-1.3.1-py2.py3-none-any.whl - -# Execute a single python test: - -nosetests-3.4 -v -s tests/python/unittest/test_ndarray.py - - -# Debug with cgdb -sudo apt install -y libstdc++6-6-dbg -cgdb build/tests/mxnet_unit_tests - -(gdb) !pwd -/home/qemu/mxnet -(gdb) set substitute-path /work /home/qemu -(gdb) set substitute-path /build/gcc-6-6mK9AW/gcc-6-6.3.0/build/arm-linux-gnueabihf/libstdc++-v3/include/ /usr/include/c++/6/ -(gdb) r --gtest_filter="ACTIVATION_PERF.ExecuteBidirectional" +docker run --rm --privileged 
multiarch/qemu-user-static --reset -p yes ``` diff --git a/ci/dev_menu.py b/ci/dev_menu.py index e9f031e1b171..962e4ecfe03f 100755 --- a/ci/dev_menu.py +++ b/ci/dev_menu.py @@ -167,7 +167,7 @@ def provision_virtualenv(venv_path=DEFAULT_PYENV): ('[Docker] Python3 ARMv7 unittests (QEMU)', [ "ci/build.py -p armv7", - "ci/build.py -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu" + "ci/build.py -p test.armv7 /work/runtime_functions.sh unittest_ubuntu_python3_armv7" ]), ('Clean (RESET HARD) repository (Warning! erases local changes / DATA LOSS)', Confirm("ci/docker/runtime_functions.sh clean_repo")) diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 index 96ca04e9f5e6..8d9fb6481e2e 100644 --- a/ci/docker/Dockerfile.build.android_armv7 +++ b/ci/docker/Dockerfile.build.android_armv7 @@ -46,10 +46,10 @@ RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ mkdir /usr/local/openblas-android && \ cd /usr/local/OpenBLAS && \ export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ - make NOFORTRAN=1 ARM_SOFTFP_ABI=1 \ + make NOFORTRAN=1 ARM_SOFTFP_ABI=1 NO_SHARED=1 \ LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x -lm" \ CC=$TOOLCHAIN/bin/armv7a-linux-androideabi16-clang AR=$TOOLCHAIN/bin/arm-linux-androideabi-ar && \ - make PREFIX=/usr/local/openblas-android install && \ + make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \ cd /usr/local && \ rm -rf OpenBLAS ENV OpenBLAS_HOME=/usr/local/openblas-android diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 index 81adc80edf14..a78113a33bae 100644 --- a/ci/docker/Dockerfile.build.android_armv8 +++ b/ci/docker/Dockerfile.build.android_armv8 @@ -46,10 +46,10 @@ RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ mkdir /usr/local/openblas-android && \ 
cd /usr/local/OpenBLAS && \ export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ - make NOFORTRAN=1 \ + make NOFORTRAN=1 NO_SHARED=1 \ LDFLAGS="-L/usr/local/android-ndk-r21/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x -lm" \ CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang AR=$TOOLCHAIN/bin/aarch64-linux-android-ar && \ - make PREFIX=/usr/local/openblas-android install && \ + make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \ cd /usr/local && \ rm -rf OpenBLAS ENV OpenBLAS_HOME=/usr/local/openblas-android diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 index 02e16da11616..83186369d829 100644 --- a/ci/docker/Dockerfile.build.armv6 +++ b/ci/docker/Dockerfile.build.armv6 @@ -50,8 +50,8 @@ ENV CMAKE_TOOLCHAIN_FILE=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/share/ RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ cd /usr/local/OpenBLAS && \ - make NOFORTRAN=1 CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \ - make PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot install && \ + make NOFORTRAN=1 NO_SHARED=1 CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \ + make PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot NO_SHARED=1 install && \ cd /usr/local && \ rm -rf OpenBLAS diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 index a9cc6d1e83a4..d207d79485ae 100644 --- a/ci/docker/Dockerfile.build.armv7 +++ b/ci/docker/Dockerfile.build.armv7 @@ -45,8 +45,8 @@ ENV CMAKE_TOOLCHAIN_FILE=/usr/local/arm-linux-gnueabihf-toolchain.cmake RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ cd /usr/local/OpenBLAS && \ - make NOFORTRAN=1 CC=arm-linux-gnueabihf-gcc && \ - make PREFIX=/usr/local/arm-linux-gnueabihf install && 
\ + make NOFORTRAN=1 NO_SHARED=1 CC=arm-linux-gnueabihf-gcc && \ + make PREFIX=/usr/local/arm-linux-gnueabihf NO_SHARED=1 install && \ cd /usr/local && \ rm -rf OpenBLAS diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 index adf6873fb40c..d318cc2f02d4 100644 --- a/ci/docker/Dockerfile.build.armv8 +++ b/ci/docker/Dockerfile.build.armv8 @@ -45,8 +45,8 @@ ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ cd /usr/local/OpenBLAS && \ - make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ - make PREFIX=/usr/aarch64-linux-gnu install && \ + make NOFORTRAN=1 NO_SHARED=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu NO_SHARED=1 install && \ cd /usr/local && \ rm -rf OpenBLAS diff --git a/ci/docker/Dockerfile.build.test.arm_qemu b/ci/docker/Dockerfile.build.test.armv7 similarity index 67% rename from ci/docker/Dockerfile.build.test.arm_qemu rename to ci/docker/Dockerfile.build.test.armv7 index 5dc610a524b0..d49e7a5582c1 100644 --- a/ci/docker/Dockerfile.build.test.arm_qemu +++ b/ci/docker/Dockerfile.build.test.armv7 @@ -16,22 +16,21 @@ # specific language governing permissions and limitations # under the License. 
# -# Dockerfile to build and run MXNet on Ubuntu 16.04 for CPU +# Dockerfile to test MXNet on Ubuntu 20.04 ARMv7 CPU -FROM ubuntu:16.04 +FROM arm32v7/ubuntu:20.04 -WORKDIR /work +WORKDIR /usr/local -RUN apt-get update -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_arm_qemu.sh /work -RUN /work/ubuntu_arm_qemu.sh - -COPY install/ubuntu_arm_qemu_bin.sh /work -RUN /work/ubuntu_arm_qemu_bin.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3 \ + python3-pip \ + python3-numpy \ + python3-scipy \ + python3-nose \ + python3-nose-timer \ + python3-requests \ + && rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 @@ -39,9 +38,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -COPY qemu/* /work/ - -# SSH to the Qemu VM -EXPOSE 2222/tcp - -CMD ["./runtime_functions.py","run_qemu_interactive"] +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/install/ubuntu_arm_qemu.sh b/ci/docker/Dockerfile.build.test.armv8 old mode 100755 new mode 100644 similarity index 60% rename from ci/docker/install/ubuntu_arm_qemu.sh rename to ci/docker/Dockerfile.build.test.armv8 index 79ab67bfdbe6..bee4d85c6a97 --- a/ci/docker/install/ubuntu_arm_qemu.sh +++ b/ci/docker/Dockerfile.build.test.armv8 @@ -1,5 +1,4 @@ -#!/usr/bin/env bash - +# -*- mode: dockerfile -*- # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,22 +15,27 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# +# Dockerfile to test MXNet on Ubuntu 20.04 ARMv8 CPU + +FROM arm64v8/ubuntu:20.04 -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image +WORKDIR /usr/local -set -exuo pipefail +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3 \ + python3-pip \ + python3-numpy \ + python3-scipy \ + python3-nose \ + python3-nose-timer \ + python3-requests \ + && rm -rf /var/lib/apt/lists/* -apt-get install -y \ - cmake \ - curl \ - wget \ - git \ - qemu \ - qemu-system-arm \ - unzip \ - bzip2 \ - vim-nox \ - toilet +ARG USER_ID=0 +ARG GROUP_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh -pip3 install ipython +COPY runtime_functions.sh /work/ +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/install/ubuntu_arm_qemu_bin.sh b/ci/docker/install/ubuntu_arm_qemu_bin.sh deleted file mode 100755 index d4f81185c169..000000000000 --- a/ci/docker/install/ubuntu_arm_qemu_bin.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -exuo pipefail - -# -# This disk image and kernels for virtual testing with QEMU is generated with some manual OS -# installation steps with the scripts and documentation found in the ci/qemu/ folder. -# -# The image has a base Debian OS and MXNet runtime dependencies installed. -# The root password is empty and there's a "qemu" user without password. SSH access is enabled as -# well. -# -# See also: ci/qemu/README.md -# - -REMOTE="https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data" -curl -f ${REMOTE}/vda_debian_stretch.qcow2.bz2 | bunzip2 > vda.qcow2 -curl -f ${REMOTE}/vmlinuz -o vmlinuz -curl -f ${REMOTE}/initrd.img -o initrd.img - diff --git a/ci/docker/qemu/README.md b/ci/docker/qemu/README.md deleted file mode 100644 index c06b34562b57..000000000000 --- a/ci/docker/qemu/README.md +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - -These are files used in the docker container that runs QEMU diff --git a/ci/docker/qemu/runtime_functions.py b/ci/docker/qemu/runtime_functions.py deleted file mode 100755 index 5a57cb8dae6a..000000000000 --- a/ci/docker/qemu/runtime_functions.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# -*- coding: utf-8 -*- -"""Runtime functions to use in docker / testing""" - -__author__ = 'Pedro Larroy' -__version__ = '0.1' - -import os -import sys -import subprocess -import argparse -import logging -from subprocess import call, check_call, Popen, DEVNULL, PIPE -import time -import sys -import types -import glob -import vmcontrol -from vmcontrol import qemu_ssh, qemu_provision, qemu_rsync_to_host, VM - -def activate_this(base): - import site - import os - import sys - if sys.platform == 'win32': - site_packages = os.path.join(base, 'Lib', 'site-packages') - else: - site_packages = os.path.join(base, 'lib', 'python%s' % sys.version[:3], 'site-packages') - prev_sys_path = list(sys.path) - sys.real_prefix = sys.prefix - sys.prefix = base - # Move the added items to the front of the path: - new_sys_path = [] - for item in list(sys.path): - if item not in prev_sys_path: - new_sys_path.append(item) - sys.path.remove(item) - sys.path[:0] = new_sys_path - - - - -def run_ut_py3_qemu(): - """Run unit tests in the emulator and copy the results back to the host through the mounted - volume in /mxnet""" - from vmcontrol import VM - with VM() as vm: - qemu_provision(vm.ssh_port) - logging.info("execute tests") - qemu_ssh(vm.ssh_port, "./runtime_functions.py", "run_ut_python3_qemu_internal") - qemu_rsync_to_host(vm.ssh_port, "*.xml", "mxnet") - logging.info("copied to host") - logging.info("tests finished, vm shutdown.") - vm.shutdown() - -def run_ut_python3_qemu_internal(): - """this runs inside the vm""" - pkg = glob.glob('mxnet_dist/*.whl')[0] - logging.info("=== NOW Running inside QEMU ===") - logging.info("PIP Installing %s", pkg) - check_call(['sudo', 'pip3', 'install', pkg]) - logging.info("PIP Installing mxnet/test_requirements.txt") - check_call(['sudo', 'pip3', 'install', '-r', 'mxnet/test_requirements.txt']) - logging.info("Running tests in 
mxnet/tests/python/unittest/") - check_call(['nosetests', '--with-timer', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml', '--verbose', 'mxnet/tests/python/unittest/test_engine.py']) - # Example to run a single unit test: - # check_call(['nosetests', '--with-timer', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml', '--verbose', 'mxnet/tests/python/unittest/test_ndarray.py:test_ndarray_fluent']) - - - -def run_qemu_interactive(): - vm = VM(interactive=True) - vm.detach() - vm.start() - vm.wait() - logging.info("QEMU finished") - -################################ - -def parsed_args(): - parser = argparse.ArgumentParser(description="""python runtime functions""", epilog="") - parser.add_argument('command',nargs='*', - help="Name of the function to run with arguments") - args = parser.parse_args() - return (args, parser) - -def script_name() -> str: - return os.path.split(sys.argv[0])[1] - -def chdir_to_script_directory(): - # We need to be in the same directory than the script so the commands in the dockerfiles work as - # expected. 
But the script can be invoked from a different path - base = os.path.split(os.path.realpath(__file__))[0] - os.chdir(base) - -def main(): - logging.getLogger().setLevel(logging.INFO) - logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name())) - chdir_to_script_directory() - - # Run function with name passed as argument - (args, parser) = parsed_args() - logging.info("%s", args.command) - if args.command: - fargs = args.command[1:] - globals()[args.command[0]](*fargs) - return 0 - else: - parser.print_help() - fnames = [x for x in globals() if type(globals()[x]) is types.FunctionType] - print('\nAvailable functions: {}'.format(' '.join(fnames))) - return 1 - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/ci/docker/qemu/vmcontrol.py b/ci/docker/qemu/vmcontrol.py deleted file mode 100644 index 31ef4d2550c3..000000000000 --- a/ci/docker/qemu/vmcontrol.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# -*- coding: utf-8 -*- -"""Utilities to control a guest VM, used for virtual testing with QEMU""" - -__author__ = 'Pedro Larroy' -__version__ = '0.1' - -import os -import sys -import subprocess -import argparse -import logging -from subprocess import call, check_call, Popen, DEVNULL, PIPE -import time -import sys -import multiprocessing -import shlex - -################################################### -# -# Virtual testing with QEMU -# -# We start QEMU instances that have a local port in the host redirected to the ssh port. -# -# The VMs are provisioned after boot, tests are run and then they are stopped -# -QEMU_SSH_PORT=2222 -QEMU_RAM=4096 - -QEMU_RUN=""" -qemu-system-arm -M virt -m {ram} \ - -kernel vmlinuz \ - -initrd initrd.img \ - -append 'root=/dev/vda1' \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::{ssh_port}-:22 \ - -device virtio-net-device,netdev=mynet \ - -display none -nographic -""" - -QEMU_RUN_INTERACTIVE=""" -qemu-system-arm -M virt -m {ram} \ - -kernel vmlinuz \ - -initrd initrd.img \ - -append 'root=/dev/vda1' \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::{ssh_port}-:22 \ - -device virtio-net-device,netdev=mynet \ - -nographic -""" - -def retry(target_exception, tries=4, delay_s=1, backoff=2): - """Retry calling the decorated function using an exponential backoff. - - http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ - original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry - - :param target_exception: the exception to check. may be a tuple of - exceptions to check - :type target_exception: Exception or tuple - :param tries: number of times to try (not retry) before giving up - :type tries: int - :param delay_s: initial delay between retries in seconds - :type delay_s: int - :param backoff: backoff multiplier e.g. 
value of 2 will double the delay - each retry - :type backoff: int - """ - import time - from functools import wraps - - def decorated_retry(f): - @wraps(f) - def f_retry(*args, **kwargs): - mtries, mdelay = tries, delay_s - while mtries > 1: - try: - return f(*args, **kwargs) - except target_exception as e: - logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay) - time.sleep(mdelay) - mtries -= 1 - mdelay *= backoff - return f(*args, **kwargs) - - return f_retry # true decorator - - return decorated_retry - - - - -class VMError(RuntimeError): - pass - -class VM: - """Control of the virtual machine""" - def __init__(self, ssh_port=QEMU_SSH_PORT, ram=QEMU_RAM, interactive=False): - self.log = logging.getLogger(VM.__name__) - self.ssh_port = ssh_port - self.timeout_s = 300 - self.qemu_process = None - self._detach = False - self._interactive = interactive - self.ram = ram - - def __enter__(self): - self.start() - return self - - def __exit__(self, exc_type, exc_value, traceback): - if not self._detach: - self.shutdown() - self.terminate() - - def start(self): - sys.stderr.flush() - call(['toilet', '-f', 'smbraille', 'Starting QEMU']) - sys.stdout.flush() - self.log.info("Starting VM, ssh port redirected to localhost:%s (inside docker, not exposed by default)", self.ssh_port) - if self.is_running(): - raise VMError("VM is running, shutdown first") - if self._interactive: - self.qemu_process = Popen(shlex.split(QEMU_RUN_INTERACTIVE.format(ssh_port=self.ssh_port, ram=self.ram))) - return - else: - self.log.info("Starting in non-interactive mode. 
Terminal output is disabled.") - self.qemu_process = Popen(shlex.split(QEMU_RUN.format(ssh_port=self.ssh_port, ram=self.ram)), stdout=DEVNULL, stdin=DEVNULL, stderr=PIPE) - def keep_waiting(): - return self.is_running() - - logging.info("waiting for ssh to be open in the VM (timeout {}s)".format(self.timeout_s)) - ssh_working = wait_ssh_open('127.0.0.1', self.ssh_port, keep_waiting, self.timeout_s) - - if not self.is_running(): - (_, stderr) = self.qemu_process.communicate() - raise VMError("VM failed to start, retcode: {}, stderr: {}".format( self.retcode(), stderr.decode())) - - if not ssh_working: - if self.is_running(): - self.log.error("VM running but SSH is not working") - self.terminate() - raise VMError("SSH is not working after {} seconds".format(self.timeout_s)) - self.log.info("VM is online and SSH is up") - - def is_running(self): - return self.qemu_process and self.qemu_process.poll() is None - - def retcode(self): - if self.qemu_process: - return self.qemu_process.poll() - else: - raise RuntimeError('qemu process was not started') - - def terminate(self): - if self.qemu_process: - logging.info("send term signal") - self.qemu_process.terminate() - time.sleep(3) - logging.info("send kill signal") - self.qemu_process.kill() - self.qemu_process.wait() - self.qemu_process = None - else: - logging.warn("VM.terminate: QEMU process not running") - - def detach(self): - self._detach = True - - def shutdown(self): - if self.qemu_process: - logging.info("Shutdown via ssh") - # ssh connection will be closed with an error - call(["ssh", "-o", "StrictHostKeyChecking=no", "-p", str(self.ssh_port), "qemu@localhost", - "sudo", "poweroff"]) - ret = self.qemu_process.wait(timeout=90) - self.log.info("VM on port %s has shutdown (exit code %d)", self.ssh_port, ret) - self.qemu_process = None - - def wait(self): - if self.qemu_process: - self.qemu_process.wait() - - def __del__(self): - if self.is_running and not self._detach: - logging.info("VM destructor hit") - 
self.terminate() - - -def qemu_ssh(ssh_port=QEMU_SSH_PORT, *args): - check_call(["ssh", "-o", "ServerAliveInterval=5", "-o", "StrictHostKeyChecking=no", "-p{}".format(ssh_port), "qemu@localhost", *args]) - - -def qemu_rsync(ssh_port, local_path, remote_path): - check_call(['rsync', '-e', 'ssh -o StrictHostKeyChecking=no -p{}'.format(ssh_port), '-a', local_path, 'qemu@localhost:{}'.format(remote_path)]) - -def qemu_rsync_to_host(ssh_port, remote_path, local_path): - check_call(['rsync', '-e', 'ssh -o StrictHostKeyChecking=no -p{}'.format(ssh_port), '-va', 'qemu@localhost:{}'.format(remote_path), local_path]) - - -@retry(subprocess.CalledProcessError) -def qemu_provision(ssh_port=QEMU_SSH_PORT): - import glob - logging.info("Provisioning the VM with artifacts and sources") - - artifact = glob.glob('/work/mxnet/build/*.whl') - for x in artifact: - qemu_rsync(ssh_port, x, 'mxnet_dist/') - qemu_rsync(ssh_port, '/work/runtime_functions.py','') - qemu_rsync(ssh_port, '/work/vmcontrol.py','') - qemu_rsync(ssh_port, 'mxnet/tests', 'mxnet') - qemu_rsync(ssh_port, 'mxnet/ci/qemu/test_requirements.txt', 'mxnet/test_requirements.txt') - logging.info("Provisioning completed successfully.") - - -def wait_ssh_open(server, port, keep_waiting=None, timeout=None): - """ Wait for network service to appear - @param server: host to connect to (str) - @param port: port (int) - @param timeout: in seconds, if None or 0 wait forever - @return: True of False, if timeout is None may return only True or - throw unhandled network exception - """ - import socket - import errno - import time - log = logging.getLogger('wait_ssh_open') - sleep_s = 1 - if timeout: - from time import time as now - # time module is needed to calc timeout shared between two exceptions - end = now() + timeout - - while True: - log.debug("Sleeping for %s second(s)", sleep_s) - time.sleep(sleep_s) - s = socket.socket() - try: - if keep_waiting and not keep_waiting(): - log.debug("keep_waiting() is set and evaluates to 
False") - return False - - if timeout: - next_timeout = end - now() - if next_timeout < 0: - log.debug("connect time out") - return False - else: - log.debug("connect timeout %d s", next_timeout) - s.settimeout(next_timeout) - - log.debug("connect %s:%d", server, port) - s.connect((server, port)) - ret = s.recv(1024).decode() - if ret and ret.startswith('SSH'): - s.close() - log.info("wait_ssh_open: port %s:%s is open and ssh is ready", server, port) - return True - else: - log.debug("Didn't get the SSH banner") - s.close() - - except ConnectionError as err: - log.debug("ConnectionError %s", err) - if sleep_s == 0: - sleep_s = 1 - else: - sleep_s *= 2 - - except socket.gaierror as err: - log.debug("gaierror %s",err) - return False - - except socket.timeout as err: - # this exception occurs only if timeout is set - if timeout: - return False - - except TimeoutError as err: - # catch timeout exception from underlying network library - # this one is different from socket.timeout - raise - - -def wait_port_open(server, port, timeout=None): - """ Wait for network service to appear - @param server: host to connect to (str) - @param port: port (int) - @param timeout: in seconds, if None or 0 wait forever - @return: True of False, if timeout is None may return only True or - throw unhandled network exception - """ - import socket - import errno - import time - sleep_s = 0 - if timeout: - from time import time as now - # time module is needed to calc timeout shared between two exceptions - end = now() + timeout - - while True: - logging.debug("Sleeping for %s second(s)", sleep_s) - time.sleep(sleep_s) - s = socket.socket() - try: - if timeout: - next_timeout = end - now() - if next_timeout < 0: - return False - else: - s.settimeout(next_timeout) - - logging.info("connect %s %d", server, port) - s.connect((server, port)) - - except ConnectionError as err: - logging.debug("ConnectionError %s", err) - if sleep_s == 0: - sleep_s = 1 - - except socket.gaierror as err: - 
logging.debug("gaierror %s",err) - return False - - except socket.timeout as err: - # this exception occurs only if timeout is set - if timeout: - return False - - except TimeoutError as err: - # catch timeout exception from underlying network library - # this one is different from socket.timeout - raise - - else: - s.close() - logging.info("wait_port_open: port %s:%s is open", server, port) - return True - diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 587a4088fd7d..9141a21b2457 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1402,6 +1402,18 @@ test_ubuntu_cpu_python3() { popd } +# QEMU based ARM tests +unittest_ubuntu_python3_arm() { + set -ex + export PYTHONPATH=./python/ + export MXNET_MKLDNN_DEBUG=0 # Ignored if not present + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 + export MXNET_ENABLE_CYTHON=0 + export DMLC_LOG_STACK_TRACE_DEPTH=10 + python3 -m nose --verbose tests/python/unittest/test_engine.py +} + # Functions that run the nightly Tests: #Runs Apache RAT Check on MXNet Source for License Headers diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 3f5fb2503b56..378ae8177ec5 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -26,9 +26,6 @@ utils = load('ci/Jenkinsfile_utils.groovy') mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_lib_cython = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' -// Python wheels -mx_pip = 'build/*.whl' - // mxnet cmake 
libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' mx_cmake_lib_no_tvm_op = 'build/libmxnet.so, build/libmxnet.a, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' @@ -460,27 +457,28 @@ def compile_armv8_jetson_gpu() { }] } -def compile_armv7_cpu() { - return ['ARMv7':{ +def compile_armv6_cpu() { + return ['ARMv6':{ node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv7') { + ws('workspace/build-ARMv6') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('armv7', 'build_armv7', false) - utils.pack_lib('armv7', mx_pip) + utils.docker_run('armv6', 'build_armv6', false) + utils.pack_lib('armv6', mx_lib) } } } }] } -def compile_armv6_cpu() { - return ['ARMv6':{ +def compile_armv7_cpu() { + return ['ARMv7':{ node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv6') { + ws('workspace/build-ARMv7') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('armv6', 'build_armv6', false) + utils.docker_run('armv7', 'build_armv7', false) + utils.pack_lib('armv7', mx_lib) } } } @@ -494,6 +492,7 @@ def compile_armv8_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('armv8', 'build_armv8', false) + utils.pack_lib('armv8', mx_lib) } } } @@ -1361,8 +1360,21 @@ def test_qemu_armv7_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-armv7-qemu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('armv7', mx_pip) - sh "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu" + utils.unpack_and_init('armv7', mx_lib) + 
utils.docker_run('test.armv7', 'unittest_ubuntu_python3_arm', false) + } + } + } + }] +} + +def test_qemu_armv8_cpu() { + return ['ARMv8 QEMU': { + node(NODE_LINUX_CPU) { + ws('workspace/ut-armv8-qemu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.unpack_and_init('armv8', mx_lib) + utils.docker_run('test.armv8', 'unittest_ubuntu_python3_arm', false) } } } diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge index 9d8e01399d7c..9e2abf558dd2 100644 --- a/ci/jenkins/Jenkinsfile_edge +++ b/ci/jenkins/Jenkinsfile_edge @@ -40,11 +40,12 @@ core_logic: { custom_steps.compile_armv8_cpu(), custom_steps.compile_armv8_android_cpu(), custom_steps.compile_armv7_android_cpu() - ]) + ]) utils.parallel_stage('Tests', [ - custom_steps.test_qemu_armv7_cpu() - ]) + custom_steps.test_qemu_armv7_cpu(), + custom_steps.test_qemu_armv8_cpu() + ]) } , failure_handler: { diff --git a/ci/qemu/README.md b/ci/qemu/README.md deleted file mode 100644 index 4beca4a03690..000000000000 --- a/ci/qemu/README.md +++ /dev/null @@ -1,92 +0,0 @@ - - - - - - - - - - - - - - - - - -# QEMU base image creation - -This folder contains scripts and configuration to create a QEMU virtual drive with a debian system. - -The order of execution is: -- `init.sh` to download the installation kernel and ramdisk -- `preseed.sh` to preseed the debian installer so it doesn't ask questions -- `copy.sh` to extract the kernel and ramdisk from the installed system -- `run.sh` to boot the system and fine tune the image - -# Description of the process: - -# Preparing the base image - -First, an installation is made using installer kernel and initrd by using the scripts above. - -# After installation, we extract initrd and kernel from the installation drive - -The commands look like this: - -`virt-copy-out -a hda.qcow2 /boot/initrd.img-4.15.0-30-generic-lpae .` - -In the same way for the kernel. 
- -Then we install packages and dependencies on the qemu image: - -apt install -y sudo python3-dev virtualenv wget libgfortran3 libopenblas-base rsync build-essential -libopenblas-dev libomp5 - -We enable sudo and passwordless logins: - -Add file `/etc/sudoers.d/01-qemu` -With content: -``` -qemu ALL=(ALL) NOPASSWD: ALL -``` - -Edit: `/etc/ssh/sshd_config` - -And set the following options: -``` -PermitEmptyPasswords yes -PasswordAuthentication yes -PermitRootLogin yes -``` - -Disable root and user passwords with `passwd -d` - -Edit ` /etc/pam.d/common-auth` - -Replace `auth [success=1 default=ignore] pam_unix.so nullok_secure` by -``` -auth [success=1 default=ignore] pam_unix.so nullok -``` - -As root to install system wide: - -``` -wget -nv https://bootstrap.pypa.io/get-pip.py -python3 get-pip.py -apt-get clean -``` - -Afterwards install mxnet python3 deps: - -``` -pip3 install -r mxnet_requirements.txt -``` - - -To access qemu control console from tmux: `ctrl-a a c` - -# CI and Testing - -Formally, [runtime_functions.py](https://github.com/apache/incubator-mxnet/blob/master/ci/docker/qemu/runtime_functions.py) would [run](https://github.com/apache/incubator-mxnet/blob/8beea18e3d9835f90b59d3f9de8f9945ac819423/ci/docker/qemu/runtime_functions.py#L81) *pip install -r [mxnet/tests/requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/tests/requirements.txt)*. If the requirements change, there can be an unfortunate side-effect that there are no wheel files for Raspberry Pi for the new requirement. This would trigger a build from source on the emulator, which can take a long time and cause job timeouts. Therefore, we no longer install the `tests/requirements.txt` requirements, but rather rely on [test_requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/ci/qemu/test_requirements.txt) to maintain the requirements for the qemu tests. 
Should any requirements changes lead to a job time out, it is incumbent on the submitter to update the image to include the requirement and unblock ci. diff --git a/ci/qemu/copy.sh b/ci/qemu/copy.sh deleted file mode 100755 index f39a9d083509..000000000000 --- a/ci/qemu/copy.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Extract kernel from image - -set -ex -virt-copy-out -a vda.qcow2 /boot/vmlinuz-3.16.0-6-armmp-lpae /boot/initrd.img-3.16.0-6-armmp-lpae . diff --git a/ci/qemu/init.sh b/ci/qemu/init.sh deleted file mode 100755 index 1698cb10f272..000000000000 --- a/ci/qemu/init.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Download the installer and ramdisk for intallation -set -ex -wget -O installer-vmlinuz http://http.us.debian.org/debian/dists/jessie/main/installer-armhf/current/images/netboot/vmlinuz -wget -O installer-initrd.gz http://http.us.debian.org/debian/dists/jessie/main/installer-armhf/current/images/netboot/initrd.gz diff --git a/ci/qemu/initrd_modif/inittab b/ci/qemu/initrd_modif/inittab deleted file mode 100644 index 064512595fbc..000000000000 --- a/ci/qemu/initrd_modif/inittab +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# /etc/inittab -# busybox init configuration for debian-installer - -# main rc script -::sysinit:/sbin/reopen-console /sbin/debian-installer-startup - -# main setup program -::respawn:/sbin/reopen-console /sbin/debian-installer - -# convenience shells -tty2::askfirst:-/bin/sh -tty3::askfirst:-/bin/sh - -# logging -#tty4::respawn:/usr/bin/tail -f /var/log/syslog - -# Stuff to do before rebooting -::ctrlaltdel:/sbin/shutdown > /dev/null 2>&1 - -# re-exec init on receipt of SIGHUP/SIGUSR1 -::restart:/sbin/init diff --git a/ci/qemu/install.sh b/ci/qemu/install.sh deleted file mode 100755 index 8531b033d074..000000000000 --- a/ci/qemu/install.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex -rm -f vda.qcow2 -sudo ./preseed.sh -qemu-img create -f qcow2 vda.qcow2 10G -qemu-system-arm -M virt -m 1024 \ - -kernel installer-vmlinuz \ - -append BOOT_DEBUG=2,DEBIAN_FRONTEND=noninteractive \ - -initrd installer-initrd_automated.gz \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet \ - -device virtio-net-device,netdev=mynet \ - -nographic -no-reboot diff --git a/ci/qemu/mxnet_requirements.txt b/ci/qemu/mxnet_requirements.txt deleted file mode 100644 index 2ab0fd9612e5..000000000000 --- a/ci/qemu/mxnet_requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -urllib3<1.23,>=1.21.1 -requests<2.19.0,>=2.18.4 -graphviz<0.9.0,>=0.8.1 -numpy>1.16.0,<2.0.0 -mock -nose -nose-timer diff --git a/ci/qemu/preseed.cfg b/ci/qemu/preseed.cfg deleted file mode 100644 index 23a8fc3baebf..000000000000 --- a/ci/qemu/preseed.cfg +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -d-i debian-installer/locale string en_US -d-i keyboard-configuration/xkb-keymap select us -d-i netcfg/get_hostname string debian-qemu -d-i netcfg/get_domain string lab -d-i passwd/root-login boolean true -d-i passwd/root-password password debian -d-i passwd/root-password-again password debian -d-i clock-setup/utc boolean true -d-i mirror/country string US -d-i mirror/https/proxy string -d-i mirror/http/proxy string -d-i mirror/ftp/proxy string -d-i mirror/http/countries select US -d-i mirror/http/hostname string ftp.us.debian.org -d-i mirror/http/mirror select ftp.us.debian.org -d-i localechooser/preferred-locale select en_US.UTF-8 -apt-mirror-setup apt-setup/use_mirror boolean false -apt-mirror-setup apt-setup/mirror/error select Retry -d-i passwd/username string qemu -d-i passwd/user-password password qemu -d-i passwd/user-password-again password qemu -user-setup-udeb passwd/username string qemu -user-setup-udeb passwd/user-fullname string qemu -d-i time/zone string GMT -d-i partman-auto/choose_recipe select atomic -#partman-auto partman-auto/select_disk select /var/lib/partman/devices/=dev=vda -#partman-auto partman-auto/automatically_partition select -#partman-target partman-target/no_root error -#partman-auto partman-auto/init_automatically_partition select 50some_device__________regular -#partman-auto partman-auto/disk string vda -#partman-auto partman-auto/expert_recipe string \ -# boot-root :: \ -# 100 10000 1000000000 ext4 \ -# $primary{ } \ -# lv_name{ root } \ -# method{ format } \ -# format{ } \ -# use_filesystem{ } \ -# filesystem{ ext4 } \ -# mountpoint{ / } . 
-# -#d-i partman-partitioning/confirm_write_new_label boolean true -#d-i partman/choose_partition select finish -#d-i partman/confirm boolean true -#d-i partman/confirm_nooverwrite boolean true -#partman-base partman/choose_partition select 90finish__________finish -#partman-basicfilesystems partman-basicfilesystems/swap_check_failed boolean -d-i popularity-contest/participate boolean false -d-i tasksel/first multiselect SSH server, standard system utilities -d-i debian-installer/main-menu select Finish the installation -d-i debian-installer/exit/poweroff boolean true diff --git a/ci/qemu/preseed.sh b/ci/qemu/preseed.sh deleted file mode 100755 index ad005548fbbe..000000000000 --- a/ci/qemu/preseed.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -rm -rf initrd -mkdir -p initrd -cd initrd -gunzip -c ../installer-initrd.gz | cpio -i -cp ../preseed.cfg . -cp ../initrd_modif/inittab etc/inittab -cp ../initrd_modif/S10syslog lib/debian-installer-startup.d/S10syslog -find . | cpio --create --format 'newc' | gzip -c > ../installer-initrd_automated.gz -echo "Done!" 
diff --git a/ci/qemu/run.sh b/ci/qemu/run.sh deleted file mode 100755 index eeff4e1fdccb..000000000000 --- a/ci/qemu/run.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -disk=${1:-vda.qcow2} -qemu-system-arm -M virt -m 1024 \ - -kernel vmlinuz-3.16.0-6-armmp-lpae \ - -initrd initrd.img-3.16.0-6-armmp-lpae \ - -smp 4 \ - -append 'root=/dev/vda1' \ - -drive if=none,file=$disk,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::2222-:22 \ - -device virtio-net-device,netdev=mynet \ - -nographic -# -display none diff --git a/ci/qemu/test_requirements.txt b/ci/qemu/test_requirements.txt deleted file mode 100644 index 77037d89c673..000000000000 --- a/ci/qemu/test_requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mock -nose -nose-timer \ No newline at end of file From ea2b8d51574ab599bf219a91a4059df53fc6381a Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Fri, 22 May 2020 17:44:59 +0300 Subject: [PATCH 07/14] Request the C++ standard library and extensions --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e630730115a2..92f93e7bb15e 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ if(CMAKE_CROSSCOMPILING) endif() project(mxnet C CXX) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake) # Load config.cmake only if mxnet is not compiled as a dependency of another project From 0b1355ffb746b93a8171c10fcca99f56bc24434e Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Fri, 22 May 2020 19:32:53 +0300 Subject: [PATCH 08/14] Upgrade dmlc-core to resolve build errors --- 3rdparty/dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index b3a4c715bfc3..5df8305fe699 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit b3a4c715bfc37a08f245844a800933f10e47c1ea +Subproject commit 5df8305fe699d3b503d10c60a231ab0223142407 From 567518bc145f7b76c454f3adf2cacee317511be9 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Fri, 22 May 2020 19:53:47 +0300 Subject: [PATCH 09/14] Remove leftovers from C++17 dev type check --- src/operator/rnn-inl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 180e945000e4..ede1d5f4717f 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -843,7 +843,6 @@ class RNNOp { } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) -#if !defined(__CUDACC__) // cuda doesn't support C++17 if (ctx_.dev_type == kCPU) { int projection_size = 0; if (param_.projection_size.has_value()) { @@ -922,7 +921,6 @@ class RNNOp { param_.mode); } } -#endif } void Backward(const OpContext &ctx, From 0a921a4432ac657b47be44af86ab498fee66f964 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Fri, 22 May 2020 23:28:31 +0300 Subject: [PATCH 10/14] Fix CPU-only RRNOp Forward --- src/operator/rnn-inl.h | 111 ++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 56 deletions(-) 
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ede1d5f4717f..fdce937e50d1 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -842,46 +842,65 @@ class RNNOp { #endif // MXNET_USE_CUDNN_GE_7200 } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) +#if !defined(__CUDACC__) + int projection_size = 0; + if (param_.projection_size.has_value()) { + projection_size = param_.projection_size.value(); + } - if (ctx_.dev_type == kCPU) { - int projection_size = 0; + // allocate temp space + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, projection_size, direction, param_.mode); + if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) { + temp_cpu_space_size_ = work_cpu_space_size; + temp_cpu_space_ = NDArray(TShape({static_cast(temp_cpu_space_size_)}), ctx_, + false, in_data[rnn_enum::kData].type_flag_); + temp_init_space_ = true; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); + + if (ctx.is_train || ctx.need_grad) { + mshadow::Random *prnd = ctx.requested[0].get_random(s); + std::mt19937 &rnd_engine = prnd->GetRndEngine(); + + // allocate reserve space if (param_.projection_size.has_value()) { - projection_size = param_.projection_size.value(); + LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; } - // allocate temp space - const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, projection_size, direction, param_.mode); - if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) { - temp_cpu_space_size_ = work_cpu_space_size; - temp_cpu_space_ = NDArray(TShape({static_cast(temp_cpu_space_size_)}), ctx_, + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (!init_space_ || reserve_cpu_space_size_ < r_size) { + 
reserve_cpu_space_size_ = r_size; + reserve_cpu_space_ = NDArray(TShape({static_cast(reserve_cpu_space_size_)}), ctx_, false, in_data[rnn_enum::kData].type_flag_); - temp_init_space_ = true; + init_space_ = true; } - DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); - - if (ctx.is_train || ctx.need_grad) { - mshadow::Random *prnd = ctx.requested[0].get_random(s); - std::mt19937 &rnd_engine = prnd->GetRndEngine(); - - // allocate reserve space - if (param_.projection_size.has_value()) { - LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; - } - - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (!init_space_ || reserve_cpu_space_size_ < r_size) { - reserve_cpu_space_size_ = r_size; - reserve_cpu_space_ = NDArray(TShape({static_cast(reserve_cpu_space_size_)}), ctx_, - false, in_data[rnn_enum::kData].type_flag_); - init_space_ = true; - } - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.data().dptr_); + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.data().dptr_); - RNNForwardTraining(work_cpu_space, - reserve_space_ptr, + RNNForwardTraining(work_cpu_space, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode, + rnd_engine); + } else { + RNNForwardInference(work_cpu_space, param_.state_outputs, param_.num_layers, direction, @@ -889,6 +908,7 @@ class RNNOp { param_.batch_size_, param_.input_size_, param_.state_size, + projection_size, x.dptr_, hx.dptr_, cx_ptr, @@ -897,30 +917,9 @@ class RNNOp { y.dptr_, hy_ptr, cy_ptr, - param_.p, - param_.mode, - rnd_engine); - } else { - RNNForwardInference(work_cpu_space, - param_.state_outputs, - param_.num_layers, - direction, - 
param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - projection_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); - } + param_.mode); } +#endif // !defined(__CUDACC__) } void Backward(const OpContext &ctx, From a5faf8cf02703c873993c3dc77e8f609477105a5 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Sat, 23 May 2020 02:15:09 +0300 Subject: [PATCH 11/14] Change the ARM8 build to work like the ARM7 build --- ci/docker/runtime_functions.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 9141a21b2457..4b544e4b63ef 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -331,20 +331,32 @@ build_armv7() { } build_armv8() { + set -ex + pushd . + cd /work/build + + # Lapack functionality will be included and statically linked to openblas. + # But USE_LAPACK needs to be set to OFF, otherwise the main CMakeLists.txt + # file tries to add -llapack. Lapack functionality though, requires -lgfortran + # to be linked additionally. + build_ccache_wrappers cmake \ -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DCMAKE_CROSSCOMPILING=ON \ -DUSE_CUDA=OFF \ - -DSUPPORT_F16C=OFF \ -DUSE_OPENCV=OFF \ -DUSE_OPENMP=ON \ - -DUSE_LAPACK=OFF \ -DUSE_SIGNAL_HANDLER=ON \ -DCMAKE_BUILD_TYPE=Release \ -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_LAPACK=OFF \ + -DBUILD_CPP_EXAMPLES=OFF \ -G Ninja /work/mxnet + ninja build_wheel + popd } From 36c78b60132c530dff82a0d89c8f746dcfa95846 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Sat, 23 May 2020 19:28:05 +0300 Subject: [PATCH 12/14] Revert "Fix CPU-only RRNOp Forward" This reverts commit 0a921a4432ac657b47be44af86ab498fee66f964. 
--- src/operator/rnn-inl.h | 111 +++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index fdce937e50d1..ede1d5f4717f 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -842,65 +842,46 @@ class RNNOp { #endif // MXNET_USE_CUDNN_GE_7200 } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) -#if !defined(__CUDACC__) - int projection_size = 0; - if (param_.projection_size.has_value()) { - projection_size = param_.projection_size.value(); - } - - // allocate temp space - const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, projection_size, direction, param_.mode); - if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) { - temp_cpu_space_size_ = work_cpu_space_size; - temp_cpu_space_ = NDArray(TShape({static_cast(temp_cpu_space_size_)}), ctx_, - false, in_data[rnn_enum::kData].type_flag_); - temp_init_space_ = true; - } - DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); - if (ctx.is_train || ctx.need_grad) { - mshadow::Random *prnd = ctx.requested[0].get_random(s); - std::mt19937 &rnd_engine = prnd->GetRndEngine(); - - // allocate reserve space + if (ctx_.dev_type == kCPU) { + int projection_size = 0; if (param_.projection_size.has_value()) { - LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; + projection_size = param_.projection_size.value(); } - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (!init_space_ || reserve_cpu_space_size_ < r_size) { - reserve_cpu_space_size_ = r_size; - reserve_cpu_space_ = NDArray(TShape({static_cast(reserve_cpu_space_size_)}), ctx_, + // allocate temp space + const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, 
projection_size, direction, param_.mode); + if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) { + temp_cpu_space_size_ = work_cpu_space_size; + temp_cpu_space_ = NDArray(TShape({static_cast(temp_cpu_space_size_)}), ctx_, false, in_data[rnn_enum::kData].type_flag_); - init_space_ = true; + temp_init_space_ = true; } - DType* reserve_space_ptr = static_cast(reserve_cpu_space_.data().dptr_); + DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); - RNNForwardTraining(work_cpu_space, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode, - rnd_engine); - } else { - RNNForwardInference(work_cpu_space, + if (ctx.is_train || ctx.need_grad) { + mshadow::Random *prnd = ctx.requested[0].get_random(s); + std::mt19937 &rnd_engine = prnd->GetRndEngine(); + + // allocate reserve space + if (param_.projection_size.has_value()) { + LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; + } + + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (!init_space_ || reserve_cpu_space_size_ < r_size) { + reserve_cpu_space_size_ = r_size; + reserve_cpu_space_ = NDArray(TShape({static_cast(reserve_cpu_space_size_)}), ctx_, + false, in_data[rnn_enum::kData].type_flag_); + init_space_ = true; + } + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.data().dptr_); + + RNNForwardTraining(work_cpu_space, + reserve_space_ptr, param_.state_outputs, param_.num_layers, direction, @@ -908,7 +889,6 @@ class RNNOp { param_.batch_size_, param_.input_size_, param_.state_size, - projection_size, x.dptr_, hx.dptr_, cx_ptr, @@ -917,9 +897,30 @@ class RNNOp { y.dptr_, hy_ptr, cy_ptr, - param_.mode); + param_.p, + 
param_.mode, + rnd_engine); + } else { + RNNForwardInference(work_cpu_space, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + projection_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); + } } -#endif // !defined(__CUDACC__) } void Backward(const OpContext &ctx, From 8ae6298bd89d3f0ca7e09d009fe38705e06f3ad8 Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Sat, 23 May 2020 20:20:33 +0300 Subject: [PATCH 13/14] Hack around the lack of constexpr if --- src/operator/rnn-inl.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ede1d5f4717f..5eca5c6c2ecd 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -861,9 +861,13 @@ class RNNOp { DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); if (ctx.is_train || ctx.need_grad) { - mshadow::Random *prnd = ctx.requested[0].get_random(s); - std::mt19937 &rnd_engine = prnd->GetRndEngine(); - + mshadow::Random *prnd = ctx.requested[0].get_random(s); + // Hack: the surrounding if condition would be a constexpr if in C++17. + // Since this branch can only be reached if the xpu == cpu, the cast is valid. + // Using macros with defined(__CUDACC__) instead of the if statement results in errors + // related to unused variables which are declared above. 
+ auto cpu_prnd = reinterpret_cast *>(prnd); + std::mt19937 &rnd_engine = cpu_prnd->GetRndEngine(); // allocate reserve space if (param_.projection_size.has_value()) { LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; From 53147e0a3493cb235269de32809afdbab1df561f Mon Sep 17 00:00:00 2001 From: Nick Guletskii Date: Sat, 23 May 2020 22:07:04 +0300 Subject: [PATCH 14/14] Adjust the list of files to be packed in ARM jobs --- ci/jenkins/Jenkins_steps.groovy | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 378ae8177ec5..98c774b284ec 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -464,7 +464,7 @@ def compile_armv6_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('armv6', 'build_armv6', false) - utils.pack_lib('armv6', mx_lib) + utils.pack_lib('armv6', mx_cmake_lib) } } } @@ -478,7 +478,7 @@ def compile_armv7_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('armv7', 'build_armv7', false) - utils.pack_lib('armv7', mx_lib) + utils.pack_lib('armv7', mx_cmake_lib) } } } @@ -492,7 +492,7 @@ def compile_armv8_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('armv8', 'build_armv8', false) - utils.pack_lib('armv8', mx_lib) + utils.pack_lib('armv8', mx_cmake_lib) } } } @@ -1360,7 +1360,7 @@ def test_qemu_armv7_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-armv7-qemu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('armv7', mx_lib) + utils.unpack_and_init('armv7', mx_cmake_lib) utils.docker_run('test.armv7', 'unittest_ubuntu_python3_arm', false) } } @@ -1373,7 +1373,7 @@ def test_qemu_armv8_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-armv8-qemu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('armv8', mx_lib) + utils.unpack_and_init('armv8', mx_cmake_lib) 
utils.docker_run('test.armv8', 'unittest_ubuntu_python3_arm', false) } }