diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
index e7aeae935d06..8ecc7e193b97 100644
--- a/ci/Jenkinsfile_utils.groovy
+++ b/ci/Jenkinsfile_utils.groovy
@@ -255,6 +255,7 @@ def assign_node_labels(args) {
   //    knowing about the limitations.
   NODE_LINUX_CPU = args.linux_cpu
   NODE_LINUX_GPU = args.linux_gpu
+  NODE_LINUX_GPU_G4 = args.linux_gpu_g4
   NODE_LINUX_GPU_P3 = args.linux_gpu_p3
   NODE_WINDOWS_CPU = args.windows_cpu
   NODE_WINDOWS_GPU = args.windows_gpu
diff --git a/ci/build.py b/ci/build.py
index cbc41218f042..18ad57fbd87c 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -196,8 +196,9 @@ def container_run(docker_client: SafeDockerClient,
 
     # Equivalent command
     docker_cmd_list = [
-        get_docker_binary(nvidia_runtime),
+        "docker",
         'run',
+        "--gpus all" if nvidia_runtime else "",
         "--cap-add",
         "SYS_PTRACE", # Required by ASAN
         '--rm',
diff --git a/ci/docker/Dockerfile.build.ubuntu b/ci/docker/Dockerfile.build.ubuntu
new file mode 100644
index 000000000000..d5933c3bb628
--- /dev/null
+++ b/ci/docker/Dockerfile.build.ubuntu
@@ -0,0 +1,166 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile for Ubuntu based builds.
+#
+# See docker-compose.yml for supported BASE_IMAGE ARGs and targets.
+
+####################################################################################################
+# The Dockerfile uses a dynamic BASE_IMAGE (for example ubuntu:18.04,
+# nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04, etc.).
+# On top of BASE_IMAGE we install all dependencies shared by all MXNet build
+# environments into a "base" target. At the end of this file, we can specialize
+# "base" for specific usecases. The target built by docker can be selected via
+# "--target" option or docker-compose.yml
+####################################################################################################
+ARG BASE_IMAGE
+FROM $BASE_IMAGE AS base
+
+WORKDIR /work/deps
+
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update && \
+    apt-get install -y wget software-properties-common && \
+    wget -qO - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
+    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB | apt-key add - && \
+    apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" && \
+    apt-add-repository "deb https://apt.repos.intel.com/mkl all main" && \
+    apt-get update && \
+    apt-get install -y \
+        ## Utilities
+        curl \
+        unzip \
+        pandoc \
+        ## Development tools
+        build-essential \
+        ninja-build \
+        git \
+        protobuf-compiler \
+        libprotobuf-dev \
+        clang-6.0 \
+        clang-tidy-6.0 \
+        python-yaml \
+        clang-10 \
+        g++ \
+        g++-8 \
+        intel-mkl-2020.0-088 \
+        ## Dependencies
+        libgomp1 \
+        libturbojpeg0-dev \
+        libopenblas-dev \
+        libcurl4-openssl-dev \
+        libatlas-base-dev \
+        libzmq3-dev \
+        liblapack-dev \
+        libopencv-dev \
+        # Caffe
+        caffe-cpu \
+        libcaffe-cpu-dev \
+        ## Frontend languages
+        # Python
+        python3 \
+        python3-pip \
+        python3-nose \
+        python3-nose-timer \
+        # Scala
+        openjdk-8-jdk \
+        openjdk-8-jre \
+        maven \
+        scala \
+        # Clojure
+        clojure \
+        leiningen \
+        # R
+        r-base-core \
+        r-cran-devtools \
+        libcairo2-dev \
+        libxml2-dev \
+        ## Documentation
+        doxygen \
+        pandoc \
+        ## Build-dependencies for ccache 3.7.9
+        gperf \
+        libb2-dev \
+        libzstd-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# ccache 3.7.9 has fixes for caching nvcc outputs
+RUN cd /usr/local/src && \
+    git clone --recursive https://github.com/ccache/ccache.git && \
+    cd ccache && \
+    git checkout v3.7.9 && \
+    ./autogen.sh && \
+    ./configure --disable-man && \
+    make -j$(nproc) && \
+    make install && \
+    cd /usr/local/src && \
+    rm -rf ccache
+
+# Python & cmake
+COPY install/requirements /work/
+RUN python3 -m pip install cmake==3.16.6 && \
+    python3 -m pip install -r /work/requirements
+
+# Only OpenJDK 8 is supported at this time.
+RUN update-java-alternatives -s java-1.8.0-openjdk-amd64
+
+# julia not available on 18.04
+COPY install/ubuntu_julia.sh /work/
+RUN /work/ubuntu_julia.sh
+
+# PDL::CCS missing on 18.04
+COPY install/ubuntu_perl.sh /work/
+RUN /work/ubuntu_perl.sh
+
+# MXNetJS nightly needs emscripten for wasm
+COPY install/ubuntu_emscripten.sh /work/
+RUN /work/ubuntu_emscripten.sh
+
+ARG USER_ID=0
+COPY install/docker_filepermissions.sh /work/
+RUN /work/docker_filepermissions.sh
+
+ENV PYTHONPATH=./python/
+WORKDIR /work/mxnet
+
+COPY runtime_functions.sh /work/
+
+####################################################################################################
+# Specialize base image to install more gpu specific dependencies.
+# The target built by docker can be selected via "--target" option or docker-compose.yml
+####################################################################################################
+FROM base AS gpu
+# Install Thrust 1.9.8, the version that is to be shipped with CUDA 11.
+# Fixes https://github.com/thrust/thrust/issues/1072 for Clang 10.
+# This step can be deleted once CI uses CUDA 11.
+RUN cd /usr/local && \
+    git clone https://github.com/thrust/thrust.git && \
+    cd thrust && \
+    git checkout 1.9.8
+
+
+FROM gpu AS gpuwithcudaruntimelibs
+# Special case: the CPP-Package requires the CUDA runtime libs
+# and not only the stubs (which are provided by the base image).
+# This prevents usage of this image for actual GPU tests with Docker.
+# This is a bug in CPP-Package and should be fixed.
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        cuda-10-1 && \
+    rm -rf /var/lib/apt/lists/*
diff --git a/ci/docker/docker-compose.yml b/ci/docker/docker-compose.yml
new file mode 100644
index 000000000000..ca00f9ff86bf
--- /dev/null
+++ b/ci/docker/docker-compose.yml
@@ -0,0 +1,208 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# We use the cache_from and target build options; target requires file format version 3.4 (released 2017-11-01)
+version: "3.4"
+
+# For simplicity, only the centos7_cpu is commented. But the comments apply to
+# all other services as well.
+services:
+  ###################################################################################################
+  # Dockerfile.build.centos7 based images used for building on CentOS7. On
+  # CentOS7, we test the oldest supported toolchain and dependency
+  # versions.
+  ###################################################################################################
+  centos7_cpu:
+    # The resulting image will be named build.centos7_cpu:latest and will be
+    # pushed to the dockerhub user specified in the environment variable
+    # ${DOCKER_CACHE_REGISTRY} (typically "mxnetci") under this name
+    image: ${DOCKER_CACHE_REGISTRY}/build.centos7_cpu:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.centos7
+      # Use "base" target declared in Dockerfile.build.centos7 as "build.centos7_cpu:latest"  
+      target: base
+      args:
+        # BASE_IMAGE is used to dynamically specify the FROM image in Dockerfile.build.centos7
+        BASE_IMAGE: centos:7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.centos7_cpu:latest
+  centos7_gpu_cu92:
+    image: ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu92:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.centos7
+      target: gpu
+      args:
+        BASE_IMAGE: nvidia/cuda:9.2-cudnn7-devel-centos7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu92:latest
+  centos7_gpu_cu100:
+    image: ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu100:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.centos7
+      target: gpu
+      args:
+        BASE_IMAGE: nvidia/cuda:10.0-cudnn7-devel-centos7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu100:latest
+  centos7_gpu_cu101:
+    image: ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu101:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.centos7
+      target: gpu
+      args:
+        BASE_IMAGE: nvidia/cuda:10.1-cudnn7-devel-centos7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu101:latest
+  centos7_gpu_cu102:
+    image: ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu102:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.centos7
+      target: gpu
+      args:
+        BASE_IMAGE: nvidia/cuda:10.2-cudnn7-devel-centos7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu102:latest
+  ###################################################################################################
+  # Dockerfile.build.ubuntu based images. On Ubuntu we test more recent
+  # toolchain and dependency versions compared to CentOS7. We attempt to update
+  # the Ubuntu base image every 6 months, following the Ubuntu release cycle,
+  # and testing the dependencies in their version provided by the respective
+  # Ubuntu release.
+  ###################################################################################################
+  ubuntu_cpu:
+    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_cpu:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.ubuntu
+      target: base
+      args:
+        BASE_IMAGE: ubuntu:18.04
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_cpu:latest
+  ubuntu_gpu_cu101:
+    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu101:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.ubuntu
+      target: gpu
+      args:
+        BASE_IMAGE: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu101:latest
+  ubuntu_build_cuda:
+    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_build_cuda:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.ubuntu
+      target: gpuwithcudaruntimelibs
+      args:
+        BASE_IMAGE: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_build_cuda:latest
+  ###################################################################################################
+  # Dockerfile.build.arm based images used for testing cross-compilation for plain ARM
+  ###################################################################################################
+  armv6:
+    image: ${DOCKER_CACHE_REGISTRY}/build.armv6:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.arm
+      target: armv6
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.armv6:latest
+  armv7:
+    image: ${DOCKER_CACHE_REGISTRY}/build.armv7:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.arm
+      target: armv7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.armv7:latest
+  armv8:
+    image: ${DOCKER_CACHE_REGISTRY}/build.armv8:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.arm
+      target: armv8
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.armv8:latest
+  ###################################################################################################
+  # Dockerfile.test.arm based images for testing ARM artefacts via QEMU
+  ###################################################################################################
+  test.armv7:
+    image: ${DOCKER_CACHE_REGISTRY}/test.armv7:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.test.arm
+      args:
+        BASE_IMAGE: arm32v7/ubuntu:20.04
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/test.armv7:latest
+  test.armv8:
+    image: ${DOCKER_CACHE_REGISTRY}/test.armv8:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.test.arm
+      args:
+        BASE_IMAGE: arm64v8/ubuntu:20.04
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/test.armv8:latest
+  ###################################################################################################
+  # Dockerfile.build.android based images used for testing cross-compilation for Android
+  ###################################################################################################
+  android_armv7:
+    image: ${DOCKER_CACHE_REGISTRY}/build.android_armv7:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.android
+      target: armv7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.android_armv7:latest
+  android_armv8:
+    image: ${DOCKER_CACHE_REGISTRY}/build.android_armv8:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.build.android
+      target: armv8
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/build.android_armv8:latest
+  ###################################################################################################
+  # Dockerfile.publish.test based images used for testing binary artifacts on minimal systems.
+  ###################################################################################################
+  publish.test.centos7_cpu:
+    image: ${DOCKER_CACHE_REGISTRY}/publish.test.centos7_cpu:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.publish.test.centos7
+      args:
+        BASE_IMAGE: centos:7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/publish.test.centos7_cpu:latest
+  publish.test.centos7_gpu:
+    image: ${DOCKER_CACHE_REGISTRY}/publish.test.centos7_gpu:latest
+    build:
+      context: .
+      dockerfile: Dockerfile.publish.test.centos7
+      args:
+        BASE_IMAGE: nvidia/cuda:9.2-cudnn7-devel-centos7
+      cache_from:
+        - ${DOCKER_CACHE_REGISTRY}/publish.test.centos7_gpu:latest
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 98c774b284ec..c34223bef288 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -143,7 +143,7 @@ def compile_unix_int64_cpu() {
 
 def compile_unix_int64_gpu() {
     return ['GPU: USE_INT64_TENSOR_SIZE': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/build-gpu-int64') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
@@ -750,7 +750,7 @@ def test_unix_python3_mkl_cpu() {
 
 def test_unix_python3_gpu() {
     return ['Python3: GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-python3-gpu') {
           try {
             utils.unpack_and_init('gpu', mx_lib_cython)
@@ -866,7 +866,7 @@ def test_unix_python3_mkldnn_mkl_cpu() {
 
 def test_unix_python3_mkldnn_gpu() {
     return ['Python3: MKLDNN-GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-python3-mkldnn-gpu') {
           try {
             utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib)
@@ -882,7 +882,7 @@ def test_unix_python3_mkldnn_gpu() {
 
 def test_unix_python3_mkldnn_nocudnn_gpu() {
     return ['Python3: MKLDNN-GPU-NOCUDNN': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-python3-mkldnn-gpu-nocudnn') {
           try {
             utils.unpack_and_init('mkldnn_gpu_nocudnn', mx_mkldnn_lib)
@@ -916,7 +916,7 @@ def test_unix_python3_tensorrt_gpu() {
 
 def test_unix_python3_integration_gpu() {
     return ['Python Integration GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/it-python-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib)
@@ -928,24 +928,9 @@ def test_unix_python3_integration_gpu() {
     }]
 }
 
-def test_unix_caffe_gpu() {
-    return ['Caffe GPU': {
-        node(NODE_LINUX_GPU) {
-            ws('workspace/it-caffe') {
-            timeout(time: max_time, unit: 'MINUTES') {
-                utils.init_git()
-                utils.unpack_lib('gpu', mx_lib)
-                utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_caffe', true)
-                utils.publish_test_coverage()
-            }
-            }
-        }
-    }]
-}
-
 def test_unix_cpp_package_gpu() {
-    return ['cpp-package GPU': {
-      node(NODE_LINUX_GPU) {
+    return ['cpp-package GPU Makefile': {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/it-cpp-package') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib_cpp_examples)
@@ -958,8 +943,8 @@ def test_unix_cpp_package_gpu() {
 }
 
 def test_unix_capi_cpp_package() {
-    return ['capi-cpp-package GPU': {
-      node(NODE_LINUX_GPU) {
+    return ['capi-cpp-package GPU Makefile': {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/it-capi-cpp-package') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu_mkldnn_cpp_test', mx_lib_cpp_capi)
@@ -1000,8 +985,8 @@ def test_unix_scala_mkldnn_cpu(){
 }
 
 def test_unix_scala_gpu() {
-    return ['Scala: GPU': {
-      node(NODE_LINUX_GPU) {
+    return ['Scala: GPU Makefile': {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-scala-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib)
@@ -1084,7 +1069,7 @@ def test_unix_perl_cpu() {
 
 def test_unix_cpp_gpu() {
     return ['Cpp: GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-cpp-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('cmake_gpu', mx_cmake_lib)
@@ -1125,8 +1110,8 @@ def test_unix_cpp_cpu() {
 }
 
 def test_unix_perl_gpu() {
-    return ['Perl: GPU': {
-      node(NODE_LINUX_GPU) {
+    return ['Perl: GPU Makefile': {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-perl-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib)
@@ -1140,7 +1125,7 @@ def test_unix_perl_gpu() {
 
 def test_unix_r_gpu() {
     return ['R: GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/ut-r-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib)
@@ -1208,7 +1193,7 @@ def test_unix_distributed_kvstore_cpu() {
 
 def test_unix_distributed_kvstore_gpu() {
     return ['dist-kvstore tests GPU': {
-      node(NODE_LINUX_GPU) {
+      node(NODE_LINUX_GPU_G4) {
         ws('workspace/it-dist-kvstore') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('gpu', mx_lib)
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index f8c28d5d1994..a9feae158311 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -29,7 +29,7 @@ node('utility') {
   utils = load('ci/Jenkinsfile_utils.groovy')
   custom_steps = load('ci/jenkins/Jenkins_steps.groovy')
 }
-utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3')
+utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4')
 
 utils.main_wrapper(
 core_logic: {