diff --git a/3rdparty/mshadow/mshadow/tensor.h b/3rdparty/mshadow/mshadow/tensor.h
index f8f63388155c..8b24d1fa6136 100644
--- a/3rdparty/mshadow/mshadow/tensor.h
+++ b/3rdparty/mshadow/mshadow/tensor.h
@@ -108,7 +108,10 @@ struct Shape {
    * \return the corresponding dimension size
    */
   MSHADOW_XINLINE const index_t &operator[](int idx) const {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
     return shape_[idx];
+#pragma GCC diagnostic pop
   }
   /*!
    * \return whether two shape equals
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7d34d5f1935e..dd1206adeab9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,6 @@ option(USE_JEMALLOC "Build with Jemalloc support" OFF)
 option(USE_LIBJPEG_TURBO "Use libjpeg-turbo" OFF)
 option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF)
 option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF)
-option(USE_CPP_PACKAGE "Build C++ Package" OFF)
 option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
 option(USE_GPROF "Compile with gprof (profiling) flag" OFF)
 option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
diff --git a/ci/docker/Dockerfile.build.ubuntu b/ci/docker/Dockerfile.build.ubuntu
index e68e2f94bc02..73494a6784a8 100644
--- a/ci/docker/Dockerfile.build.ubuntu
+++ b/ci/docker/Dockerfile.build.ubuntu
@@ -21,8 +21,8 @@
 # See docker-compose.yml for supported BASE_IMAGE ARGs and targets.
 ####################################################################################################
-# The Dockerfile uses a dynamic BASE_IMAGE (for example ubuntu:18.04
-# nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 etc).
+# The Dockerfile uses a dynamic BASE_IMAGE (for example ubuntu:20.04
+# nvidia/cuda:11.1-cudnn8-devel-ubuntu20.04 etc).
 # On top of BASE_IMAGE we install all dependencies shared by all MXNet build
 # environments into a "base" target. At the end of this file, we can specialize
 # "base" for specific usecases. The target built by docker can be selected via
@@ -36,9 +36,7 @@ WORKDIR /work/deps
 RUN export DEBIAN_FRONTEND=noninteractive && \
     apt-get update && \
     apt-get install -y wget software-properties-common && \
-    wget -qO - http://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
     wget -qO - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB | apt-key add - && \
-    apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" && \
     apt-add-repository "deb https://apt.repos.intel.com/mkl all main" && \
     apt-get update && \
     apt-get install -y \
@@ -47,6 +45,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
         unzip \
         pandoc \
         ## Development tools
+        cmake \
         build-essential \
         ninja-build \
         git \
@@ -123,43 +122,33 @@ COPY runtime_functions.sh /work/
 # The target built by docker can be selected via "--target" option or docker-compose.yml
 ####################################################################################################
 FROM base as gpu
-# Install Thrust 1.9.8 to be shipped with Cuda 11.
-# Fixes https://github.com/thrust/thrust/issues/1072 for Clang 10
-# This file can be deleted when using Cuda 11 on CI
-RUN cd /usr/local && \
-    git clone https://github.com/thrust/thrust.git && \
-    cd thrust && \
-    git checkout 1.9.8
-# Install TensorRT
+# Install TensorRT and cuDNN
+# Use bash as it has better support for string comparisons in if clauses
+SHELL ["/bin/bash", "-c"]
 # We need to redeclare ARG due to
 # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
 ARG BASE_IMAGE
-# due to issue https://gitlab.com/nvidia/container-images/cuda/-/issues/92, we
-# get the cuda version from installed deb package if CUDA_VERSION is not set
-RUN if [ -z "$CUDA_VERSION" ]; then export CUDA_VERSION=$( \
-    dpkg --list cuda-libraries*| grep "cuda-libraries-dev-" | awk '{print $3}' | sed 's/-1$//'); \
-    fi && export SHORT_CUDA_VERSION=${CUDA_VERSION%.*} && \
+RUN export SHORT_CUDA_VERSION=${CUDA_VERSION%.*} && \
+    export OS_RELEASE="$(cat /etc/os-release)" && \
     apt-get update && \
-    if [ ${SHORT_CUDA_VERSION} = 10.0 ]; then \
-        TRT_VERSION="7.0.0-1+cuda10.0"; \
-        TRT_MAJOR_VERSION=7; \
-    elif [ ${SHORT_CUDA_VERSION} = 10.1 ]; then \
-        TRT_VERSION="6.0.1-1+cuda10.1"; \
-        TRT_MAJOR_VERSION=6; \
-    elif [ ${SHORT_CUDA_VERSION} = 10.2 ]; then \
-        TRT_VERSION="7.0.0-1+cuda10.2"; \
-        TRT_MAJOR_VERSION=7; \
-    elif [ ${SHORT_CUDA_VERSION} = 11.0 ]; then \
-        TRT_VERSION="7.2.0-1+cuda11.0"; \
-        TRT_MAJOR_VERSION=7; \
-    else \
-        echo "ERROR: Cuda ${SHORT_CUDA_VERSION} not yet supported in Dockerfile.build.ubuntu"; \
-        exit 1; \
+    if [[ ${OS_RELEASE} == *"Bionic"* ]]; then \
+        if [ ${SHORT_CUDA_VERSION} = 11.0 ]; then \
+            TRT_VERSION="7.2.0-1+cuda11.0"; \
+            TRT_MAJOR_VERSION=7; \
+        elif [ ${SHORT_CUDA_VERSION} = 11.1 ]; then \
+            TRT_VERSION="7.2.1-1+cuda11.1"; \
+            TRT_MAJOR_VERSION=7; \
+        else \
+            echo "ERROR: Cuda ${SHORT_CUDA_VERSION} not yet supported in Dockerfile.build.ubuntu"; \
+            exit 1; \
+        fi; \
+        apt-get install -y libnvinfer${TRT_MAJOR_VERSION}=${TRT_VERSION} \
+            libnvinfer-dev=${TRT_VERSION} \
+            libnvinfer-plugin${TRT_MAJOR_VERSION}=${TRT_VERSION} \
+            libnvinfer-plugin-dev=${TRT_VERSION}; \
     fi && \
-    apt-get install -y libnvinfer${TRT_MAJOR_VERSION}=${TRT_VERSION} \
-        libnvinfer-dev=${TRT_VERSION} \
-        libnvinfer-plugin${TRT_MAJOR_VERSION}=${TRT_VERSION} \
-        libnvinfer-plugin-dev=${TRT_VERSION} && \
+    apt-get install -y libcudnn8-dev && \
     rm -rf /var/lib/apt/lists/*
+ENV CUDNN_VERSION=8.0.5
diff --git a/ci/docker/docker-compose.yml b/ci/docker/docker-compose.yml
index c7a11c499fdf..6ebc6a0e025e 100644
--- a/ci/docker/docker-compose.yml
+++ b/ci/docker/docker-compose.yml
@@ -85,49 +85,29 @@ services:
       dockerfile: Dockerfile.build.ubuntu
       target: base
       args:
-        BASE_IMAGE: ubuntu:18.04
+        BASE_IMAGE: ubuntu:20.04
       cache_from:
         - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_cpu:latest
-  ubuntu_gpu_cu101:
-    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu101:latest
+  ubuntu_tensorrt_cu111:
+    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_tensorrt_cu111:latest
     build:
       context: .
       dockerfile: Dockerfile.build.ubuntu
       target: gpu
       args:
-        BASE_IMAGE: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+        BASE_IMAGE: nvidia/cuda:11.1-devel-ubuntu18.04
       cache_from:
-        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu101:latest
-  ubuntu_gpu_cu102:
-    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu102:latest
+        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_tensorrt_cu111:latest
+  ubuntu_gpu_cu111:
+    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu111:latest
     build:
       context: .
       dockerfile: Dockerfile.build.ubuntu
       target: gpu
       args:
-        BASE_IMAGE: nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
+        BASE_IMAGE: nvidia/cuda:11.1-devel-ubuntu20.04
       cache_from:
-        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu102:latest
-  ubuntu_gpu_cu110:
-    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu110:latest
-    build:
-      context: .
-      dockerfile: Dockerfile.build.ubuntu
-      target: gpu
-      args:
-        BASE_IMAGE: nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04
-      cache_from:
-        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu110:latest
-  ubuntu_build_cuda:
-    image: ${DOCKER_CACHE_REGISTRY}/build.ubuntu_build_cuda:latest
-    build:
-      context: .
-      dockerfile: Dockerfile.build.ubuntu
-      target: gpu
-      args:
-        BASE_IMAGE: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-      cache_from:
-        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_build_cuda:latest
+        - ${DOCKER_CACHE_REGISTRY}/build.ubuntu_gpu_cu111:latest
 ###################################################################################################
 # Dockerfile.build.android based images used for testing cross-compilation for plain ARM
 ###################################################################################################
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 88e77a77a73e..1af9d878cb0f 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -314,11 +314,10 @@ build_ubuntu_cpu() {
 build_ubuntu_cpu_openblas() {
     set -ex
     cd /work/build
-    CXXFLAGS="-Wno-error=strict-overflow" CC=gcc-7 CXX=g++-7 cmake \
+    CXXFLAGS="-Wno-error=strict-overflow" cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DENABLE_TESTCOVERAGE=ON \
         -DUSE_TVM_OP=ON \
-        -DUSE_CPP_PACKAGE=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLDNN=OFF \
         -DUSE_CUDA=OFF \
@@ -332,7 +331,7 @@ build_ubuntu_cpu_openblas() {
 build_ubuntu_cpu_mkl() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DENABLE_TESTCOVERAGE=OFF \
         -DUSE_MKLDNN=OFF \
@@ -348,7 +347,7 @@ build_ubuntu_cpu_mkl() {
 build_ubuntu_cpu_cmake_debug() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE=Debug \
         -DENABLE_TESTCOVERAGE=ON \
         -DUSE_CUDA=OFF \
@@ -365,7 +364,7 @@ build_ubuntu_cpu_cmake_debug() {
 build_ubuntu_cpu_cmake_no_tvm_op() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
        -DUSE_CUDA=OFF \
        -DUSE_TVM_OP=OFF \
        -DUSE_MKL_IF_AVAILABLE=OFF \
@@ -384,8 +383,6 @@ build_ubuntu_cpu_cmake_asan() {
     set -ex
     cd /work/build
-    export CXX=g++-8
-    export CC=gcc-8
     cmake \
         -DUSE_CUDA=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
@@ -396,8 +393,6 @@ build_ubuntu_cpu_cmake_asan() {
         -DUSE_GPERFTOOLS=OFF \
         -DUSE_JEMALLOC=OFF \
         -DUSE_ASAN=ON \
-        -DUSE_CPP_PACKAGE=ON \
-        -DMXNET_USE_CPU=ON \
         /work/mxnet
     make -j $(nproc) mxnet
 }
@@ -405,11 +400,9 @@ build_ubuntu_cpu_cmake_asan() {
 build_ubuntu_cpu_gcc8_werror() {
     set -ex
     cd /work/build
-    CXX=g++-8 CC=gcc-8 cmake \
+    CC=gcc-8 CXX=g++-8 cmake \
         -DUSE_CUDA=OFF \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-        -DUSE_CPP_PACKAGE=ON \
-        -DMXNET_USE_CPU=ON \
         -GNinja /work/mxnet
     ninja
 }
@@ -420,8 +413,6 @@ build_ubuntu_cpu_clang10_werror() {
     CXX=clang++-10 CC=clang-10 cmake \
         -DUSE_CUDA=OFF \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-        -DUSE_CPP_PACKAGE=ON \
-        -DMXNET_USE_CPU=ON \
         -GNinja /work/mxnet
     ninja
 }
@@ -438,9 +429,9 @@ build_ubuntu_gpu_clang10_werror() {
     CXX=clang++-10 CC=clang-10 cmake \
         -DUSE_CUDA=ON \
+        -DUSE_NVML=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-        -DUSE_CPP_PACKAGE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -454,7 +445,6 @@ build_ubuntu_cpu_clang6() {
         -DUSE_CUDA=OFF \
         -DUSE_OPENMP=OFF \
         -DUSE_DIST_KVSTORE=ON \
-        -DUSE_CPP_PACKAGE=ON \
         -G Ninja /work/mxnet
     ninja
 }
@@ -468,7 +458,6 @@ build_ubuntu_cpu_clang100() {
         -DUSE_CUDA=OFF \
         -DUSE_OPENMP=ON \
         -DUSE_DIST_KVSTORE=ON \
-        -DUSE_CPP_PACKAGE=ON \
         -G Ninja /work/mxnet
     ninja
 }
@@ -484,7 +473,6 @@ build_ubuntu_cpu_clang_tidy() {
         -DUSE_OPENMP=OFF \
         -DCMAKE_BUILD_TYPE=Debug \
         -DUSE_DIST_KVSTORE=ON \
-        -DUSE_CPP_PACKAGE=ON \
         -DCMAKE_CXX_CLANG_TIDY=clang-tidy-10 \
         -G Ninja /work/mxnet
     ninja
@@ -497,7 +485,6 @@ build_ubuntu_cpu_clang6_mkldnn() {
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -DUSE_OPENMP=OFF \
         -G Ninja /work/mxnet
     ninja
@@ -510,7 +497,6 @@ build_ubuntu_cpu_clang100_mkldnn() {
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -G Ninja /work/mxnet
     ninja
 }
@@ -518,14 +504,13 @@ build_ubuntu_cpu_clang100_mkldnn() {
 build_ubuntu_cpu_mkldnn() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DENABLE_TESTCOVERAGE=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_TVM_OP=ON \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -G Ninja /work/mxnet
     ninja
@@ -534,29 +519,24 @@ build_ubuntu_cpu_mkldnn() {
 build_ubuntu_cpu_mkldnn_mkl() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DENABLE_TESTCOVERAGE=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
         -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=ON \
+        -DMKL_USE_STATIC_LIBS=OFF \
         -DUSE_BLAS=MKL \
         -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -GNinja /work/mxnet
     ninja
 }
 
-build_ubuntu_gpu() {
-    build_ubuntu_gpu_cuda101_cudnn7
-}
-
 build_ubuntu_gpu_tensorrt() {
     set -ex
-    export CC=gcc-7
-    export CXX=g++-7
     export ONNX_NAMESPACE=onnx
     # Build ONNX
@@ -597,6 +577,7 @@ build_ubuntu_gpu_tensorrt() {
         -DUSE_TENSORRT=1 \
         -DUSE_OPENMP=0 \
         -DUSE_MKLDNN=0 \
+        -DUSE_NVML=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -G Ninja \
@@ -608,12 +589,12 @@ build_ubuntu_gpu_tensorrt() {
 build_ubuntu_gpu_mkldnn() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_CUDA=ON \
+        -DUSE_NVML=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DUSE_CPP_PACKAGE=ON \
         -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -G Ninja /work/mxnet
     ninja
@@ -622,29 +603,29 @@ build_ubuntu_gpu_mkldnn() {
 build_ubuntu_gpu_mkldnn_nocudnn() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_CUDA=ON \
+        -DUSE_NVML=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -G Ninja /work/mxnet
     ninja
 }
 
-build_ubuntu_gpu_cuda101_cudnn7() {
+build_ubuntu_gpu() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_CUDA=ON \
+        -DUSE_NVML=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=ON \
         -DUSE_MKLDNN=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -DUSE_DIST_KVSTORE=ON \
         -DBUILD_CYTHON_MODULES=ON \
         -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
@@ -652,47 +633,27 @@
     ninja
 }
 
-build_ubuntu_gpu_cuda101_cudnn7_debug() {
+build_ubuntu_gpu_debug() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DCMAKE_BUILD_TYPE=Debug \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_CUDA=ON \
+        -DUSE_NVML=OFF \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=ON \
         -DUSE_MKLDNN=OFF \
-        -DUSE_CPP_PACKAGE=ON \
         -DUSE_DIST_KVSTORE=ON \
         -DBUILD_CYTHON_MODULES=ON \
         -G Ninja /work/mxnet
     ninja
 }
 
-build_ubuntu_gpu_cmake() {
-    set -ex
-    cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
-        -DUSE_SIGNAL_HANDLER=ON \
-        -DUSE_CUDA=ON \
-        -DUSE_CUDNN=ON \
-        -DUSE_MKL_IF_AVAILABLE=OFF \
-        -DUSE_MKLML_MKL=OFF \
-        -DUSE_MKLDNN=OFF \
-        -DUSE_DIST_KVSTORE=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DBUILD_CYTHON_MODULES=1 \
-        -G Ninja \
-        /work/mxnet
-
-    ninja
-}
-
 build_ubuntu_cpu_large_tensor() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=OFF \
         -DUSE_CUDNN=OFF \
@@ -706,12 +667,12 @@ build_ubuntu_cpu_large_tensor() {
 build_ubuntu_gpu_large_tensor() {
     set -ex
     cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
+    cmake \
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_NVML=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
-        -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_DIST_KVSTORE=ON \
         -DCMAKE_BUILD_TYPE=Release \
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index fca8e44b25ff..1413756dc2c8 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -139,7 +139,7 @@ def compile_unix_int64_gpu(lib_name) {
       ws('workspace/build-gpu-int64') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_large_tensor', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu_large_tensor', false)
           utils.pack_lib(lib_name, mx_cmake_lib)
         }
       }
@@ -195,7 +195,7 @@ def compile_unix_mkldnn_gpu(lib_name) {
       ws('workspace/build-mkldnn-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu_mkldnn', false)
           utils.pack_lib(lib_name, mx_mkldnn_lib)
         }
       }
@@ -209,7 +209,7 @@ def compile_unix_mkldnn_nocudnn_gpu(lib_name) {
      ws('workspace/build-mkldnn-gpu-nocudnn') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.init_git()
-          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn_nocudnn', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu_mkldnn_nocudnn', false)
          utils.pack_lib(lib_name, mx_mkldnn_lib)
        }
      }
@@ -218,12 +218,12 @@
 def compile_unix_full_gpu(lib_name) {
-  return ['GPU: CUDA10.1+cuDNN7': {
+  return ['GPU: CUDA+cuDNN': {
     node(NODE_LINUX_CPU) {
       ws('workspace/build-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu', false)
           utils.pack_lib(lib_name, mx_lib_cpp_examples)
         }
       }
@@ -232,12 +232,12 @@
 def compile_unix_full_gpu_debug(lib_name) {
-  return ['GPU: CUDA10.1+cuDNN7, debug': {
+  return ['GPU: CUDA+cuDNN, debug': {
     node(NODE_LINUX_CPU) {
       ws('workspace/build-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_debug', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu_debug', false)
           utils.pack_lib(lib_name, mx_lib_cpp_examples)
         }
       }
@@ -245,27 +245,13 @@ def compile_unix_full_gpu_debug(lib_name) {
   }]
 }
 
-def compile_unix_cmake_gpu(lib_name) {
-  return ['GPU: CMake': {
-    node(NODE_LINUX_CPU) {
-      ws('workspace/build-cmake-gpu') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          utils.init_git()
-          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake', false)
-          utils.pack_lib(lib_name, mx_cmake_lib_cython)
-        }
-      }
-    }
-  }]
-}
-
 def compile_unix_tensorrt_gpu(lib_name) {
   return ['TensorRT': {
     node(NODE_LINUX_CPU) {
       ws('workspace/build-tensorrt') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_gpu_cu102', 'build_ubuntu_gpu_tensorrt', false)
+          utils.docker_run('ubuntu_tensorrt_cu111', 'build_ubuntu_gpu_tensorrt', false)
           utils.pack_lib(lib_name, mx_tensorrt_lib)
         }
       }
@@ -510,7 +496,7 @@ def compile_unix_clang10_cuda_werror(lib_name) {
       ws('workspace/build-cpu-clang10') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.init_git()
-          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_clang10_werror', false)
+          utils.docker_run('ubuntu_gpu_cu111', 'build_ubuntu_gpu_clang10_werror', false)
           utils.pack_lib(lib_name, mx_lib)
         }
       }
@@ -696,7 +682,7 @@ def test_unix_python3_gpu(lib_name) {
       ws('workspace/ut-python3-gpu') {
         try {
           utils.unpack_and_init(lib_name, mx_lib_cython)
-          python3_gpu_ut_cython('ubuntu_gpu_cu101')
+          python3_gpu_ut_cython('ubuntu_gpu_cu111')
           utils.publish_test_coverage()
         } finally {
           utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_gpu.xml')
@@ -778,7 +764,7 @@ def test_unix_python3_mkldnn_gpu(lib_name) {
       ws('workspace/ut-python3-mkldnn-gpu') {
         try {
           utils.unpack_and_init(lib_name, mx_mkldnn_lib)
-          python3_gpu_ut('ubuntu_gpu_cu101')
+          python3_gpu_ut('ubuntu_gpu_cu111')
           utils.publish_test_coverage()
         } finally {
           utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_mkldnn_gpu.xml')
@@ -794,7 +780,7 @@ def test_unix_python3_mkldnn_nocudnn_gpu(lib_name) {
       ws('workspace/ut-python3-mkldnn-gpu-nocudnn') {
         try {
           utils.unpack_and_init(lib_name, mx_mkldnn_lib)
-          python3_gpu_ut_nocudnn('ubuntu_gpu_cu101')
+          python3_gpu_ut_nocudnn('ubuntu_gpu_cu111')
           utils.publish_test_coverage()
         } finally {
           utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_mkldnn_gpu_nocudnn.xml')
@@ -838,7 +824,7 @@ def test_unix_byteps_gpu(lib_name) {
       ws('workspace/it-byteps') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.unpack_and_init(lib_name, mx_lib)
-          utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_byteps', true, '32768m')
+          utils.docker_run('ubuntu_gpu_cu111', 'integrationtest_ubuntu_gpu_byteps', true, '32768m')
           utils.publish_test_coverage()
         }
       }
@@ -852,7 +838,7 @@ def test_unix_distributed_kvstore_gpu(lib_name) {
       ws('workspace/it-dist-kvstore') {
         timeout(time: max_time, unit: 'MINUTES') {
           utils.unpack_and_init(lib_name, mx_lib)
-          utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_dist_kvstore', true)
+          utils.docker_run('ubuntu_gpu_cu111', 'integrationtest_ubuntu_gpu_dist_kvstore', true)
           utils.publish_test_coverage()
         }
       }
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index 6fbdc751ea90..63bd3d29b641 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -38,7 +38,6 @@ core_logic: {
     custom_steps.compile_unix_mkldnn_nocudnn_gpu('mkldnn_gpu_nocudnn'),
     custom_steps.compile_unix_full_gpu('gpu'),
     custom_steps.compile_unix_full_gpu_debug('gpu_debug'),
-    custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
     custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
     custom_steps.compile_unix_int64_gpu('gpu_int64'),
   ])
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index daa33b856dbc..2be3448d3279 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -281,7 +281,11 @@ def _load_lib():
         # pylint: disable=E1123
         lib = ctypes.CDLL(lib_path[0], winmode=0x00000008)
     else:
-        lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL)
+        # We use RTLD_GLOBAL as, when dynamically linking with MKL,
+        # libmkl_core.so may load libmkl_avx512.so via dlopen. When opening
+        # libmxnet and its dependencies (libmkl_core.so) via RTLD_LOCAL, MKL's
+        # dlopen calls will fail with undefined symbol errors.
+        lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL)
     # DMatrix functions
     lib.MXGetLastError.restype = ctypes.c_char_p
     return lib
diff --git a/python/mxnet/contrib/text/utils.py b/python/mxnet/contrib/text/utils.py
index 79cf903edbee..46e93daecb01 100644
--- a/python/mxnet/contrib/text/utils.py
+++ b/python/mxnet/contrib/text/utils.py
@@ -77,7 +77,7 @@ def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
         source_str = [t.lower() for t in source_str]
 
     if counter_to_update is None:
-        return collections.Counter(source_str)
+        return collections.Counter(source_str)  # pylint: disable=too-many-function-args
     else:
         counter_to_update.update(source_str)
         return counter_to_update
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 6f74b49d01a2..e260a41849ad 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -260,8 +260,8 @@ def _wrap_mxnp_np_ufunc(x1, x2):
         return func(x1, x2)
     return _wrap_mxnp_np_ufunc
 
-@set_module('mxnet.numpy')  # pylint: disable=invalid-name
-class ndarray(NDArray):
+@set_module('mxnet.numpy')
+class ndarray(NDArray):  # pylint: disable=invalid-name
     """
     ndarray(handle, writable=True):
diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc
index e30e977c34bb..6e7aca221e9e 100644
--- a/src/operator/numpy/np_init_op.cc
+++ b/src/operator/numpy/np_init_op.cc
@@ -47,11 +47,13 @@ inline bool NumpyIndicesShape(const nnvm::NodeAttrs& attrs,
       << "_npi_indices dimensions the number of dim must not be less than 0";
   mxnet::TShape param_dim = param.dimensions;
   if (!shape_is_known(param_dim)) return false;
+  CHECK_LT(param_dim.Size(), INT32_MAX) << "ValueError: np.indices does not support large"
+                                        << " input tensors (containing >= 2^31 elements).";
   const int indim = param.dimensions.ndim();
   mxnet::TShape ret(indim + 1, -1);
   ret[0] = indim;
   for (int i = 1; i < indim + 1; ++i) {
-    ret[i] = param.dimensions[i-1];
+    ret[i] = param_dim[i-1];
   }
   SHAPE_ASSIGN_CHECK(*out_shapes, 0, ret);
   return shape_is_known(out_shapes->at(0));
diff --git a/src/operator/numpy/np_repeat_op-inl.h b/src/operator/numpy/np_repeat_op-inl.h
index 638f1dee921a..aa51d080e57e 100644
--- a/src/operator/numpy/np_repeat_op-inl.h
+++ b/src/operator/numpy/np_repeat_op-inl.h
@@ -110,10 +110,14 @@ inline bool RepeatsOpShape(const nnvm::NodeAttrs& attrs,
         shape[i] = ishape[i];
       }
     }
+    CHECK_LT(shape.Size(), INT32_MAX) << "ValueError: np.repeat does not support large"
+                                      << " input tensors (containing >= 2^31 elements).";
     SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape);
   } else {  // If axis is not input by user, return a flat 1D array of size = repeats
     repeats = param.repeats.value().ndim() == 1 ?
               ishape.Size() * repeats : repeats;
     mxnet::TShape shape(1, repeats);
+    CHECK_LT(shape.Size(), INT32_MAX) << "ValueError: np.repeat does not support large"
+                                      << " input tensors (containing >= 2^31 elements).";
     SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape);
   }
   return shape_is_known(out_attrs->at(0));
diff --git a/src/operator/numpy/np_unique_op.cc b/src/operator/numpy/np_unique_op.cc
index 39a84bad3779..7076b44ac3af 100644
--- a/src/operator/numpy/np_unique_op.cc
+++ b/src/operator/numpy/np_unique_op.cc
@@ -348,6 +348,8 @@ void NumpyUniqueCPUForward(const nnvm::NodeAttrs& attrs,
       const_cast<NDArray &>(outputs[output_flag]).Init(shape_0);
     }
   } else {
+    CHECK_LT(inputs[0].shape().Size(), INT32_MAX) << "ValueError: np.unique does not support large"
+                                                  << " input tensors (containing >= 2^31 elements).";
     if (!param.axis.has_value()) {
       NumpyUniqueCPUNoneAxisImpl(param, ctx, inputs, req, outputs);
     } else {
diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h
index 132a9f90ec68..6dc3cf46c895 100644
--- a/src/profiler/profiler.h
+++ b/src/profiler/profiler.h
@@ -54,7 +54,12 @@ struct static_string {
   inline explicit static_string(const char *s) { set(s); }
   inline const char *c_str() const { return &string_[0]; }
   inline void set(const char *s) {
+#pragma GCC diagnostic push
+#if __GNUC__ >= 8
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
+#endif
     strncpy(&string_[0], s, string_size - 1);
+#pragma GCC diagnostic pop
     string_[string_size - 1] = '\0';
   }
   inline void append(const char *s) {
diff --git a/tests/nightly/test_np_large_array.py b/tests/nightly/test_np_large_array.py
index a1a34d878c84..dd4a94c1a1cb 100644
--- a/tests/nightly/test_np_large_array.py
+++ b/tests/nightly/test_np_large_array.py
@@ -2132,6 +2132,23 @@ def test_dsplit():
 
 
 @use_np
+def test_unique():
+    inp = np.zeros((2, HALF_INT_OVERFLOW))
+    assertRaises(ValueError, np.unique, inp, axis=1)
+
+
+@use_np
+def test_repeat():
+    inp = np.ones((2, HALF_INT_OVERFLOW))
+    assertRaises(ValueError, np.repeat, inp, repeats=2, axis=1)
+
+
+@use_np
+def test_indices():
+    assertRaises(ValueError, np.indices, (2, HALF_INT_OVERFLOW))
+
+
+@use_np
 def test_tril_indices():
     N = 2**16
     data = np.tril_indices(N, -1)
@@ -2328,4 +2345,3 @@ def test_insert():
     assert out[0, 1] == 1 and out[-1, 1] == 2
     assert out2[1] == 5 and out2[2] == 6
     assertRaises(MXNetError, np.insert, arr=inp3, obj=np.array([2, 2], dtype=np.int64), values=np.array([5, 6]))
-
diff --git a/tests/python/unittest/onnx/test_onnxruntime.py b/tests/python/unittest/onnx/test_onnxruntime.py
index dfd114a9ff23..3737b7ad0a79 100644
--- a/tests/python/unittest/onnx/test_onnxruntime.py
+++ b/tests/python/unittest/onnx/test_onnxruntime.py
@@ -22,12 +22,11 @@
 import json
 import os
 import shutil
-import tempfile
 
 import pytest
 
 
-def run_cv_model_test(model):
+def run_cv_model_test(model, tmpdir):
     def get_gluon_cv_model(model_name, tmp):
         tmpfile = os.path.join(tmp, model_name)
         ctx = mx.cpu(0)
@@ -66,11 +65,13 @@ def softmax(x):
         e_x = np.exp(x - np.max(x))
         return e_x / e_x.sum(axis=0)
 
-    def load_imgnet_labels():
-        mx.test_utils.download('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/image_net_labels.json')
-        return np.array(json.load(open('image_net_labels.json', 'r')))
+    def load_imgnet_labels(tmpdir):
+        tmpfile = os.path.join(tmpdir, 'image_net_labels.json')
+        mx.test_utils.download('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/image_net_labels.json',
+                               fname=tmpfile)
+        return np.array(json.load(open(tmpfile, 'r')))
 
-    def download_test_images():
+    def download_test_images(tmpdir):
         test_images = [
             ['dog.jpg',['boxer']],
             ['apron.jpg', ['apron', 'maillot']],
@@ -80,13 +81,11 @@ def download_test_images():
         ]
         for f,_ in test_images:
             mx.test_utils.download('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/onnx/images/'+f+'?raw=true',
-                                   fname=f)
+                                   fname=os.path.join(tmpdir, f))
         return test_images
 
-    labels = load_imgnet_labels()
-    test_images = download_test_images()
-
-    tmpdir = tempfile.mkdtemp()
+    labels = load_imgnet_labels(tmpdir)
+    test_images = download_test_images(tmpdir)
     sym_file, params_file = get_gluon_cv_model(model, tmpdir)
     onnx_file = export_model_to_onnx(sym_file, params_file)
     #print("exported onnx file: ",onnx_file)
@@ -98,7 +97,7 @@ def download_test_images():
     input_name = session.get_inputs()[0].name
 
     for img,classes in test_images:
-        img_data = normalize_image(img)
+        img_data = normalize_image(os.path.join(tmpdir, img))
         raw_result = session.run([], {input_name: img_data})
         res = softmax(np.array(raw_result)).tolist()
         class_idx = np.argmax(res)
@@ -116,67 +115,51 @@ def download_test_images():
     shutil.rmtree(tmpdir)
 
 @pytest.mark.skip(reason="Older gluon models are not supported, tracked with #19580")
-def test_cv_model_inference_onnxruntime_mobilenet0_5():
-    run_cv_model_test('mobilenet0.5')
+def test_cv_model_inference_onnxruntime_mobilenet0_5(tmp_path):
+    run_cv_model_test('mobilenet0.5', tmp_path)
 
-@pytest.mark.flaky
-def test_cv_model_inference_onnxruntime_mobilenetv2_1_0():
-    run_cv_model_test('mobilenetv2_1.0')
+def test_cv_model_inference_onnxruntime_mobilenetv2_1_0(tmp_path):
+    run_cv_model_test('mobilenetv2_1.0', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet18_v1():
-    run_cv_model_test('resnet18_v1')
+def test_cv_model_inference_onnxruntime_resnet18_v1(tmp_path):
+    run_cv_model_test('resnet18_v1', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet18_v2():
-    run_cv_model_test('resnet18_v2')
+def test_cv_model_inference_onnxruntime_resnet18_v2(tmp_path):
+    run_cv_model_test('resnet18_v2', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet101_v1():
-    run_cv_model_test('resnet101_v1')
+def test_cv_model_inference_onnxruntime_resnet101_v1(tmp_path):
+    run_cv_model_test('resnet101_v1', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet101_v2():
-    run_cv_model_test('resnet101_v2')
+def test_cv_model_inference_onnxruntime_resnet101_v2(tmp_path):
+    run_cv_model_test('resnet101_v2', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet152_v1():
-    run_cv_model_test('resnet152_v1')
+def test_cv_model_inference_onnxruntime_resnet152_v1(tmp_path):
+    run_cv_model_test('resnet152_v1', tmp_path)
 
-def test_cv_model_inference_onnxruntime_resnet152_v2():
-    run_cv_model_test('resnet152_v2')
+def test_cv_model_inference_onnxruntime_resnet152_v2(tmp_path):
+    run_cv_model_test('resnet152_v2', tmp_path)
 
 @pytest.mark.skip(reason="Older gluon models are not supported, tracked with #19580")
-def test_cv_model_inference_onnxruntime_squeezenet1_0():
-    run_cv_model_test('squeezenet1.0')
+def test_cv_model_inference_onnxruntime_squeezenet1_0(tmp_path):
+    run_cv_model_test('squeezenet1.0', tmp_path)
 
 @pytest.mark.skip(reason="Older gluon models are not supported, tracked with #19580")
-def test_cv_model_inference_onnxruntime_squeezenet1_1():
-    run_cv_model_test('squeezenet1.1')
+def test_cv_model_inference_onnxruntime_squeezenet1_1(tmp_path):
+    run_cv_model_test('squeezenet1.1', tmp_path)
 
 @pytest.mark.skip(reason="Older gluon models are not supported, tracked with #19580")
-def test_cv_model_inference_onnxruntime_vgg11():
-    run_cv_model_test('vgg11')
+def test_cv_model_inference_onnxruntime_vgg11(tmp_path):
+    run_cv_model_test('vgg11', tmp_path)
 
 @pytest.mark.skip(reason="Older gluon models are not supported, tracked with #19580")
-def test_cv_model_inference_onnxruntime_vgg11_bn():
-    run_cv_model_test('vgg11_bn')
-
-def test_cv_model_inference_onnxruntime_vgg19():
-    run_cv_model_test('vgg19')
-
-def test_cv_model_inference_onnxruntime_vgg19_bn():
-    run_cv_model_test('vgg19_bn')
-
-if __name__ == "__main__":
-    test_cv_model_inference_onnxruntime_mobilenet0_5()
-    test_cv_model_inference_onnxruntime_mobilenetv2_1_0()
-    test_cv_model_inference_onnxruntime_resnet18_v1()
-    test_cv_model_inference_onnxruntime_resnet18_v2()
-    test_cv_model_inference_onnxruntime_resnet101_v1()
-    test_cv_model_inference_onnxruntime_resnet101_v2()
-    test_cv_model_inference_onnxruntime_resnet152_v1()
-    test_cv_model_inference_onnxruntime_resnet152_v2()
-    test_cv_model_inference_onnxruntime_squeezenet1_0()
-    test_cv_model_inference_onnxruntime_squeezenet1_1()
-    test_cv_model_inference_onnxruntime_vgg11()
-    test_cv_model_inference_onnxruntime_vgg11_bn()
-    test_cv_model_inference_onnxruntime_vgg19()
-    test_cv_model_inference_onnxruntime_vgg19_bn()
+def test_cv_model_inference_onnxruntime_vgg11_bn(tmp_path):
+    run_cv_model_test('vgg11_bn', tmp_path)
+
+def test_cv_model_inference_onnxruntime_vgg19(tmp_path):
+    run_cv_model_test('vgg19', tmp_path)
+
+def test_cv_model_inference_onnxruntime_vgg19_bn(tmp_path):
+    run_cv_model_test('vgg19_bn', tmp_path)
+
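Note on the `python/mxnet/base.py` hunk above: with `RTLD_LOCAL`, the symbols of `libmxnet.so` and of the libraries loaded with it (such as `libmkl_core.so`) stay private to that handle, so when `libmkl_core.so` later `dlopen`s `libmkl_avx512.so`, the plugin cannot bind its unresolved references and fails with undefined-symbol errors; `RTLD_GLOBAL` promotes those symbols into the process-global namespace. A minimal sketch of the two modes, using `libm` as an illustrative stand-in (this is not MXNet's actual loader code):

```python
import ctypes
import ctypes.util

# Resolve a real shared library to load; fall back to a common soname.
libm_path = ctypes.util.find_library("m") or "libm.so.6"

# RTLD_LOCAL (old behaviour): symbols stay private to this handle, so a
# nested dlopen() performed *by the loaded library itself* cannot resolve
# symbols against it.
local_handle = ctypes.CDLL(libm_path, mode=ctypes.RTLD_LOCAL)

# RTLD_GLOBAL (new behaviour): symbols are promoted to the global namespace,
# where nested dlopen() calls, like libmkl_core.so loading libmkl_avx512.so,
# can bind against them.
global_handle = ctypes.CDLL(libm_path, mode=ctypes.RTLD_GLOBAL)

# Either handle is directly usable from Python; the flag only affects
# symbol visibility for libraries loaded afterwards.
local_handle.cos.restype = ctypes.c_double
local_handle.cos.argtypes = [ctypes.c_double]
print(local_handle.cos(0.0))  # 1.0
```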
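Note on the new `CHECK_LT(..., INT32_MAX)` guards in `np_init_op.cc`, `np_repeat_op-inl.h`, and `np_unique_op.cc`: their messages are prefixed with `ValueError:`, which the Python frontend surfaces as a Python `ValueError`; that is exactly what the new nightly tests assert with `assertRaises(ValueError, ...)`. A short user-level sketch of the guarded behaviour, assuming a default build without 64-bit tensor indexing (`2**30` mirrors `HALF_INT_OVERFLOW` from the test suite):

```python
from mxnet import np, npx

npx.set_np()  # same effect as the @use_np decorator used in the tests

try:
    # np.indices((2, 2**30)) would describe a result holding >= 2^31
    # elements, so shape inference now fails fast instead of letting
    # 32-bit index arithmetic silently overflow.
    np.indices((2, 2**30))
except ValueError as e:
    print(e)  # "...np.indices does not support large input tensors..."
```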