diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index 52c3052df8ec..cb2cc7ac17ff 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit 52c3052df8ec1d5b8b45cb6c350a952840eabd42
+Subproject commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d
diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy
index e9f172a570fe..fa9300db3ca0 100644
--- a/cd/python/pypi/Jenkins_pipeline.groovy
+++ b/cd/python/pypi/Jenkins_pipeline.groovy
@@ -27,7 +27,7 @@
 // This is a temporary solution until we are confident with the packages generated by CI
 // This should be removed in the not too distant future.
 // We only skip the publish step so we can still QA the other variants.
-pypi_releases = ["cu92", "cu92mkl"]
+pypi_releases = []
 
 def get_pipeline(mxnet_variant) {
   def node_type = mxnet_variant.startsWith('cu') ? NODE_LINUX_GPU : NODE_LINUX_CPU
@@ -72,6 +72,7 @@ def push(mxnet_variant) {
     } else {
       echo "Temporarily skipping publishing PyPI package for '${mxnet_variant}'."
     }
+    sh "./ci/docker/runtime_functions.sh cd_s3_publish"
   }
 }
 
diff --git a/cd/python/pypi/pypi_publish.py b/cd/python/pypi/pypi_publish.py
index 7e09f644c734..2729068dd503 100755
--- a/cd/python/pypi/pypi_publish.py
+++ b/cd/python/pypi/pypi_publish.py
@@ -35,10 +35,8 @@ def post_wheel(path):
     logging.info('Posting {} to PyPI'.format(path))
     pypi_credentials = get_secret()
 
-    cmd = 'python3 -m twine upload --username {} --password {} {}'.format(
-        pypi_credentials['username'],
-        pypi_credentials['password'],
-        path)
+    cmd = 'python3 -m twine upload {}'.format(path)
+    version = os.path.basename(path).split('-')[1]
 
     # The PyPI credentials for DEV has username set to 'skipPublish'
     # This way we do not attempt to publish the PyPI package
@@ -47,14 +45,15 @@ def post_wheel(path):
         print('In DEV account, skipping publish')
         print('Would have run: {}'.format(cmd))
         return 0
-    else:
+    elif any(test_version_mark in version for test_version_mark in ['a', 'b', 'dev']):
         print('Skipping publishing nightly builds to Pypi.')
         print('See https://github.com/pypa/pypi-support/issues/50 for details')
         return 0
-
-        # DO NOT PRINT CMD IN THIS BLOCK, includes password
-        p = subprocess.run(cmd.split(' '),
-                        stdout=subprocess.PIPE)
+    else:
+        env = os.environ.copy()
+        env['TWINE_USERNAME'] = pypi_credentials['username']
+        env['TWINE_PASSWORD'] = pypi_credentials['password']
+        p = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE, env=env)
         logging.info(p.stdout)
         return p.returncode
 
@@ -85,7 +84,7 @@ def get_secret():
             raise e
     else:
         return json.loads(get_secret_value_response['SecretString'])
-        
-            
+
+
 if __name__ == '__main__':
     sys.exit(post_wheel(sys.argv[1]))
diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements
index cbfc521e2c08..fd716f5fa815 100644
--- a/ci/docker/install/requirements
+++ b/ci/docker/install/requirements
@@ -26,8 +26,8 @@ h5py==2.8.0rc1
 mock==2.0.0
 nose==1.3.7
 nose-timer==0.7.3
-numpy>1.16.0,<2.0.0
+numpy>1.16.0,<1.18.0
 pylint==2.3.1; python_version >= '3.0'
 requests<2.19.0,>=2.18.4
-scipy==1.0.1
+scipy==1.2.1
 six==1.11.0
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index b658f953a78a..e078b2a8f89c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -2065,6 +2065,15 @@ cd_pypi_publish() {
     ./cd/python/pypi/pypi_publish.py `readlink -f wheel_build/dist/*.whl`
 }
 
+cd_s3_publish() {  # Copy the wheel built under wheel_build/dist to s3://apache-mxnet/dist/<variant>/ with a world-readable grant.
+    set -ex
+    pip3 install --user awscli
+    filepath=$(readlink -f wheel_build/dist/*.whl)
+    filename=$(basename "$filepath")  # wheel file name, reused as the object name
+    variant=$(echo "$filename" | cut -d'-' -f1 | cut -d'_' -f2 -s)  # text after the first '_' of the package name; empty when there is none
+    aws s3 cp "$filepath" "s3://apache-mxnet/dist/${variant}/${filename}" --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers full=id=43f628fab72838a4f0b929d7f1993b14411f4b0294b011261bc6bd3e950a6822
+}
+
 build_static_scala_mkl() {
     set -ex
     pushd .
diff --git a/example/neural_collaborative_filtering/README.md b/example/neural_collaborative_filtering/README.md
index 819f4d94dff9..00d3ed12295b 100644
--- a/example/neural_collaborative_filtering/README.md
+++ b/example/neural_collaborative_filtering/README.md
@@ -29,15 +29,6 @@ Author: Dr. Xiangnan He (http://www.comp.nus.edu.sg/~xiangnan/)
 
 Code Reference: https://github.com/hexiangnan/neural_collaborative_filtering
 
-## Environment Settings
-We use MXnet with MKL-DNN as the backend. 
-- MXNet version:  '1.5.1'
-
-## Install
-```
-pip install -r requirements.txt
-```
-
 ## Dataset
 
 We provide the processed datasets on [Google Drive](https://drive.google.com/drive/folders/1qACR_Zhc2O2W0RrazzcepM2vJeh0MMdO?usp=sharing): MovieLens 20 Million (ml-20m), you can download directly or 
@@ -66,7 +57,9 @@ We provide the pretrained ml-20m model on [Google Drive](https://drive.google.co
 |dtype|HR@10|NDCG@10|
 |:---:|:--:|:--:|
 |float32|0.6393|0.3849|
-|int8|0.6366|0.3824|
+|float32 opt|0.6393|0.3849|
+|int8|0.6395|0.3852|
+|int8 opt|0.6396|0.3852|
 
 ## Training
 
@@ -75,11 +68,20 @@ We provide the pretrained ml-20m model on [Google Drive](https://drive.google.co
 python train.py # --gpu=0
 ```
 
+## Model Optimizer
+
+```
+# optimize model
+python model_optimizer.py
+```
+
 ## Calibration
 
 ```
 # neumf calibration on ml-20m dataset
 python ncf.py --prefix=./model/ml-20m/neumf --calibration
+# optimized neumf calibration on ml-20m dataset
+python ncf.py --prefix=./model/ml-20m/neumf-opt --calibration
 ```
 
 ## Evaluation
@@ -87,15 +89,25 @@ python ncf.py --prefix=./model/ml-20m/neumf --calibration
 ```
 # neumf float32 inference on ml-20m dataset
 python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf
+# optimized neumf float32 inference on ml-20m dataset
+python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt
 # neumf int8 inference on ml-20m dataset
 python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-quantized
+# optimized neumf int8 inference on ml-20m dataset
+python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt-quantized
 ```
 
 ## Benchmark
 
 ```
+usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]]
+
 # neumf float32 benchmark on ml-20m dataset
-python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf --benchmark
+bash benchmark.sh -p model/ml-20m/neumf
+# optimized neumf float32 benchmark on ml-20m dataset
+bash benchmark.sh -p model/ml-20m/neumf-opt
 # neumf int8 benchmark on ml-20m dataset
-python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-quantized --benchmark
+bash benchmark.sh -p model/ml-20m/neumf-quantized
+# optimized neumf int8 benchmark on ml-20m dataset
+bash benchmark.sh -p model/ml-20m/neumf-opt-quantized
 ```
diff --git a/example/neural_collaborative_filtering/benchmark.sh b/example/neural_collaborative_filtering/benchmark.sh
new file mode 100644
index 000000000000..60fec746cd20
--- /dev/null
+++ b/example/neural_collaborative_filtering/benchmark.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+usage()
+{
+    echo "usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]]"
+}
+# Parse the command line; every option except -h/--help takes the next argument as its value.
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --prefix | -p)
+      shift
+      PREFIX=$1
+      ;;
+    --epoch | -e)
+      shift
+      EPOCH=$1
+      ;;
+    --dataset | -d)
+      shift
+      DATASET=$1
+      ;;
+    --batch-size | -b)
+      shift
+      BS=$1
+      ;;
+    --instance | -i)
+      shift
+      INS=$1
+      ;;
+    --core | -c)
+      shift
+      CORES=$1
+      ;;
+    --help | -h)
+      usage
+      exit 0  # asking for help is not an error
+      ;;
+    *)  # anything unrecognised is a usage error
+      usage
+      exit 1
+  esac
+  shift
+done
+
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`  # machine topology is read from lscpu output
+NUM_NUMA_NODE=`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+CORES_PER_NUMA=$((NUM_CORES / NUM_NUMA_NODE))
+echo "target machine has $NUM_CORES physical core(s) on $NUM_NUMA_NODE numa nodes of $NUM_SOCKET socket(s)."
+# The model prefix is mandatory; every other option left unset gets a default below.
+if [ -z "$PREFIX" ]; then
+  echo "Error: Need a model prefix." >&2
+  exit 1  # a bare `exit` would report success to the caller
+fi
+if [ -z "$EPOCH" ]; then
+  echo "Default: set epoch of model parameters to 7."
+  EPOCH=7
+fi
+if [ -z "$DATASET" ]; then
+  echo "Default: set dataset to ml-20m."
+  DATASET='ml-20m'
+fi
+if [ -z "$INS" ]; then
+  echo "Default: launch one instance per physical core."
+  INS=$NUM_CORES
+fi
+if [ -z "$CORES" ]; then
+  echo "Default: divide full physical cores."
+  CORES=$((NUM_CORES / $INS))
+fi
+if [ -z "$BS" ]; then
+  echo "Default: set batch size to 700."
+  BS=700
+fi
+
+echo "  cores/instance: $CORES"
+echo "  total instances: $INS"
+echo "  batch size: $BS"
+echo ""
+
+rm -f NCF_*.log  # -f: a first run has no old logs, which must not be reported as an error
+# Start INS background instances; instance i is bound to cores a..b and to memory node b / CORES_PER_NUMA.
+for((i=0;i<$INS;i++));
+do
+  ((a=$i*$CORES))
+  ((b=$a+$CORES-1))
+  memid=$((b/CORES_PER_NUMA))
+  LOG=NCF_$i.log
+  echo "  $i instance use $a-$b cores with $LOG"
+  KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 \
+  OMP_NUM_THREADS=$CORES \
+  numactl --physcpubind=$a-$b --membind=$memid python ncf.py --batch-size=$BS --dataset="$DATASET" --epoch=$EPOCH --benchmark --prefix="$PREFIX" 2>&1 | tee $LOG &
+done
+wait
+# Add up the second-to-last field of every "speed" line found in the per-instance logs.
+grep speed NCF_*.log | awk '{ sum += $(NF-1) }; END { print "Total Performance is " sum " samples/sec"}'
diff --git a/example/neural_collaborative_filtering/convert.py b/example/neural_collaborative_filtering/convert.py
index 4c64d2cdedab..7fb7f1ede9e4 100644
--- a/example/neural_collaborative_filtering/convert.py
+++ b/example/neural_collaborative_filtering/convert.py
@@ -38,7 +38,7 @@ def parse_args():
     parser = ArgumentParser()
     parser.add_argument('--dataset', nargs='?', default='ml-20m', choices=['ml-1m', 'ml-20m'],
                         help='The dataset name, temporary support ml-1m and ml-20m.')
-    parser.add_argument('path', type=str, default = './data/',
+    parser.add_argument('--path', type=str, default = './data/',
                         help='Path to reviews CSV file from MovieLens')
     parser.add_argument('-n', '--negatives', type=int, default=999,
                         help='Number of negative samples for each positive'
diff --git a/example/neural_collaborative_filtering/core/model.py b/example/neural_collaborative_filtering/core/model.py
index b516e5039fed..6c03bb01a357 100644
--- a/example/neural_collaborative_filtering/core/model.py
+++ b/example/neural_collaborative_filtering/core/model.py
@@ -37,6 +37,27 @@ def _init_weight(self, _, arr):
         limit = np.sqrt(3. / self._fan_in)
         mx.random.uniform(-limit, limit, out=arr)
 
+# Inference-only variant of mlp(): reads the 'fused_mlp_*' embedding tables instead of the plain ones.
+def mlp_opt(user, item, factor_size, model_layers, max_user, max_item):
+    """Sum two embeddings of width 2*factor_size, apply act_0, then stack the fc_1.. layers."""
+    fused_dim = factor_size * 2
+    w_user = mx.sym.Variable('fused_mlp_user_weight', init=mx.init.Normal(0.01))
+    w_item = mx.sym.Variable('fused_mlp_item_weight', init=mx.init.Normal(0.01))
+    emb_user = mx.sym.Embedding(data=user, weight=w_user, input_dim=max_user,
+                                output_dim=fused_dim, name='fused_embed_user'+str(factor_size))
+    emb_item = mx.sym.Embedding(data=item, weight=w_item, input_dim=max_item,
+                                output_dim=fused_dim, name='fused_embed_item'+str(factor_size))
+    net = emb_user + emb_item
+
+    for layer_idx in range(1, len(model_layers)):
+        suffix = str(layer_idx - 1)
+        if layer_idx != 1:  # slot 0 gets no FullyConnected here, only its activation
+            w_init = golorot_uniform(model_layers[layer_idx-1], model_layers[layer_idx])
+            fc_weight = mx.sym.Variable('fc_{}_weight'.format(layer_idx-1), init=w_init)
+            net = mx.sym.FullyConnected(data=net, weight=fc_weight, num_hidden=model_layers[layer_idx], name='fc_'+suffix)
+        net = mx.sym.Activation(data=net, act_type='relu', name='act_'+suffix)
+
+    return net
 
 def mlp(user, item, factor_size, model_layers, max_user, max_item):
     user_weight = mx.sym.Variable('mlp_user_weight', init=mx.init.Normal(0.01))
@@ -47,14 +68,11 @@ def mlp(user, item, factor_size, model_layers, max_user, max_item):
                                   output_dim=factor_size, name='embed_item'+str(factor_size))
     pre_gemm_concat = mx.sym.concat(embed_user, embed_item, dim=1, name='pre_gemm_concat')
 
-    for i, layer in enumerate(model_layers):
-        if i==0:
-            mlp_weight_init = golorot_uniform(2 * factor_size, model_layers[i])
-        else:
-            mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i])
-        mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i), init=mlp_weight_init)
-        pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=layer, name='fc_'+str(i))
-        pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i))
+    for i in range(1, len(model_layers)):
+        mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i])
+        mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i-1), init=mlp_weight_init)
+        pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=model_layers[i], name='fc_'+str(i-1))
+        pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1))
 
     return pre_gemm_concat
 
@@ -70,24 +88,34 @@ def gmf(user, item, factor_size, max_user, max_item):
     return pred
 
 def get_model(model_type='neumf', factor_size_mlp=128, factor_size_gmf=64,
-              model_layers=[256, 128, 64], num_hidden=1, 
-              max_user=138493, max_item=26744):
+              model_layers=[256, 256, 128, 64], num_hidden=1, 
+              max_user=138493, max_item=26744, opt=False):
     # input
     user = mx.sym.Variable('user')
     item = mx.sym.Variable('item')
 
     if model_type == 'mlp':
-        net = mlp(user=user, item=item,
-                  factor_size=factor_size_mlp, model_layers=model_layers,
-                  max_user=max_user, max_item=max_item)
+        if opt:
+            net = mlp_opt(user=user, item=item,
+                         factor_size=factor_size_mlp, model_layers=model_layers,
+                         max_user=max_user, max_item=max_item)
+        else:
+            net = mlp(user=user, item=item,
+                      factor_size=factor_size_mlp, model_layers=model_layers,
+                      max_user=max_user, max_item=max_item)
     elif model_type == 'gmf':
         net = gmf(user=user, item=item,
                   factor_size=factor_size_gmf,
                   max_user=max_user, max_item=max_item)
     elif model_type == 'neumf':
-        net_mlp = mlp(user=user, item=item,
-                      factor_size=factor_size_mlp, model_layers=model_layers,
-                      max_user=max_user, max_item=max_item)
+        if opt:
+            net_mlp = mlp_opt(user=user, item=item,
+                              factor_size=factor_size_mlp, model_layers=model_layers,
+                              max_user=max_user, max_item=max_item)
+        else:
+            net_mlp = mlp(user=user, item=item,
+                          factor_size=factor_size_mlp, model_layers=model_layers,
+                          max_user=max_user, max_item=max_item)
         net_gmf = gmf(user=user, item=item,
                       factor_size=factor_size_gmf,
                       max_user=max_user, max_item=max_item)
diff --git a/example/neural_collaborative_filtering/model_optimizer.py b/example/neural_collaborative_filtering/model_optimizer.py
new file mode 100644
index 000000000000..2866ae7e7e05
--- /dev/null
+++ b/example/neural_collaborative_filtering/model_optimizer.py
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# 
+import os
+import time
+import argparse
+import logging
+import math
+import random
+import numpy as np
+import mxnet as mx
+from core.model import get_model
+from core.dataset import NCFTrainData
+
+logging.basicConfig(level=logging.DEBUG)
+# Command-line options naming the checkpoint to load and describing the network to rebuild.
+parser = argparse.ArgumentParser(description="Run model optimizer.",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--path', nargs='?', default='./data/',
+                    help='Input data path.')
+parser.add_argument('--dataset', nargs='?', default='ml-20m',
+                    help='The dataset name.')
+parser.add_argument('--model-prefix', type=str, default='./model/ml-20m/neumf')
+parser.add_argument('--epoch', type=int, default=7, help='parameters epoch')
+parser.add_argument('--model-type', type=str, default='neumf', choices=['neumf', 'gmf', 'mlp'],
+                    help="model type")
+parser.add_argument('--layers', default='[256, 256, 128, 64]',
+                    help="list of number hiddens of fc layers in mlp model.")
+parser.add_argument('--factor-size-gmf', type=int, default=64,
+                    help="outdim of gmf embedding layers.")
+parser.add_argument('--num-hidden', type=int, default=1,
+                    help="num-hidden of neumf fc layer")
+
+head = '%(asctime)-15s %(message)s'
+logging.basicConfig(level=logging.INFO, format=head)  # NOTE(review): basicConfig was already called before the parser was built, so this call presumably has no effect — confirm
+
+# Parse the command line and log the resulting options.
+args = parser.parse_args()
+logging.info(args)
+
+model_prefix = args.model_prefix
+model_type = args.model_type
+model_layers = eval(args.layers)  # NOTE(review): eval() of a command-line string; ast.literal_eval would be safer
+factor_size_gmf = args.factor_size_gmf
+factor_size_mlp = int(model_layers[0]/2)  # half of the first entry of --layers
+num_hidden = args.num_hidden
+train_dataset = NCFTrainData((args.path + args.dataset + '/train-ratings.csv'), nb_neg=4)
+net = get_model(model_type, factor_size_mlp, factor_size_gmf, 
+                model_layers, num_hidden, train_dataset.nb_users, train_dataset.nb_items, opt=True)
+# Load the saved parameters and split the fc_0 weight along axis 1 into two halves.
+raw_params, _ = mx.model.load_params(model_prefix, args.epoch)
+fc_0_weight_split = mx.nd.split(raw_params['fc_0_weight'], axis=1, num_outputs=2)
+fc_0_left = fc_0_weight_split[0]
+fc_0_right = fc_0_weight_split[1]
+# Push each embedding table through its half of fc_0; the fc_0 bias is added on the user side only.
+user_weight_fusion = mx.nd.FullyConnected(data = raw_params['mlp_user_weight'], weight=fc_0_left, bias=raw_params['fc_0_bias'], no_bias=False, num_hidden=model_layers[0])
+item_weight_fusion = mx.nd.FullyConnected(data = raw_params['mlp_item_weight'], weight=fc_0_right, no_bias=True, num_hidden=model_layers[0])
+# Swap the original tables and bias for the fused tables; opt_params is the same dict object as raw_params.
+opt_params = raw_params
+del opt_params['mlp_user_weight']
+del opt_params['mlp_item_weight']
+del opt_params['fc_0_bias']
+opt_params['fused_mlp_user_weight'] = user_weight_fusion
+opt_params['fused_mlp_item_weight'] = item_weight_fusion
+# NOTE(review): 'fc_0_weight' is not deleted although mlp_opt() builds no fc_0 layer — confirm the extra entry is harmless when loading.
+mx.model.save_checkpoint(model_prefix + '-opt', args.epoch, net, opt_params, {})
+
diff --git a/example/neural_collaborative_filtering/ncf.py b/example/neural_collaborative_filtering/ncf.py
index 0fd9f733a1bd..b01be01bc8d9 100644
--- a/example/neural_collaborative_filtering/ncf.py
+++ b/example/neural_collaborative_filtering/ncf.py
@@ -42,20 +42,12 @@
                     help='max number of item index.')
 parser.add_argument('--batch-size', type=int, default=256,
                     help='number of examples per batch')
-parser.add_argument('--model-type', type=str, default='neumf', choices=['neumf', 'gmf', 'mlp'],
-                    help="mdoel type")
-parser.add_argument('--layers', default='[256, 128, 64]',
-                    help="list of number hiddens of fc layers in mlp model.")
-parser.add_argument('--factor-size-gmf', type=int, default=64,
-                    help="outdim of gmf embedding layers.")
-parser.add_argument('--num-hidden', type=int, default=1,
-                    help="num-hidden of neumf fc layer")
 parser.add_argument('--topk', type=int, default=10,
                     help="topk for accuracy evaluation.")
 parser.add_argument('--gpu', type=int, default=None,
                     help="index of gpu to run, e.g. 0 or 1. None means using cpu().")
 parser.add_argument('--benchmark', action='store_true',  help="whether to benchmark performance only")
-parser.add_argument('--epoch', type=int, default=0, help='model checkpoint index for inference')
+parser.add_argument('--epoch', type=int, default=7, help='model checkpoint index for inference')
 parser.add_argument('--prefix', default='./model/ml-20m/neumf', help="model checkpoint prefix")
 parser.add_argument('--calibration', action='store_true', help="whether to calibrate model")
 parser.add_argument('--calib-mode', type=str, choices=['naive', 'entropy'], default='naive',
@@ -85,11 +77,6 @@
     max_user = args.max_user
     max_item = args.max_item
     batch_size = args.batch_size
-    model_type = args.model_type
-    model_layers = eval(args.layers)
-    factor_size_gmf = args.factor_size_gmf
-    factor_size_mlp = int(model_layers[0]/2)
-    num_hidden = args.num_hidden
     benchmark = args.benchmark
     calibration = args.calibration
     calib_mode = args.calib_mode
@@ -129,7 +116,7 @@
         cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=net, arg_params=arg_params, aux_params=aux_params,
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     calib_mode=calib_mode,
-                                                                    quantized_dtype=args.quantized_dtype, logger=logging)
+                                                                    quantized_dtype=quantized_dtype, logger=logging)
         max_num_examples = num_calib_batches * batch_size
         mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True)
         num_batches = 0
@@ -144,12 +131,17 @@
                     % (num_batches, batch_size))
         cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params,
                                                       collector=collector, calib_mode=calib_mode,
-                                                      quantized_dtype=args.quantized_dtype, logger=logging)                                                       
+                                                      quantized_dtype=quantized_dtype, logger=logging)                                                       
         sym_name = '%s-symbol.json' % (args.prefix + '-quantized')
         cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE')
         mx.model.save_checkpoint(args.prefix + '-quantized', args.epoch, cqsym, cqarg_params, aux_params)
     elif benchmark:
         logging.info('Benchmarking...')
+        data = [mx.random.randint(0, 1000, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+        batch = mx.io.DataBatch(data, []) # empty label
+        for i in range(2000):
+            mod.forward(batch, is_train=False)
+        logging.info('Benchmarking...')
         num_samples = 0
         for ib, batch in enumerate(val_iter):
             if ib == 5:
diff --git a/example/neural_collaborative_filtering/train.py b/example/neural_collaborative_filtering/train.py
index 0b0cfad1ef39..c68f271a6f0d 100644
--- a/example/neural_collaborative_filtering/train.py
+++ b/example/neural_collaborative_filtering/train.py
@@ -45,7 +45,7 @@
                     help="mdoel type")
 parser.add_argument('--num-negative', type=int, default=4,
                     help="number of negative samples per positive sample while training.")
-parser.add_argument('--layers', default='[256, 128, 64]',
+parser.add_argument('--layers', default='[256, 256, 128, 64]',
                     help="list of number hiddens of fc layers in mlp model.")
 parser.add_argument('--factor-size-gmf', type=int, default=64,
                     help="outdim of gmf embedding layers.")
diff --git a/example/quantization/README.md b/example/quantization/README.md
index 8cdc1bb7e06f..b934a811f31d 100644
--- a/example/quantization/README.md
+++ b/example/quantization/README.md
@@ -9,7 +9,7 @@ This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or
 
 <h2 id="1">Model Quantization with Intel® MKL-DNN</h2>
 
-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/tutorials/mkldnn/mkldnn_quantization.html).
+Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer to [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_quantization.html).
 
 ```
 usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH]
diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py
index 0d0e3b64491b..6abb96c62c09 100644
--- a/python/mxnet/_numpy_op_doc.py
+++ b/python/mxnet/_numpy_op_doc.py
@@ -20,6 +20,107 @@
 """Doc placeholder for numpy ops with prefix _np."""
 
 
+def _np_all(a, axis=None, keepdims=False, out=None):
+    """
+    Test whether all array elements along a given axis evaluate to True.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array or object that can be converted to an array.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a logical AND reduction is performed.
+        The default (axis = None) is to perform a logical AND over
+        all the dimensions of the input array.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left in
+        the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
+    out : ndarray, optional
+        Alternate output array in which to place the result. It must have
+        the same shape as the expected output and its type is preserved.
+
+    Returns
+    -------
+    all : ndarray, bool
+        A new boolean or array is returned unless out is specified,
+        in which case a reference to out is returned.
+
+    Examples
+    --------
+    >>> np.all([[True,False],[True,True]])
+    False
+
+    >>> np.all([[True,False],[True,True]], axis=0)
+    array([ True, False])
+
+    >>> np.all([-1, 4, 5])
+    True
+
+    >>> np.all([1.0, np.nan])
+    True
+
+    >>> o=np.array(False)
+    >>> z=np.all([-1, 4, 5], out=o)
+    >>> id(z), id(o), z
+    (28293632, 28293632, array(True)) # may vary
+    """
+    pass
+
+def _np_any(a, axis=None, keepdims=False, out=None):
+    """
+    Test whether any array element along a given axis evaluates to True.
+    Returns a single boolean unless axis is not None.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array or object that can be converted to an array.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a logical OR reduction is performed.
+        The default (axis = None) is to perform a logical OR over
+        all the dimensions of the input array.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left in
+        the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
+    out : ndarray, optional
+        Alternate output array in which to place the result. It must have
+        the same shape as the expected output and its type is preserved.
+
+    Returns
+    -------
+    any : bool or ndarray
+        A new boolean or ndarray is returned unless out is specified,
+        in which case a reference to out is returned.
+
+    Examples
+    --------
+    >>> np.any([[True, False], [True, True]])
+    True
+
+    >>> np.any([[True, False], [False, False]], axis=0)
+    array([ True, False])
+
+    >>> np.any([-1, 0, 5])
+    True
+
+    >>> np.any(np.nan)
+    True
+
+    >>> o=np.array(False)
+    >>> z=np.any([-1, 4, 5], out=o)
+    >>> z, o
+    (array(True), array(True))
+    >>> # Check now that z is a reference to o
+    >>> z is o
+    True
+    >>> id(z), id(o) # identity of z and o              # doctest: +SKIP
+    (191614240, 191614240)
+    """
+    pass
+
+
 def _np_cumsum(a, axis=None, dtype=None, out=None):
     """
     Return the cumulative sum of the elements along a given axis.
@@ -630,7 +731,7 @@ def _np_squeeze(a, axis=None, out=None):
     pass
 
 
-def _np_max(a, axis=None, out=None, keepdims=False):
+def _np_max(a, axis=None, keepdims=False, out=None):
     """
     Return the maximum of an array or maximum along an axis.
     
@@ -694,7 +795,14 @@ def _np_max(a, axis=None, out=None, keepdims=False):
     pass
 
 
-def _np_min(a, axis=None, out=None, keepdims=False):
+def _np_amax(a, axis=None, keepdims=False, out=None):
+    """
+    Doc placeholder for `amax`; refer to `_np_max` for the full documentation.
+    """
+    pass
+
+
+def _np_min(a, axis=None, keepdims=False, out=None):
     """
     Return the minimum of an array or minimum along an axis.
     
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index 01051ab7c8e4..ce22fb753ace 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -27,6 +27,7 @@
 import logging
 import os
 import shutil
+import warnings
 import numpy as np
 from ..base import _LIB, check_call, py_str
 from ..base import c_array, c_str, mx_uint, c_str_array
@@ -419,6 +420,7 @@ def __init__(self, calib_data):
         else:
             data_example = [data_example]
         # suppose there must be one label in data_example
+        # TODO(xinyu-intel): little tricky here, need to refactor.
         num_data = len(data_example)
         assert num_data > 0
         # here reshape is to handle the 5D/6D input data
@@ -426,6 +428,10 @@ def __init__(self, calib_data):
             data_example[0] = data_example[0].reshape((-1,) + data_example[0].shape[2:])
         self.provide_data = [DataDesc(name='data', shape=(data_example[0].shape))]
         self.provide_data += [DataDesc(name='data{}'.format(i), shape=x.shape) for i, x in enumerate(data_example[1:])]
+        # data0, data1, ..., label
+        if num_data >= 3:
+            self.provide_data = [DataDesc(name='data{}'.format(i), shape=x.shape)
+                                 for i, x in enumerate(data_example[0:])]
         self.batch_size = data_example[0].shape[0]
         self.reset()
 
@@ -607,7 +613,9 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
         A tuple of quantized symbol, quantized arg_params, and aux_params.
     -------
     """
-    if ctx != cpu():
+    if not isinstance(ctx, Context):
+        raise ValueError('currently only supports single ctx, while received %s' % str(ctx))
+    if ctx.device_type != 'cpu':
         raise ValueError(
             'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend')
 
@@ -627,8 +635,9 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
     return qsym, qarg_params, aux_params
 
 def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
-                   excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
-                   quantized_dtype='int8', quantize_mode='full', logger=None):
+                   excluded_sym_names=None, excluded_op_names=None,
+                   calib_mode='entropy', quantized_dtype='int8', quantize_mode='full',
+                   LayerOutputCollector=None, logger=None):
     """User-level API for generating a quantized model from a FP32 model w/o calibration
     and a collector for naive or entropy calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -667,6 +676,8 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
         The mode that quantization pass to apply. Support 'full' and 'smart'.
         'full' means quantize all operator if possible.
         'smart' means quantization pass will smartly choice which operator should be quantized.
+    LayerOutputCollector : class
+        Custom collector that is used when `calib_mode` is 'customize'.
     logger : Object
         A logging object for printing information during the process of quantization.
     Returns
@@ -711,9 +722,14 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
             if logger:
                 logger.info(
                     'Create a layer output minmax collector for naive calibration')
+        elif calib_mode == 'customize' and LayerOutputCollector is not None:
+            collector = LayerOutputCollector
+            if logger:
+                logger.info(
+                    'Create a customize layer output minmax collector for calibration')
         else:
             raise ValueError('unknown calibration mode %s received,'
-                             ' expected `none`, `naive`, or `entropy`' % calib_mode)
+                             ' expected `none`, `naive`, `entropy` or `customize`' % calib_mode)
         if logger:
             logger.info('Collector created, please use set_monitor_callback'
                         ' to collect calibration information.')
@@ -770,9 +786,11 @@ def calib_graph(qsym, arg_params, aux_params, collector,
                 collector.hist_dict, quantized_dtype, logger=logger)
         elif calib_mode == 'naive':
             th_dict = collector.min_max_dict
+        elif calib_mode == 'customize':
+            th_dict = collector.min_max_dict
         else:
             raise ValueError('unknown calibration mode %s received,'
-                             ' expected `none`, `naive`, or `entropy`' % calib_mode)
+                             ' expected `none`, `naive`, `entropy` or `customize`' % calib_mode)
         qsym = _calibrate_quantized_sym(qsym, th_dict)
     else:
         raise ValueError('please set calibration mode to naive or entropy.')
@@ -783,10 +801,10 @@ def calib_graph(qsym, arg_params, aux_params, collector,
 
     return qsym, qarg_params, aux_params
 
-def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
-                 exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
-                 calib_data=None, data_shapes=None, calib_mode='none',
-                 num_calib_examples=None, ctx=cpu(), logger=None):
+def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full',
+                    exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
+                    calib_data=None, data_shapes=None, calib_mode='none',
+                    num_calib_examples=None, ctx=cpu(), LayerOutputCollector=None, logger=None):
     """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
@@ -830,6 +848,8 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
     ctx : Context
         Defines the device that users want to run forward propagation on the calibration
         dataset for collecting layer output statistics. Currently, only supports single context.
+    LayerOutputCollector : class
+        Custom collector, forwarded to `quantize_graph` for the 'customize' calibration mode.
     logger : Object
         A logging object for printing information during the process of quantization.
 
@@ -906,7 +926,8 @@ def __exit__(self, exc_type, exc_value, traceback):
     qsym, qarg_params, aux_params, collector = quantize_graph(
         sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx,
         excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators,
-        calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode, logger=logger)
+        calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
+        LayerOutputCollector=LayerOutputCollector, logger=logger)
 
     if calib_mode is not None and calib_mode != 'none':
         if not isinstance(ctx, Context):
@@ -915,7 +936,7 @@ def __exit__(self, exc_type, exc_value, traceback):
         if calib_data is None:
             raise ValueError(
                 'calib_data must be provided when calib_mode=%s' % calib_mode)
-        if calib_mode in ['naive', 'entropy']:
+        if calib_mode in ['naive', 'entropy', 'customize']:
             data_names = [pair[0] for pair in calib_data.provide_data]
             mod = Module(symbol=symnet, context=ctx,
                          data_names=data_names, label_names=None)
@@ -956,3 +977,19 @@ def __exit__(self, exc_type, exc_value, traceback):
         net.collect_params().load(param_name, cast_dtype=True, dtype_source='saved')
         net.collect_params().reset_ctx(ctx)
     return net
+
+def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
+                 exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
+                 calib_data=None, data_shapes=None, calib_mode='none',
+                 num_calib_examples=None, ctx=cpu(), logger=None):
+    """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration.
+       Will be deprecated after MXNet 2.0, please use quantize_net_v2; this wrapper only warns and forwards to it.
+    """
+    # stacklevel=2 attributes the warning to the caller's line instead of to this wrapper.
+    warnings.warn('WARNING: This will be deprecated after MXNet 2.0, please use quantize_net_v2.',
+                  stacklevel=2)
+    return quantize_net_v2(network=network, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
+                           exclude_layers=exclude_layers, exclude_layers_match=exclude_layers_match,
+                           exclude_operators=exclude_operators, calib_data=calib_data, data_shapes=data_shapes,
+                           calib_mode=calib_mode, num_calib_examples=num_calib_examples, ctx=ctx,
+                           LayerOutputCollector=None, logger=logger)
diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index 3b0858007197..65d015b18c33 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -28,7 +28,7 @@
 from . import _internal as _npi
 from ..ndarray import NDArray
 
-__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', 'invert',
+__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', 'invert', 'delete',
            'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not',
            'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs',
            'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2',
@@ -914,6 +914,67 @@ def mod(x1, x2, out=None, **kwargs):
     return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out)
 
 
+@set_module('mxnet.ndarray.numpy')
+def delete(arr, obj, axis=None):
+    """
+    Build a copy of `arr` from which the sub-arrays selected by `obj` along
+    `axis` have been dropped. On a one dimensional array the result holds
+    exactly the entries that `arr[obj]` would not return.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array to take the elements from.
+    obj : slice, int or ndarray of ints
+        Positions of the sub-arrays to drop along the chosen axis.
+    axis : int, optional
+        Axis along which the sub-arrays named by `obj` are dropped.
+        When it is None, `obj` indexes into the flattened array.
+
+    Returns
+    -------
+    out : ndarray
+        Copy of `arr` without the elements named by `obj`; the input is
+        not modified in place. When `axis` is None the result is a
+        flattened array.
+
+    Examples
+    --------
+    >>> arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
+    >>> arr
+    array([[ 1.,  2.,  3.,  4.],
+           [ 5.,  6.,  7.,  8.],
+           [ 9., 10., 11., 12.]])
+
+    >>> np.delete(arr, 1, 0)
+    array([[ 1.,  2.,  3.,  4.],
+           [ 9., 10., 11., 12.]])
+
+    >>> np.delete(arr, slice(None, None, 2), 1)
+    array([[ 2.,  4.],
+           [ 6.,  8.],
+           [10., 12.]])
+
+    >>> np.delete(arr, np.array([1,3,5]), None)
+    array([ 1.,  3.,  5.,  7.,  8.,  9., 10., 11., 12.])
+    >>> np.delete(arr, np.array([1,1,5]), None)
+    array([ 1.,  3.,  4.,  5.,  7.,  8.,  9., 10., 11., 12.])
+    """
+    if not isinstance(arr, NDArray):
+        raise TypeError("'arr' can not support type {}".format(str(type(arr))))
+    # Each supported kind of `obj` maps onto its own set of backend arguments.
+    if isinstance(obj, NDArray):
+        return _npi.delete(arr, obj, axis=axis)
+    if isinstance(obj, integer_types):
+        return _npi.delete(arr, int_ind=obj, axis=axis)
+    if not isinstance(obj, slice):
+        raise TypeError("'obj' can not support type {}".format(str(type(obj))))
+    # A slice without a step advances by one; start and stop are forwarded as given.
+    stride = obj.step if obj.step is not None else 1
+    return _npi.delete(arr, start=obj.start, stop=obj.stop,
+                       step=stride, axis=axis)
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_binary_func
 def remainder(x1, x2, out=None):
@@ -4783,6 +4844,25 @@ def around(x, decimals=0, out=None, **kwargs):
         raise TypeError('type {} not supported'.format(str(type(x))))
 
 
+@set_module('mxnet.ndarray.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round(x, decimals=0, out=None)
+    Round a scalar or an ndarray to `decimals` decimal places.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    from ...numpy import ndarray
+    # Plain numeric values go through `_np.around`; `out` is not forwarded there.
+    if isinstance(x, numeric_types):
+        return _np.around(x, decimals, **kwargs)
+    if not isinstance(x, ndarray):
+        raise TypeError('type {} not supported'.format(str(type(x))))
+    return _npi.around(x, decimals, out=out, **kwargs)
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py
index 4c49c35b4a44..e4fee158bea4 100644
--- a/python/mxnet/ndarray/numpy/linalg.py
+++ b/python/mxnet/ndarray/numpy/linalg.py
@@ -21,7 +21,7 @@
 from . import _op as _mx_nd_np
 from . import _internal as _npi
 
-__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv']
+__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve']
 
 
 def norm(x, ord=None, axis=None, keepdims=False):
@@ -461,3 +461,51 @@ def tensorinv(a, ind=2):
     True
     """
     return _npi.tensorinv(a, ind)
+
+
+def tensorsolve(a, b, axes=None):
+    r"""
+    Solve the tensor equation ``a x = b`` for x.
+    It is assumed that all indices of `x` are summed over in the product,
+    together with the rightmost indices of `a`, as is done in, for example,
+    ``tensordot(a, x, axes=x.ndim)``.
+
+    Parameters
+    ----------
+    a : ndarray
+        Coefficient tensor, of shape ``b.shape + Q``. `Q`, a tuple, equals
+        the shape of that sub-tensor of `a` consisting of the appropriate
+        number of its rightmost indices, and must be such that
+        ``prod(Q) == prod(b.shape)`` (in which sense `a` is said to be
+        'square').
+    b : ndarray
+        Right-hand tensor, which can be of any shape.
+    axes : tuple of ints, optional
+        Axes in `a` to reorder to the right, before inversion.
+        If None (default), no reordering is done.
+
+    Returns
+    -------
+    x : ndarray, shape Q
+
+    Raises
+    ------
+    MXNetError
+        If `a` is singular or not 'square' (in the above sense).
+
+    See Also
+    --------
+    numpy.tensordot, tensorinv, numpy.einsum
+
+    Examples
+    --------
+    >>> a = np.eye(2*3*4)
+    >>> a.shape = (2*3, 4, 2, 3, 4)
+    >>> b = np.random.randn(2*3, 4)
+    >>> x = np.linalg.tensorsolve(a, b)
+    >>> x.shape
+    (2, 3, 4)
+    >>> np.allclose(np.tensordot(a, x, axes=3), b)
+    True
+    """
+    return _npi.tensorsolve(a, b, axes)
diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py
index 2ee2d2670693..96fe1d311028 100644
--- a/python/mxnet/numpy/linalg.py
+++ b/python/mxnet/numpy/linalg.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import
 from ..ndarray import numpy as _mx_nd_np
 
-__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv']
+__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve']
 
 
 def norm(x, ord=None, axis=None, keepdims=False):
@@ -479,3 +479,51 @@ def tensorinv(a, ind=2):
     True
     """
     return _mx_nd_np.linalg.tensorinv(a, ind)
+
+
+def tensorsolve(a, b, axes=None):
+    r"""
+    Solve the tensor equation ``a x = b`` for x.
+    It is assumed that all indices of `x` are summed over in the product,
+    together with the rightmost indices of `a`, as is done in, for example,
+    ``tensordot(a, x, axes=x.ndim)``.
+
+    Parameters
+    ----------
+    a : ndarray
+        Coefficient tensor, of shape ``b.shape + Q``. `Q`, a tuple, equals
+        the shape of that sub-tensor of `a` consisting of the appropriate
+        number of its rightmost indices, and must be such that
+        ``prod(Q) == prod(b.shape)`` (in which sense `a` is said to be
+        'square').
+    b : ndarray
+        Right-hand tensor, which can be of any shape.
+    axes : tuple of ints, optional
+        Axes in `a` to reorder to the right, before inversion.
+        If None (default), no reordering is done.
+
+    Returns
+    -------
+    x : ndarray, shape Q
+
+    Raises
+    ------
+    MXNetError
+        If `a` is singular or not 'square' (in the above sense).
+
+    See Also
+    --------
+    numpy.tensordot, tensorinv, numpy.einsum
+
+    Examples
+    --------
+    >>> a = np.eye(2*3*4)
+    >>> a.shape = (2*3, 4, 2, 3, 4)
+    >>> b = np.random.randn(2*3, 4)
+    >>> x = np.linalg.tensorsolve(a, b)
+    >>> x.shape
+    (2, 3, 4)
+    >>> np.allclose(np.tensordot(a, x, axes=3), b)
+    True
+    """
+    return _mx_nd_np.linalg.tensorsolve(a, b, axes)
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 5a4035e15223..3eab52596e0d 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -47,7 +47,7 @@
 from ..ndarray.ndarray import _storage_type
 
 __all__ = ['ndarray', 'empty', 'array', 'shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like',
-           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not',
+           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not', 'delete',
            'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'invert',
            'sqrt', 'cbrt', 'abs', 'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log',
            'degrees', 'log2', 'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'histogram',
@@ -1377,7 +1377,7 @@ def argsort(self, axis=-1, kind=None, order=None):  # pylint: disable=arguments-
         The arguments are the same as for :py:func:`argsort`, with
         this array as data.
         """
-        raise argsort(self, axis=axis, kind=kind, order=order)
+        return argsort(self, axis=axis, kind=kind, order=order)
 
     def argmax_channel(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`argmax_channel`.
@@ -1558,13 +1558,13 @@ def norm(self, *args, **kwargs):
         """
         raise AttributeError('mxnet.numpy.ndarray object has no attribute norm')
 
-    def round(self, *args, **kwargs):
+    def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ
         """Convenience fluent method for :py:func:`round`.
 
         The arguments are the same as for :py:func:`round`, with
         this array as data.
         """
-        raise NotImplementedError
+        return round(self, decimals=decimals, out=out, **kwargs)
 
     def rint(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`rint`.
@@ -5887,6 +5887,55 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):  # pylint:
     return _npi.std(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out)
 
 
+@set_module('mxnet.numpy')
+def delete(arr, obj, axis=None):
+    """
+    Return a new array with sub-arrays along an axis deleted. For a one
+    dimensional array, this returns those entries not returned by
+    `arr[obj]`.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Input array.
+    obj : slice, int or ndarray of ints
+        Indicate indices of sub-arrays to remove along the specified axis.
+    axis : int, optional
+        The axis along which to delete the subarray defined by `obj`.
+        If `axis` is None, `obj` is applied to the flattened array.
+
+    Returns
+    -------
+    out : ndarray
+        A copy of `arr` with the elements specified by `obj` removed. Note
+        that `delete` does not occur in-place. If `axis` is None, `out` is
+        a flattened array.
+
+    Examples
+    --------
+    >>> arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
+    >>> arr
+    array([[ 1.,  2.,  3.,  4.],
+           [ 5.,  6.,  7.,  8.],
+           [ 9., 10., 11., 12.]])
+
+    >>> np.delete(arr, 1, 0)
+    array([[ 1.,  2.,  3.,  4.],
+           [ 9., 10., 11., 12.]])
+
+    >>> np.delete(arr, slice(None, None, 2), 1)
+    array([[ 2.,  4.],
+           [ 6.,  8.],
+           [10., 12.]])
+
+    >>> np.delete(arr, np.array([1,3,5]), None)
+    array([ 1.,  3.,  5.,  7.,  8.,  9., 10., 11., 12.])
+    >>> np.delete(arr, np.array([1,1,5]), None)
+    array([ 1.,  3.,  4.,  5.,  7.,  8.,  9., 10., 11., 12.])
+    """
+    return _mx_nd_np.delete(arr, obj, axis=axis)
+
+
 @set_module('mxnet.numpy')
 def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):  # pylint: disable=too-many-arguments
     """
@@ -6495,6 +6544,19 @@ def around(x, decimals=0, out=None, **kwargs):
     return _mx_nd_np.around(x, decimals, out=out, **kwargs)
 
 
+@set_module('mxnet.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round(x, decimals=0, out=None)
+    Round an array to the given number of decimals.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    return _mx_nd_np.around(x, decimals, out=out, **kwargs)
+
+
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py
index ebc24de63282..95719a005cec 100644
--- a/python/mxnet/numpy/random.py
+++ b/python/mxnet/numpy/random.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import
 from ..ndarray import numpy as _mx_nd_np
 
-__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle"]
+__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle", "randn"]
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -357,3 +357,44 @@ def shuffle(x):
            [0., 1., 2.]])
     """
     _mx_nd_np.random.shuffle(x)
+
+
+def randn(*size, **kwargs):
+    r"""Draw samples from the "standard normal" distribution.
+
+    The positional arguments are collected into the shape ``(d0, d1, ..., dn)``
+    and forwarded, together with any keyword arguments, to `normal`, which is
+    asked for a univariate "normal" (Gaussian) distribution of mean 0 and
+    variance 1. Without positional arguments the empty shape ``()`` is used.
+    This is a convenience function.  If you want an interface that takes a
+    tuple as the first argument, use `numpy.random.standard_normal` instead.
+
+    Parameters
+    ----------
+    d0, d1, ..., dn : int, optional
+        The dimensions of the returned array, should be all positive.
+
+    Returns
+    -------
+    Z : ndarray
+        A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from
+        the standard normal distribution.
+
+    Notes
+    -----
+    For random samples from :math:`N(\mu, \sigma^2)`, use:
+    ``sigma * np.random.randn(...) + mu``
+
+    Examples
+    --------
+    >>> np.random.randn()
+    2.1923875335537315 #random
+
+    Two-by-four array of samples from N(3, 6.25):
+
+    >>> 2.5 * np.random.randn(2, 4) + 3
+    array([[-4.49401501,  4.00950034, -1.81814867,  7.29718677],  #random
+        [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]]) #random
+    """
+    # `size` already holds the requested dimensions in order, so it is the shape.
+    return _mx_nd_np.random.normal(0, 1, size=tuple(size), **kwargs)
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index c7e9dd1398eb..65486e6e5f37 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -83,9 +83,12 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
 
 
 _NUMPY_ARRAY_FUNCTION_LIST = [
+    'all',
+    'any',
     'argmin',
     'argmax',
     'around',
+    'round',
     'argsort',
     'append',
     'broadcast_arrays',
@@ -103,6 +106,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'flip',
     'inner',
     'max',
+    'amax',
     'mean',
     'min',
     'nonzero',
@@ -125,6 +129,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'transpose',
     'unique',
     'unravel_index',
+    'delete',
     'var',
     'vdot',
     'vstack',
@@ -135,6 +140,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'linalg.inv',
     'linalg.solve',
     'linalg.tensorinv',
+    'linalg.tensorsolve',
     'shape',
     'trace',
     'tril',
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 86eab23ddfea..6f9d32abd336 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -36,7 +36,7 @@
 except ImportError:
     from builtins import slice as py_slice
 
-__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like', 'bitwise_not', 'invert',
+__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like', 'bitwise_not', 'invert', 'delete',
            'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'arctan2',
            'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs', 'absolute', 'exp',
            'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2', 'log1p',
@@ -492,7 +492,7 @@ def argsort(self, axis=-1, kind=None, order=None):  # pylint: disable=arguments-
         The arguments are the same as for :py:func:`argsort`, with
         this array as data.
         """
-        raise argsort(self, axis=axis, kind=kind, order=order)
+        return argsort(self, axis=axis, kind=kind, order=order)
 
     def argmax_channel(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`argmax_channel`.
@@ -666,13 +666,13 @@ def norm(self, *args, **kwargs):
         """
         raise AttributeError('_Symbol object has no attribute norm')
 
-    def round(self, *args, **kwargs):
+    def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ
         """Convenience fluent method for :py:func:`round`.
 
         The arguments are the same as for :py:func:`round`, with
         this array as data.
         """
-        raise NotImplementedError
+        return round(self, decimals=decimals, out=out, **kwargs)
 
     def rint(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`rint`.
@@ -3162,6 +3162,45 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None):
     return _npi.arange(start=start, stop=stop, step=step, dtype=dtype, ctx=ctx)
 
 
+@set_module('mxnet.symbol.numpy')
+def delete(arr, obj, axis=None):
+    """
+    Build a copy of `arr` from which the sub-arrays selected by `obj` along
+    `axis` have been dropped. On a one dimensional array the result holds
+    exactly the entries that `arr[obj]` would not return.
+
+    Parameters
+    ----------
+    arr : _Symbol
+        Array to take the elements from.
+    obj : slice, scalar or _Symbol of ints
+        Positions of the sub-arrays to drop along the chosen axis.
+    axis : scalar, optional
+        Axis along which the sub-arrays named by `obj` are dropped.
+        When it is None, `obj` indexes into the flattened array.
+
+    Returns
+    -------
+    out : _Symbol
+        Copy of `arr` without the elements named by `obj`; the input is
+        not modified in place. When `axis` is None the result is a
+        flattened array.
+    """
+    if not isinstance(arr, Symbol):
+        raise TypeError("'arr' can not support type {}".format(str(type(arr))))
+    # Each supported kind of `obj` maps onto its own set of backend arguments.
+    if isinstance(obj, Symbol):
+        return _npi.delete(arr, obj, axis=axis)
+    if isinstance(obj, integer_types):
+        return _npi.delete(arr, int_ind=obj, axis=axis)
+    if not isinstance(obj, slice):
+        raise TypeError("'obj' can not support type {}".format(str(type(obj))))
+    # A slice without a step advances by one; start and stop are forwarded as given.
+    stride = obj.step if obj.step is not None else 1
+    return _npi.delete(arr, start=obj.start, stop=obj.stop,
+                       step=stride, axis=axis)
+
+
 # pylint: disable=redefined-outer-name
 @set_module('mxnet.symbol.numpy')
 def split(ary, indices_or_sections, axis=0):
@@ -4554,6 +4593,24 @@ def around(x, decimals=0, out=None, **kwargs):
         raise TypeError('type {} not supported'.format(str(type(x))))
 
 
+@set_module('mxnet.symbol.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round(x, decimals=0, out=None)
+    Round a scalar or a symbol to `decimals` decimal places.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    # Plain numeric values go through `_np.around`; `out` is not forwarded there.
+    if isinstance(x, numeric_types):
+        return _np.around(x, decimals, **kwargs)
+    if not isinstance(x, _Symbol):
+        raise TypeError('type {} not supported'.format(str(type(x))))
+    return _npi.around(x, decimals, out=out, **kwargs)
+
+
 @set_module('mxnet.symbol.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py
index a445c79001ec..0bfbb6ee540f 100644
--- a/python/mxnet/symbol/numpy/linalg.py
+++ b/python/mxnet/symbol/numpy/linalg.py
@@ -22,7 +22,7 @@
 from . import _op as _mx_sym_np
 from . import _internal as _npi
 
-__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv']
+__all__ = ['norm', 'svd', 'cholesky', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve']
 
 
 def norm(x, ord=None, axis=None, keepdims=False):
@@ -448,3 +448,51 @@ def tensorinv(a, ind=2):
     True
     """
     return _npi.tensorinv(a, ind)
+
+
+def tensorsolve(a, b, axes=None):
+    r"""
+    Solve the tensor equation ``a x = b`` for x.
+    It is assumed that all indices of `x` are summed over in the product,
+    together with the rightmost indices of `a`, as is done in, for example,
+    ``tensordot(a, x, axes=x.ndim)``.
+
+    Parameters
+    ----------
+    a : ndarray
+        Coefficient tensor, of shape ``b.shape + Q``. `Q`, a tuple, equals
+        the shape of that sub-tensor of `a` consisting of the appropriate
+        number of its rightmost indices, and must be such that
+        ``prod(Q) == prod(b.shape)`` (in which sense `a` is said to be
+        'square').
+    b : ndarray
+        Right-hand tensor, which can be of any shape.
+    axes : tuple of ints, optional
+        Axes in `a` to reorder to the right, before inversion.
+        If None (default), no reordering is done.
+
+    Returns
+    -------
+    x : ndarray, shape Q
+
+    Raises
+    ------
+    MXNetError
+        If `a` is singular or not 'square' (in the above sense).
+
+    See Also
+    --------
+    numpy.tensordot, tensorinv, numpy.einsum
+
+    Examples
+    --------
+    >>> a = np.eye(2*3*4)
+    >>> a.shape = (2*3, 4, 2, 3, 4)
+    >>> b = np.random.randn(2*3, 4)
+    >>> x = np.linalg.tensorsolve(a, b)
+    >>> x.shape
+    (2, 3, 4)
+    >>> np.allclose(np.tensordot(a, x, axes=3), b)
+    True
+    """
+    return _npi.tensorsolve(a, b, axes)
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index e3a3c0443428..4176d3a68792 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -1101,6 +1101,14 @@ struct minimum : public mxnet_op::tunable {
   }
 };
 
+/*! \brief boolean any/all kernel predicate: true iff elem compares unequal to DType(0) */
+struct NonZero {
+  template<typename DType>
+  MSHADOW_XINLINE static bool Map(DType a) {
+    return (a != DType(0));
+  }
+};
+
 /*! \brief sum reducer that ignores NaN values in the input */
 struct nansum {
   /*! \brief do reduction into dst */
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index b15117f9f83b..d7752c4759db 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -1148,6 +1148,29 @@ struct set_to_int : public tunable {
  */
 using set_zero = set_to_int<0>;
 using set_one  = set_to_int<1>;
+
+/*!
+ * \brief Kernel that fills its output with a compile-time boolean constant
+ * \tparam val Boolean immediate to store
+ */
+template<bool val>
+struct set_to_bool : public tunable {
+  // mxnet_op flavour: invoked per index when launched through Kernel<>::Launch()
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(index_t idx, DType *out) {
+    out[idx] = static_cast<DType>(val);
+  }
+  // mshadow_op flavour: argument-less form used with op_with_req<>
+  MSHADOW_XINLINE static int Map() {
+    return static_cast<int>(val);
+  }
+};
+
+/*!
+ * \brief Shorthand aliases of set_to_bool for the two boolean constants, true and false
+ */
+using set_true = set_to_bool<true>;
+using set_false = set_to_bool<false>;
 }  // namespace mxnet_op
 
 }  // namespace op
diff --git a/src/operator/numpy/linalg/np_tensorsolve-inl.h b/src/operator/numpy/linalg/np_tensorsolve-inl.h
new file mode 100644
index 000000000000..829a119b64a2
--- /dev/null
+++ b/src/operator/numpy/linalg/np_tensorsolve-inl.h
@@ -0,0 +1,557 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_tensorsolve-inl.h
+ * \brief Forward and backward implementation of the numpy-compatible tensorsolve operator
+ */
+#ifndef MXNET_OPERATOR_NUMPY_LINALG_NP_TENSORSOLVE_INL_H_
+#define MXNET_OPERATOR_NUMPY_LINALG_NP_TENSORSOLVE_INL_H_
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../operator_common.h"
+#include "../../mshadow_op.h"
+#include "../../tensor/la_op.h"
+#include "../../tensor/la_op-inl.h"
+#include "../np_tensordot_op-inl.h"
+#include "./np_solve-inl.h"
+
+namespace mxnet {
+namespace op {
+
+using namespace mshadow;
+
+struct TensorsolveParam : public dmlc::Parameter<TensorsolveParam> {
+  mxnet::Tuple<int> a_axes;  // axes of `a` to move to the right; default-constructed if unset
+  DMLC_DECLARE_PARAMETER(TensorsolveParam) {
+    DMLC_DECLARE_FIELD(a_axes)
+    .set_default(mxnet::Tuple<int>())
+    .describe("Tuple of ints, optional. Axes in a to reorder to the right, before inversion.");
+  }
+};
+
+// Fix negative axes: map them into [0, ndim); axes outside [-ndim, ndim) raise an error.
+inline void FixNegativeAxes(mxnet::Tuple<int> *a_axes_param, const mxnet::TShape& a_shape) {
+  if (-1 == a_axes_param->ndim()) { return; }  // axes not specified: nothing to normalize
+  const int a_ndim = a_shape.ndim();
+  for (auto& i : *a_axes_param) {
+    CHECK(i >= -a_ndim && i < a_ndim) << "axis " << i << " out of bounds for ndim " << a_ndim;
+    if (i < 0) { i += a_ndim; }  // no modulo: avoids dividing by zero when `a` is a scalar
+  }
+}
+
+// Split the axes of `a` into the ones that stay in place (`a_axes_remained`) and build the
+// full permutation `a_axes` = remained axes followed by `a_axes_param` (moved to the right).
+inline void GetReorderedAxes(const mxnet::Tuple<int>& a_axes_param,
+                             mxnet::Tuple<int> *a_axes_remained,
+                             mxnet::Tuple<int> *a_axes,
+                             const mxnet::TShape& a_shape) {
+  std::vector<int> a_axes_vec;
+  for (int i = 0; i < a_shape.ndim(); ++i) {
+    a_axes_vec.push_back(i);
+  }
+  // Axes not specified: identity permutation, nothing is moved.
+  if (-1 == a_axes_param.ndim()) {
+    *a_axes_remained = mxnet::Tuple<int>(a_axes_vec);
+    *a_axes = mxnet::Tuple<int>(a_axes_vec);
+    return;
+  }
+  for (const auto& i : a_axes_param) {
+    // A miss means `i` is out of range or listed twice; erasing end() would be undefined.
+    auto pos = std::find(a_axes_vec.begin(), a_axes_vec.end(), i);
+    CHECK(pos != a_axes_vec.end()) << "axis " << i << " is out of range or repeated in axes";
+    a_axes_vec.erase(pos);
+  }
+  *a_axes_remained = mxnet::Tuple<int>(a_axes_vec);
+  // a_axes_vec already holds the remained axes in order; append the moved ones.
+  for (const auto& i : a_axes_param) {
+    a_axes_vec.push_back(i);
+  }
+  *a_axes = mxnet::Tuple<int>(a_axes_vec);
+}
+
+// Output shape for tensor inputs: the trailing axes of `a` that remain after skipping the
+// leading ones (b_ndim of them when a_ndim > b_ndim, otherwise b_ndim - a_ndim).
+inline mxnet::TShape GetOutShape(const mxnet::TShape& a_shape,
+                                 const mxnet::TShape& b_shape) {
+  const int a_ndim = a_shape.ndim();
+  const int b_ndim = b_shape.ndim();
+  const int skip = (a_ndim > b_ndim) ? b_ndim : (b_ndim - a_ndim);
+  mxnet::TShape out_shape(a_ndim - skip, -1);
+  for (int dst = 0; dst < a_ndim - skip; ++dst) { out_shape[dst] = a_shape[dst + skip]; }
+  return out_shape;
+}
+
+// Returns the number of bytes of temporary workspace required by the tensorsolve forward
+// pass; 0 when no computation (and therefore no scratch memory) is needed.
+template<typename xpu>
+size_t TensorsolveForwardWorkspaceSize(const Tuple<int>& a_axes_param,
+                                       const TBlob& a,
+                                       const TBlob& b,
+                                       const TBlob& out,
+                                       const std::vector<OpReqType>& req) {
+  // Nothing requested, or zero-size output:
+  // no kernel is launched, so no workspace is needed.
+  if (kNullOp == req[0] || 0U == out.shape_.Size()) { return 0U; }
+
+  const mxnet::TShape& a_shape = a.shape_;
+  const mxnet::TShape& b_shape = b.shape_;
+  const bool empty_input = (0U == a_shape.Size()) || (0U == b_shape.Size());
+  const bool has_scalar = (0 == a_shape.ndim()) || (0 == b_shape.ndim());
+  // One casted copy of `a` is enough when a scalar is involved; two tensors additionally
+  // need room for the transposed copy of `a`.
+  const size_t a_copies = has_scalar ? 1U : 2U;
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(out.type_flag_, DType, {
+    // 0-size input: nothing to stage.
+    if (empty_input) { return 0U; }
+    // DType buffers for a (x a_copies) and b, followed by b.Size() ints for the pivots.
+    return (a_copies * a.Size() + b.Size()) * sizeof(DType) + b.Size() * sizeof(int);
+  });
+  LOG(FATAL) << "InternalError: cannot reach here";
+  return 0U;
+}
+
+template<int req>
+struct assign_helper {  // copy kernel: dst[idx] takes src[idx], mode chosen by `req`
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int idx, const DType *src, DType *dst) {
+    KERNEL_ASSIGN(dst[idx], req, src[idx]);
+  }
+};
+
+// Linear-algebra policy plugged into the forward pass: delegates the solve to linalg_solve.
+struct tensorsolve {
+  template<typename xpu, typename DType>
+  static void op(const Tensor<xpu, 2, DType>& A,
+                 const Tensor<xpu, 2, DType>& X,
+                 const Tensor<xpu, 1, int>& ipiv,
+                 const OpContext& ctx) {
+    linalg_solve(A, X, ipiv, ctx.get_stream<xpu>());  // ipiv: work space for Lapacke_#gesv
+  }
+};
+
+template<typename xpu, typename laop>
+void TensorsolveOpForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);  // a, b
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TBlob& a = inputs[0];
+  const TBlob& b = inputs[1];
+  const TBlob& out = outputs[0];
+  const mxnet::TShape a_shape = a.shape_;
+  const mxnet::TShape b_shape = b.shape_;
+  const mxnet::TShape out_shape = out.shape_;
+  const TensorsolveParam& param = nnvm::get<TensorsolveParam>(attrs.parsed);
+  mxnet::Tuple<int> a_axes_param = param.a_axes;
+  FixNegativeAxes(&a_axes_param, a_shape);  // normalize negative axes against a's rank
+
+  size_t workspace_size = TensorsolveForwardWorkspaceSize<xpu>(a_axes_param, a, b, out, req);
+  Tensor<xpu, 1, char> workspace = ctx.requested[0].get_space_typed<xpu, 1, char>(
+    Shape1(workspace_size), ctx.get_stream<xpu>());
+
+  if (kNullOp == req[0]) { return; }  // output not requested
+
+  // Zero-size output, no need to launch kernel
+  if (0U == out.shape_.Size()) { return; }
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(out.type_flag_, DType, {
+    if (0U == a_shape.Size() || 0U == b_shape.Size()) {  // 0-size input: zero-fill the output
+      if (req[0] != kAddTo) {
+        Tensor<xpu, 1, DType> out_tensor =
+          out.get_with_shape<xpu, 1, DType>(Shape1(out.shape_.Size()), s);
+        out_tensor = static_cast<DType>(0);
+      }
+    } else if (0U == a_shape.ndim() || 0U ==  b_shape.ndim()) {  // At least 1 scalar.
+      // Both operands must hold exactly one element in the scalar case.
+      CHECK_EQ(a_shape.Size(), 1U)
+        << "a's and b's dimensions don't match";
+      CHECK_EQ(b_shape.Size(), 1U)
+        << "a's and b's dimensions don't match";
+
+      DType* a_ptr =
+        reinterpret_cast<DType*>(workspace.dptr_);  // workspace layout: [a | b | ipiv]
+      DType* b_ptr =
+        reinterpret_cast<DType*>(workspace.dptr_+ a.Size() * sizeof(DType));
+      int* ipiv_ptr =
+        reinterpret_cast<int*>(workspace.dptr_ + (a.Size() + b.Size()) * sizeof(DType));
+
+      // Copy a and b into the workspace as DType (the output's element type).
+      MSHADOW_TYPE_SWITCH(a.type_flag_, AType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, a_shape.Size(), a_ptr, a.dptr<AType>());
+      });
+      MSHADOW_TYPE_SWITCH(b.type_flag_, BType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, b_shape.Size(), b_ptr, b.dptr<BType>());
+      });
+
+      mxnet::TBlob a_tblob(a_ptr, Shape2(1, 1), a.dev_mask(), a.dev_id());
+      mxnet::TBlob b_tblob(b_ptr, Shape2(1, 1), b.dev_mask(), b.dev_id());
+      mxnet::TBlob ipiv_tblob(ipiv_ptr, Shape1(1), out.dev_mask(), out.dev_id());
+      Tensor<xpu, 2, DType> a_tensor = a_tblob.get<xpu, 2, DType>(s);
+      Tensor<xpu, 2, DType> b_tensor = b_tblob.get<xpu, 2, DType>(s);
+      Tensor<xpu, 1, int> ipiv_tensor = ipiv_tblob.get<xpu, 1, int>(s);
+
+      // Solve the 1x1 system; the result is copied from b_tensor into out below.
+      laop::op(a_tensor, b_tensor, ipiv_tensor, ctx);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+          mxnet_op::Kernel<assign_helper<req_type>, xpu>::Launch(
+            s, out_shape.Size(), b_tensor.dptr_, out.dptr<DType>());
+      });
+    } else {
+      // Both operands have at least one dimension.
+      Tuple<int> a_axes_remained;
+      Tuple<int> a_axes;
+      GetReorderedAxes(a_axes_param, &a_axes_remained, &a_axes, a_shape);
+      mxnet::TShape a_transpose_shape = GetReorderedShape(a_shape, a_axes);
+      const int N = b_shape.Size();  // the system is treated as an N x N matrix problem
+
+      DType* a_ptr =
+        reinterpret_cast<DType*>(workspace.dptr_);  // workspace layout: [a | a_trans | b | ipiv]
+      DType* a_trans_ptr =
+        reinterpret_cast<DType*>(workspace.dptr_ + a.Size() * sizeof(DType));
+      DType* b_ptr =
+        reinterpret_cast<DType*>(workspace.dptr_ + 2 * a.Size() * sizeof(DType));
+      int* ipiv_ptr =
+        reinterpret_cast<int*>(workspace.dptr_ + (2 * a.Size() + b.Size()) * sizeof(DType));
+
+      // Copy a into the workspace as DType (the output's element type).
+      MSHADOW_TYPE_SWITCH(a.type_flag_, AType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, a_shape.Size(), a_ptr, a.dptr<AType>());
+      });
+      // Copy b into the workspace as DType.
+      MSHADOW_TYPE_SWITCH(b.type_flag_, BType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, b_shape.Size(), b_ptr, b.dptr<BType>());
+      });
+
+      mxnet::TBlob a_tblob =
+        TBlob(a_ptr, a_shape, a.dev_mask(), a.dev_id());
+      mxnet::TBlob a_transpose_tblob =
+        TBlob(a_trans_ptr, a_transpose_shape, a.dev_mask(), a.dev_id());
+      mxnet::TBlob b_tblob =
+        TBlob(b_ptr, b_shape, b.dev_mask(), b.dev_id());
+      mxnet::TBlob ipiv_tblob =
+        TBlob(ipiv_ptr, b_shape, out.dev_mask(), out.dev_id());
+      mxnet::op::TransposeImpl<xpu>(ctx.run_ctx,
+                                    a_tblob,            // src
+                                    a_transpose_tblob,  // res
+                                    mxnet::TShape(a_axes.begin(), a_axes.end()));
+
+      Tensor<xpu, 2, DType> a_tensor =
+        a_tblob.get_with_shape<xpu, 2, DType>(Shape2(N, N), s);
+      Tensor<xpu, 1, int> ipiv_tensor =
+        ipiv_tblob.get_with_shape<xpu, 1, int>(Shape1(N), s);
+      Tensor<xpu, 2, DType> b_tensor =
+        b_tblob.get_with_shape<xpu, 2, DType>(Shape2(1, N), s);
+      Tensor<xpu, 2, DType> out_tensor =
+        out.get_with_shape<xpu, 2, DType>(Shape2(1, N), s);
+
+      a_tblob = a_tblob.reshape(Shape2(N, N));
+      a_transpose_tblob = a_transpose_tblob.reshape(Shape2(N, N));
+      Tuple<int> a_axes_2D(std::vector<int>{1, 0});  // swaps the two matrix axes
+      mxnet::op::TransposeImpl<xpu>(ctx.run_ctx,
+                                    a_transpose_tblob,  // src
+                                    a_tblob,            // res
+                                    mxnet::TShape(a_axes_2D.begin(), a_axes_2D.end()));
+      // Solve linear equation; the solution is copied from b_tensor into out below.
+      laop::op(a_tensor, b_tensor, ipiv_tensor, ctx);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+        mxnet_op::Kernel<assign_helper<req_type>, xpu>::Launch(
+          s, out_shape.Size(), b_tensor.dptr_, out_tensor.dptr_);
+      });
+    }
+  });
+}
+
+// Calculates workspace size (in bytes) of tensorsolve backward.
+template<typename xpu>
+size_t TensorsolveBackwardWorkspaceSize(const TBlob& out_grad,
+                                        const TBlob& a,
+                                        const TBlob& b,
+                                        const TBlob& x) {
+  const mxnet::TShape& a_shape = a.shape_;
+  const mxnet::TShape& b_shape = b.shape_;
+  const mxnet::TShape& x_shape = x.shape_;
+
+  // Zero-size input, no need to launch kernel
+  if (0U == a_shape.Size() || 0U == b_shape.Size()) { return 0U; }
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    size_t work_space_size = 0;  // size_t: the a-sized terms can exceed INT_MAX bytes
+    if (0 == a_shape.ndim() || 0 == b_shape.ndim()) {
+      // At least 1 scalar.
+      work_space_size += sizeof(DType) * a_shape.Size();  // for tensorinv(a)
+      work_space_size += sizeof(DType) * a_shape.Size();  // for getri work space lu
+      work_space_size += sizeof(int) * b_shape.Size();    // for getri work space pivot
+    } else {
+      // Two tensors of at least 1 dimensions.
+      work_space_size += sizeof(DType) * a_shape.Size();  // for tensorinv(a)
+      work_space_size += sizeof(DType) * a_shape.Size();  // for getri work space lu
+      work_space_size += sizeof(DType) * b_shape.Size();  // for b
+      work_space_size += sizeof(DType) * x_shape.Size();  // for x
+      work_space_size += sizeof(DType) * a_shape.Size();  // for grad_a
+      work_space_size += sizeof(DType) * b_shape.Size();  // for grad_b
+      work_space_size += sizeof(int) * b_shape.Size();    // for getri work space pivot
+    }
+    return work_space_size;
+  });
+  LOG(FATAL) << "InternalError: cannot reach here";
+  return 0U;
+}
+
+// Invert the permutation `a_axes`: the result maps every reordered position back to its
+// original axis, i.e. a_origin_axes[a_axes[k]] == k.
+inline void GetOriginAxes(const mxnet::TShape& a_shape,
+                          const mxnet::Tuple<int>& a_axes,
+                          mxnet::Tuple<int> *a_origin_axes) {
+  const int ndim = a_shape.ndim();
+  std::vector<int> inverse_perm(ndim, -1);
+  for (int k = 0; k < ndim; ++k) { inverse_perm[a_axes[k]] = k; }
+  *a_origin_axes = mxnet::Tuple<int>(inverse_perm);
+}
+
+struct tensorsolve_backward {
+  template<typename xpu, typename DType>
+  static void op(const Tensor<xpu, 3, DType>& dX,
+                 const Tensor<xpu, 3, DType>& inv_A,
+                 const Tensor<xpu, 3, DType>& B,
+                 const Tensor<xpu, 3, DType>& X,
+                 const Tensor<xpu, 3, DType>& dA,
+                 const Tensor<xpu, 3, DType>& dB,
+                 const OpContext& ctx) {
+    Stream<xpu> *stream = ctx.get_stream<xpu>();
+    // (1) calculate dB = trans(tensorinv(A)) * dX
+    gemm2::op(inv_A, dX, dB, DType(1), true, false, stream);
+    // (2) calculate dA = dB * trans(X), scaled by -1 (needs the dB produced in step 1)
+    gemm2::op(dB, X, dA, DType(-1), false, true, stream);
+  }
+};
+
+template<typename xpu, typename laop>
+void TensorsolveBackwardImpl(const Tuple<int>& a_axes_param,
+                             const TBlob& out_grad,
+                             const TBlob& a,
+                             const TBlob& b,
+                             const TBlob& x,
+                             const TBlob& grad_a,
+                             const TBlob& grad_b,
+                             const OpContext& ctx,
+                             const std::vector<OpReqType>& req,
+                             const Tensor<xpu, 1, char>& workspace) {
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  const mxnet::TShape& a_shape = a.shape_;
+  const mxnet::TShape& b_shape = b.shape_;
+  const mxnet::TShape& x_shape = x.shape_;
+
+  if (kNullOp == req[0] && kNullOp == req[1]) { return; }  // neither gradient is requested
+
+  // Zero-size input, no need to launch kernel
+  if (0U == a_shape.Size() || 0U == b_shape.Size()) { return; }
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    if (0 == a_shape.ndim() || 0 == b_shape.ndim()) {
+      // At least 1 scalar. NOTE(review): data is dereferenced on the host below; verify for gpu.
+      CHECK_EQ(a_shape.Size(), 1U)
+        << "a's and b's dimensions don't match";
+      CHECK_EQ(b_shape.Size(), 1U)
+        << "a's and b's dimensions don't match";
+
+      // Carve the workspace: [tensorinv(a) | lu scratch | pivots].
+      DType *tensorinv_a_ptr = reinterpret_cast<DType*>(workspace.dptr_);
+      DType *lu_ptr = reinterpret_cast<DType*>(workspace.dptr_ + a_shape.Size() * sizeof(DType));
+      int *ipiv_ptr = reinterpret_cast<int*>(workspace.dptr_ + 2 * a_shape.Size() * sizeof(DType));
+      TBlob tensorinv_a(tensorinv_a_ptr, a_shape, xpu::kDevMask);
+      TBlob lu(lu_ptr, a_shape, xpu::kDevMask);
+      TBlob ipiv(ipiv_ptr, b_shape, xpu::kDevMask);
+
+      MSHADOW_TYPE_SWITCH(a.type_flag_, AType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, a_shape.Size(),
+          tensorinv_a_ptr,
+          a.dptr<AType>());
+      });
+      // Calculate tensorinv(a)
+      Tensor<xpu, 3, DType> tensorinv_a_tensor =
+        tensorinv_a.get_with_shape<xpu, 3, DType>(Shape3(1, 1, 1), s);
+      Tensor<xpu, 3, DType> lu_tensor =
+        lu.get_with_shape<xpu, 3, DType>(Shape3(1, 1, 1), s);
+      Tensor<xpu, 2, int> ipiv_tensor =
+        ipiv.get_with_shape<xpu, 2, int>(Shape2(1, 1), s);
+      batch_inverse(tensorinv_a_tensor, lu_tensor, ipiv_tensor, ctx);
+
+      MSHADOW_TYPE_SWITCH(x.type_flag_, XType, {
+        DType temp1 = (*(tensorinv_a_tensor.dptr_)) * (*(out_grad.dptr<DType>()));
+        DType temp2 = -temp1 * static_cast<DType>(*x.dptr<XType>());
+        ASSIGN_DISPATCH(*grad_b.dptr<DType>(), req[1], temp1);
+        ASSIGN_DISPATCH(*grad_a.dptr<DType>(), req[0], temp2);
+      });
+    } else {
+      // Both operands have at least one dimension.
+      const int N = b_shape.Size();
+      Tuple<int> a_axes_remained;
+      Tuple<int> a_axes;
+      Tuple<int> a_origin_axes;
+      // Use a_axes to transpose (a_shape) --> (a_reordered_shape).
+      GetReorderedAxes(a_axes_param, &a_axes_remained, &a_axes, a_shape);
+      // Use a_origin_axes to transpose (a_reordered_shape) --> (a_shape).
+      GetOriginAxes(a_shape, a_axes, &a_origin_axes);
+      mxnet::TShape reordered_a_shape = GetReorderedShape(a_shape, a_axes);
+
+      // Carve the workspace: [tensorinv(a) | lu | b | x | grad_a | grad_b | pivots].
+      DType *tensorinv_a_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_);
+      DType *lu_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_ + a_shape.Size() * sizeof(DType));
+      DType *b_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_ + 2 * a_shape.Size() * sizeof(DType));
+      DType *x_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_ + (2 * a_shape.Size() + b_shape.Size()) * sizeof(DType));
+      DType *grad_a_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_ + 2 * (a_shape.Size() + b_shape.Size()) * sizeof(DType));
+      DType *grad_b_ptr = reinterpret_cast<DType*>(
+        workspace.dptr_ + (3 * a_shape.Size() + 2 * b_shape.Size()) * sizeof(DType));
+      int *ipiv_ptr = reinterpret_cast<int*>(
+        workspace.dptr_ + 3 * (a_shape.Size() + b_shape.Size()) * sizeof(DType));
+
+      TBlob tensorinv_a_data(tensorinv_a_ptr, a_shape, xpu::kDevMask);
+      TBlob lu_data(lu_ptr, a_shape, xpu::kDevMask);
+      TBlob b_data(b_ptr, b_shape, xpu::kDevMask);
+      TBlob x_data(x_ptr, x_shape, xpu::kDevMask);
+      TBlob grad_a_data(grad_a_ptr, reordered_a_shape, xpu::kDevMask);
+      TBlob grad_b_data(grad_b_ptr, b_shape, xpu::kDevMask);
+      TBlob ipiv_data(ipiv_ptr, b_shape, xpu::kDevMask);
+      MSHADOW_TYPE_SWITCH(a.type_flag_, AType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, a_shape.Size(),
+          lu_ptr,
+          a.dptr<AType>());
+      });
+      MSHADOW_TYPE_SWITCH(b.type_flag_, BType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, b_shape.Size(),
+          b_ptr,
+          b.dptr<BType>());
+      });
+      MSHADOW_TYPE_SWITCH(x.type_flag_, XType, {
+        mxnet_op::Kernel<mshadow_op::identity_with_cast, xpu>::Launch(
+          s, x_shape.Size(),
+          x_ptr,
+          x.dptr<XType>());
+      });
+      // Reorder a's axes, e.g. lu_data(2, 3, 2, 15, 4, 5) -> tensorinv_a_data(3, 4, 5, 15, 2, 2)
+      tensorinv_a_data = tensorinv_a_data.reshape(reordered_a_shape);
+      mxnet::op::TransposeImpl<xpu>(ctx.run_ctx,
+                                    lu_data,           // src
+                                    tensorinv_a_data,  // res
+                                    mxnet::TShape(a_axes.begin(), a_axes.end()));
+
+      Tensor<xpu, 3, DType> tensorinv_a_tensor =
+        tensorinv_a_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, N), s);
+      Tensor<xpu, 3, DType> lu_tensor =
+        lu_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, N), s);
+      Tensor<xpu, 3, DType> b_tensor =
+        b_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, 1), s);
+      Tensor<xpu, 3, DType> x_tensor =
+        x_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, 1), s);
+      Tensor<xpu, 3, DType> grad_a_tensor =
+        grad_a_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, N), s);
+      Tensor<xpu, 3, DType> grad_b_tensor =
+        grad_b_data.get_with_shape<xpu, 3, DType>(Shape3(1, N, 1), s);
+      Tensor<xpu, 2, int> ipiv_tensor =
+        ipiv_data.get_with_shape<xpu, 2, int>(Shape2(1, N), s);
+
+      // Calculate tensorinv(a).
+      batch_inverse(tensorinv_a_tensor, lu_tensor, ipiv_tensor, ctx);
+      // No need to transpose tensorinv_a
+      // because transpose(tensorinv_a).shape == reordered_a_shape.
+      laop::op(out_grad.get_with_shape<xpu, 3, DType>(x_tensor.shape_, s),
+               tensorinv_a_tensor,
+               b_tensor,
+               x_tensor,
+               grad_a_tensor,
+               grad_b_tensor,
+               ctx);
+      // Undo the reordering, e.g. grad_a_data(3, 4, 5, 15, 2, 2) --> lu_data(2, 3, 2, 15, 4, 5)
+      mxnet::op::TransposeImpl<xpu>(ctx.run_ctx,
+                                    grad_a_data,  // src
+                                    lu_data,      // res
+                                    mxnet::TShape(a_origin_axes.begin(), a_origin_axes.end()));
+
+      MXNET_ASSIGN_REQ_SWITCH(req[1], req_type, {
+        mxnet_op::Kernel<assign_helper<req_type>, xpu>::Launch(
+          s, b_shape.Size(), grad_b_tensor.dptr_, grad_b.dptr<DType>());
+      });
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+        mxnet_op::Kernel<assign_helper<req_type>, xpu>::Launch(
+          s, a_shape.Size(), lu_tensor.dptr_, grad_a.dptr<DType>());
+      });
+    }
+  });
+}
+
+// FCompute entry point of the backward pass: unpacks the blobs, normalizes the axes,
+// requests the temporary workspace and delegates the math to TensorsolveBackwardImpl.
+template<typename xpu, typename laop>
+void TensorsolveOpBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  CHECK_EQ(inputs.size(), 4U);
+  CHECK_EQ(outputs.size(), 2U);
+  CHECK_EQ(req.size(), 2U);
+
+  // inputs: [out_grad, a, b, x]; outputs: [grad_a, grad_b]
+  const TBlob& out_grad = inputs[0];
+  const TBlob& a = inputs[1];
+  const TBlob& b = inputs[2];
+  const TBlob& x = inputs[3];
+  const TBlob& grad_a = outputs[0];
+  const TBlob& grad_b = outputs[1];
+
+  // Work on a private copy of the axes so that negative entries can be rewritten.
+  mxnet::Tuple<int> a_axes_param = nnvm::get<TensorsolveParam>(attrs.parsed).a_axes;
+  FixNegativeAxes(&a_axes_param, a.shape_);
+
+  const size_t workspace_bytes = TensorsolveBackwardWorkspaceSize<xpu>(out_grad, a, b, x);
+  Tensor<xpu, 1, char> workspace = ctx.requested[0].get_space_typed<xpu, 1, char>(
+    Shape1(workspace_bytes), ctx.get_stream<xpu>());
+  TensorsolveBackwardImpl<xpu, laop>(a_axes_param,
+                                     out_grad, a, b, x,
+                                     grad_a, grad_b,
+                                     ctx, req,
+                                     workspace);
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_LINALG_NP_TENSORSOLVE_INL_H_
diff --git a/src/operator/numpy/linalg/np_tensorsolve.cc b/src/operator/numpy/linalg/np_tensorsolve.cc
new file mode 100644
index 000000000000..1dabcdd0eac4
--- /dev/null
+++ b/src/operator/numpy/linalg/np_tensorsolve.cc
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_tensorsolve.cc
+ * \brief CPU implementation placeholder of Tensor Solve Operator
+ */
+#include "./np_tensorsolve-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool TensorsolveOpShape(const nnvm::NodeAttrs& attrs,
+                        mxnet::ShapeVector *in_attrs,
+                        mxnet::ShapeVector *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);  // a, b
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  const mxnet::TShape& a_shape = in_attrs->at(0);
+  const mxnet::TShape& b_shape = in_attrs->at(1);
+  const int a_ndim = a_shape.ndim();
+  const int b_ndim = b_shape.ndim();
+
+  if (!ndim_is_known(a_shape) || !ndim_is_known(b_shape)) {
+    return false;
+  }
+
+  if (0 == a_ndim && 0 == b_ndim) {
+    // a and b are scalars
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, b_shape);
+  } else if (0 == a_ndim && 0 != b_ndim) {
+    // a is scalar, b is tensor
+    CHECK_EQ(b_shape.Size(), 1U)
+      << "a's and b's dimensions don't match";
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, a_shape);
+  } else if (0 != a_ndim && 0 == b_ndim) {
+    // a is tensor, b is scalar
+    CHECK_EQ(a_shape.Size(), 1U)
+      << "a's and b's dimensions don't match";
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, a_shape);
+  } else {
+    // a and b have at least 1 dimension each.
+    const TensorsolveParam& param = nnvm::get<TensorsolveParam>(attrs.parsed);
+    mxnet::Tuple<int> a_axes_param = param.a_axes;
+    FixNegativeAxes(&a_axes_param, a_shape);
+
+    mxnet::Tuple<int> a_axes_remained;
+    mxnet::Tuple<int> a_axes;
+    GetReorderedAxes(a_axes_param, &a_axes_remained, &a_axes, a_shape);
+    mxnet::TShape a_transpose_shape = GetReorderedShape(a_shape, a_axes);
+
+    // Calculate output shape: the axes of the reordered a from index `temp` onwards.
+    const int temp = a_ndim > b_ndim ? b_ndim : b_ndim - a_ndim;
+    int prod_front = 1, prod_back = 1;
+    mxnet::TShape out_shape(a_ndim - temp > 0 ? a_ndim - temp : 0, -1);
+    for (int i = 0; i < a_ndim; ++i) {
+      if (i < temp) {
+        prod_front *= a_transpose_shape[i];
+      } else {
+        prod_back *= a_transpose_shape[i];
+        out_shape[i - temp] = a_transpose_shape[i];
+      }
+    }
+    CHECK_EQ(prod_front, prod_back) << "a shape must be square.";
+    CHECK_EQ(prod_back, b_shape.Size()) << "a's and b's dimensions don't match";
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, out_shape);
+  }
+
+  return shape_is_known(*in_attrs) && shape_is_known(*out_attrs);
+}
+
+// Type inference: float16 inputs are rejected; the output is float32 only when both inputs
+// are float32, and float64 for every other combination.
+inline bool TensorsolveOpType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int>* in_attrs,
+                              std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const int a_type = in_attrs->at(0);
+  const int b_type = in_attrs->at(1);
+  CHECK_NE(a_type, mshadow::kFloat16) << "array type float16 is unsupported in linalg";
+  CHECK_NE(b_type, mshadow::kFloat16) << "array type float16 is unsupported in linalg";
+  const bool both_float32 = (mshadow::kFloat32 == a_type) && (mshadow::kFloat32 == b_type);
+  if (both_float32) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, b_type);
+  } else {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat64);
+  }
+  return -1 != out_attrs->at(0);
+}
+
+DMLC_REGISTER_PARAMETER(TensorsolveParam);
+
+NNVM_REGISTER_OP(_npi_tensorsolve)  // forward operator
+.set_attr_parser(mxnet::op::ParamParser<TensorsolveParam>)
+.set_num_inputs(2)   // a, b
+.set_num_outputs(1)  // x
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"a", "b"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", TensorsolveOpShape)
+.set_attr<nnvm::FInferType>("FInferType", TensorsolveOpType)
+.set_attr<FResourceRequest>("FResourceRequest",  // temp space backs the forward workspace
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>(1, ResourceRequest::kTempSpace);
+  })
+.set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+.set_attr<FCompute>("FCompute<cpu>", TensorsolveOpForward<cpu, tensorsolve>)
+.set_attr<nnvm::FGradient>("FGradient",
+  mxnet::op::ElemwiseGradUseInOut{"_backward_npi_tensorsolve"})
+.add_argument("a", "NDArray-or-Symbol", "First input")
+.add_argument("b", "NDArray-or-Symbol", "Second input")
+.add_arguments(TensorsolveParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_npi_tensorsolve)
+.set_attr_parser(mxnet::op::ParamParser<TensorsolveParam>)
+.set_num_inputs(4)   // out_grad, a, b, x (the order read by TensorsolveOpBackward)
+.set_num_outputs(2)  // grad_a, grad_b
+.set_attr<FResourceRequest>("FResourceRequest",  // temp space backs the backward workspace
+  [](const NodeAttrs& ){
+    return std::vector<ResourceRequest>{1, ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", TensorsolveOpBackward<cpu, tensorsolve_backward>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/linalg/np_tensorsolve.cu b/src/operator/numpy/linalg/np_tensorsolve.cu
new file mode 100644
index 000000000000..07e2121750d5
--- /dev/null
+++ b/src/operator/numpy/linalg/np_tensorsolve.cu
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_tensorsolve.cu
+ * \brief GPU implementation placeholder of Tensor Solve Operator
+ */
+
+#include <mxnet/operator_util.h>
+#include "./np_tensorsolve-inl.h"
+
+namespace mxnet {
+namespace op {
+
+#if MXNET_USE_CUSOLVER == 1  // GPU kernels are registered only when cuSOLVER is enabled
+
+NNVM_REGISTER_OP(_npi_tensorsolve)
+.set_attr<FCompute>("FCompute<gpu>", TensorsolveOpForward<gpu, tensorsolve>);
+
+NNVM_REGISTER_OP(_backward_npi_tensorsolve)
+.set_attr<FCompute>("FCompute<gpu>", TensorsolveOpBackward<gpu, tensorsolve_backward>);
+
+#endif  // MXNET_USE_CUSOLVER == 1
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h
index 7d0025a62ad2..0efe2c2aa3df 100644
--- a/src/operator/numpy/np_broadcast_reduce_op.h
+++ b/src/operator/numpy/np_broadcast_reduce_op.h
@@ -86,6 +86,21 @@ struct NumpyReduceAxesNoDTypeParam : public dmlc::Parameter<NumpyReduceAxesNoDTy
   }
 };
 
+/*! \brief Parameters shared by the boolean-output reduce operators. */
+struct NumpyReduceAxesBoolParam : public dmlc::Parameter<NumpyReduceAxesBoolParam> {
+  dmlc::optional<mxnet::Tuple<int>> axis;
+  bool keepdims;
+  DMLC_DECLARE_PARAMETER(NumpyReduceAxesBoolParam) {
+    DMLC_DECLARE_FIELD(axis).set_default(dmlc::optional<mxnet::Tuple<int>>())
+      .describe("Axis or axes along which the logical reduction is performed. The default, "
+                "axis=None, reduces over all of the elements of the input array. If axis is "
+                "negative it counts from the last to the first axis.");
+    DMLC_DECLARE_FIELD(keepdims).set_default(false)
+      .describe("If this is set to `True`, the reduced axes are left "
+                "in the result as dimensions with size one.");
+  }
+};
+
 inline TShape NumpyReduceAxesShapeImpl(const TShape& ishape,
                                        const dmlc::optional<mxnet::Tuple<int>>& axis,
                                        bool keepdims) {
@@ -173,6 +188,20 @@ inline bool NumpyReduceAxesShape(const nnvm::NodeAttrs& attrs,
   return shape_is_known(out_attrs->at(0));
 }
 
+/*! \brief Shape inference for boolean reductions: one input shape in, one reduced shape out. */
+inline bool NumpyReduceAxesBoolShape(const nnvm::NodeAttrs& attrs,
+                                     std::vector<TShape> *in_attrs,
+                                     std::vector<TShape> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const TShape& ishape = in_attrs->at(0);
+  // Nothing can be inferred until the input shape is fully known.
+  if (!shape_is_known(ishape)) return false;
+  const auto& param = nnvm::get<NumpyReduceAxesBoolParam>(attrs.parsed);
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, NumpyReduceAxesShapeImpl(ishape, param.axis, param.keepdims));
+  return shape_is_known(out_attrs->at(0));
+}
+
 inline bool NumpyReduceAxesNoDTypeShape(const nnvm::NodeAttrs& attrs,
                                         std::vector<TShape> *in_attrs,
                                         std::vector<TShape> *out_attrs) {
@@ -298,6 +327,30 @@ void NumpyReduceAxesNoDTypeCompute(const nnvm::NodeAttrs& attrs,
   ReduceAxesComputeImpl<xpu, reducer, false, false, OP>(ctx, inputs, req, outputs, small);
 }
 
+/*!
+ * \brief FCompute of the boolean-output reductions; writes a bool tensor to outputs[0].
+ * NOTE(review): empty inputs are always filled with `false`, whatever the reducer -- confirm.
+ */
+template<typename xpu, typename reducer, typename OP = op::mshadow_op::NonZero>
+void NumpyReduceAxesBoolCompute(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs) {
+  const auto& param = nnvm::get<NumpyReduceAxesBoolParam>(attrs.parsed);
+  mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
+  const size_t out_size = outputs[0].shape_.Size();
+  if (inputs[0].shape_.Size() == 0 && out_size != 0) {
+    mxnet_op::Kernel<mxnet_op::set_false, xpu>::Launch(s, out_size, outputs[0].dptr<bool>());
+    return;
+  }
+  if (param.axis.has_value() && param.axis.value().ndim() == 0) {
+    UnaryOp::IdentityCompute<xpu>(attrs, ctx, inputs, req, outputs);
+  }
+  const TShape small = param.keepdims ? outputs[0].shape_ :
+                       NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true);
+  ReduceAxesComputeBoolImpl<xpu, reducer, false, false, OP>(ctx, inputs, req, outputs, small);
+}
 
 template<typename xpu, bool normalize = false>
 inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs,
diff --git a/src/operator/numpy/np_broadcast_reduce_op_boolean.cc b/src/operator/numpy/np_broadcast_reduce_op_boolean.cc
new file mode 100644
index 000000000000..7529c0d4e1d3
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_boolean.cc
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_boolean.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on boolean.
+ */
+
+#include "./np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+/*! \brief Type inference for boolean reductions: the output dtype is always bool. */
+inline bool NumpyReduceAxesBoolType(const nnvm::NodeAttrs& attrs,
+                                    std::vector<int> *in_attrs, std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kBool);
+  return !(out_attrs->at(0) == -1 || in_attrs->at(0) == -1);
+}
+
+DMLC_REGISTER_PARAMETER(NumpyReduceAxesBoolParam);
+
+NNVM_REGISTER_OP(_np_any)
+.set_attr_parser(ParamParser<NumpyReduceAxesBoolParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+.set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesBoolShape)
+.set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesBoolType)
+.set_attr<FCompute>("FCompute<cpu>", NumpyReduceAxesBoolCompute<cpu,
+  mshadow_op::sum, mshadow_op::NonZero>)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(NumpyReduceAxesBoolParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_np_all)
+.set_attr_parser(ParamParser<NumpyReduceAxesBoolParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+.set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesBoolShape)
+.set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesBoolType)
+.set_attr<FCompute>("FCompute<cpu>", NumpyReduceAxesBoolCompute<cpu,
+  mshadow_op::product, mshadow_op::NonZero>)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(NumpyReduceAxesBoolParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_boolean.cu b/src/operator/numpy/np_broadcast_reduce_op_boolean.cu
new file mode 100644
index 000000000000..2c206bf88b2f
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_boolean.cu
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_boolean.cu
+ * \brief GPU Implementation of broadcast and reduce functions based on boolean.
+ */
+
+#include "./np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_np_any)
+.set_attr<FCompute>("FCompute<gpu>", NumpyReduceAxesBoolCompute<gpu,
+  mshadow_op::sum, mshadow_op::NonZero>);
+
+NNVM_REGISTER_OP(_np_all)
+.set_attr<FCompute>("FCompute<gpu>", NumpyReduceAxesBoolCompute<gpu,
+  mshadow_op::product, mshadow_op::NonZero>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc
index 2a1bc5261701..cf92da52d1f8 100644
--- a/src/operator/numpy/np_broadcast_reduce_op_value.cc
+++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc
@@ -161,6 +161,7 @@ inline bool NumpyReduceAxesNoDTypeType(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_np_max)
+.add_alias("_np_amax")
 .describe(R"code()code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
diff --git a/src/operator/numpy/np_delete_op-inl.h b/src/operator/numpy/np_delete_op-inl.h
new file mode 100644
index 000000000000..a144833f3294
--- /dev/null
+++ b/src/operator/numpy/np_delete_op-inl.h
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_delete_op-inl.h
+ * \brief Function definition of delete operators
+ */
+#ifndef MXNET_OPERATOR_NUMPY_NP_DELETE_OP_INL_H_
+#define MXNET_OPERATOR_NUMPY_NP_DELETE_OP_INL_H_
+
+#include <vector>
+#include <memory>
+#include <algorithm>
+#include "../../common/utils.h"
+#include "../tensor/sort_op.h"
+#include "../tensor/init_op.h"
+#include "../operator_common.h"
+#include "../mxnet_op.h"
+#include "../tensor/broadcast_reduce_op.h"
+#ifdef __CUDACC__
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#endif
+
+namespace mxnet {
+namespace op {
+
+struct NumpyDeleteParam : public dmlc::Parameter<NumpyDeleteParam> {
+  dmlc::optional<int> start;    // slice.start when 'obj' is a slice
+  dmlc::optional<int> stop;     // slice.stop when 'obj' is a slice
+  dmlc::optional<int> step;     // slice.step; having a value marks 'obj' as a slice
+  dmlc::optional<int> int_ind;  // the index when 'obj' is a single integer
+  dmlc::optional<int> axis;     // axis to delete along; unset means 'arr' is flattened first
+  DMLC_DECLARE_PARAMETER(NumpyDeleteParam) {
+    DMLC_DECLARE_FIELD(start)
+    .set_default(dmlc::optional<int>())
+    .describe("If 'obj' is slice, 'start' is one of its arguments.");
+    DMLC_DECLARE_FIELD(stop)
+    .set_default(dmlc::optional<int>())
+    .describe("If 'obj' is slice, 'stop' is one of its arguments.");
+    DMLC_DECLARE_FIELD(step)
+    .set_default(dmlc::optional<int>())
+    .describe("If 'obj' is slice, 'step' is one of its arguments.");
+    DMLC_DECLARE_FIELD(int_ind)
+    .set_default(dmlc::optional<int>())
+    .describe("If 'obj' is int, 'int_ind' is the index of the sub-array "
+              "to delete along 'axis'.");
+    DMLC_DECLARE_FIELD(axis)
+    .set_default(dmlc::optional<int>())
+    .describe("Axis along which to delete. If not given, 'arr' is flattened first.");
+  }
+};
+
+namespace delete_ {
+
+enum DeleteOpInputs {kArr, kObj};
+enum DeleteOpOutputs {kOut};
+}  // namespace delete_
+
+struct SliceToIndices {
+  /*!
+   * \brief expand a slice into its indices: indices[i] = start + i * step
+   */
+  template<typename IType>
+  MSHADOW_XINLINE static void Map(int i, IType* indices, int start, int step) {
+    indices[i] = start + i * step;
+  }
+};
+
+struct IsDeleteCal {
+  /*!
+   * \brief mark which positions along the axis have to be deleted from the input
+   * \param N size of the axis; indices outside [0, N) are skipped
+   * \param is_delete if is_delete[i] == false, position i is kept in the output
+   *                  if is_delete[i] == true, position i is removed from the output
+   * \param indices the indices to be deleted
+   */
+  template<typename IType>
+  MSHADOW_XINLINE static void Map(int i, int N, bool* is_delete, const IType* indices) {
+    if ((indices[i] >= 0) && (indices[i] < N)) {
+      is_delete[static_cast<int>(indices[i])] = true;
+    }
+  }
+};
+
+struct OutPosCal {
+  /*!
+   * \brief map each kept index of the input to its index in the output, e.g.
+   * \example original_position 0 1 2 3 4
+   *          is_delete         F T T F F
+   *          out_position      0 - - 1 2
+   */
+  MSHADOW_XINLINE static void Map(int i, int64_t* out_pos, const bool* is_delete) {
+    if (!is_delete[i]) {
+      int cnt = 0;
+      for (int j = 0; j < i; ++j) {
+        if (!is_delete[j]) {
+          cnt++;
+        }
+      }
+      out_pos[i] = cnt;
+    }
+  }
+};
+
+template<int req, int ndim>
+struct DeleteKernel {
+  /*!
+   * \brief delete sub-arrays from the input along an axis according to 'is_delete'.
+   * \param out_data - output: a new array with sub-arrays along an axis deleted.
+   * \param in_arr - input: 'arr', original array.
+   * \param is_delete - marks which positions of 'arr' along 'axis' are deleted or retained.
+   * \param out_pos - if is_delete[i] is 'false', out_pos[i] is its position in the output.
+   * \param arrshape - the shape of 'arr'.
+   * \param out_stride - the stride of 'out_data'.
+   * \param axis - delete sub-array along this axis
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+                                  const DType* in_arr,
+                                  const bool* is_delete,
+                                  const int64_t* out_pos,
+                                  const mshadow::Shape<ndim> arrshape,
+                                  const mshadow::Shape<ndim> out_stride,
+                                  const int axis) {
+    // unravel the flat index i into coordinates within in_arr's shape
+    mshadow::Shape<ndim> arr_idx = mxnet_op::unravel(i, arrshape);
+    if (!is_delete[arr_idx[axis]]) {
+      arr_idx[axis] = out_pos[arr_idx[axis]];
+      int64_t dest_idx = mxnet_op::dot(arr_idx, out_stride);
+      KERNEL_ASSIGN(out_data[dest_idx], req, in_arr[i]);
+    }
+  }
+};
+
+/*!
+ * \brief mirrors Python's slice.indices(range) and also counts the selected indices
+ * \param pstart - slice.start
+ * \param pstop - slice.stop
+ * \param pstep - slice.step, defaults to 1 and must not be 0
+ * \param range - length of the axis the slice is applied to
+ * \param start, stop, step - outputs: start/stop/step of slice.indices(range)
+ * \param tot - output: total number of indices in the slice, 0 when the slice is empty
+ */
+inline void SliceIndices(const dmlc::optional<int>& pstart,
+                         const dmlc::optional<int>& pstop,
+                         const dmlc::optional<int>& pstep,
+                         const int range,
+                         int* start, int* stop, int* step,
+                         size_t* tot) {
+  *step = pstep.has_value() ? pstep.value() : 1;
+  CHECK_NE(*step, 0) << "'step' can not equal to 0.";
+  if (pstop.has_value()) {
+    *stop = pstop.value();
+    *stop += (*stop < 0) ? range : 0;
+    *stop = (*stop < 0) ? ((*step < 0) ? -1 : 0) : *stop;
+    *stop = (*stop >= range) ? ((*step < 0) ? range - 1 : range) : *stop;
+  } else {
+    *stop = (*step > 0) ? range : -1;
+  }
+  if (pstart.has_value()) {
+    *start = pstart.value();
+    *start += (*start < 0) ? range : 0;
+    *start = (*start < 0) ? ((*step < 0) ? -1 : 0) : *start;
+    *start = (*start >= range) ? ((*step < 0) ? range - 1 : range) : *start;
+  } else {
+    *start = (*step > 0) ? 0 : range - 1;
+  }
+  *tot = 0;  // always written, so an empty slice does not depend on the caller's initial value
+  if (*step > 0 && *stop >= *start) {
+    *tot = static_cast<size_t>((*stop - *start + *step - 1) / *step);
+  } else if (*step < 0 && *stop <= *start) {
+    *tot = static_cast<size_t>((*stop - *start + *step + 1) / *step);
+  }
+}
+
+template<typename xpu>
+void NumpyDeleteCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext &ctx,
+                        const std::vector<NDArray> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<NDArray> &outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+
+  const NumpyDeleteParam& param = nnvm::get<NumpyDeleteParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(),
+          (param.step.has_value() || param.int_ind.has_value()) ? 1U : 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+
+  int ndim = inputs[delete_::kArr].shape().ndim();
+  int axis = param.axis.has_value() ? param.axis.value() : -1;
+  NDArray arr;  // original array, flattened to 1-D when no axis is given
+
+  if (!param.axis.has_value()) {
+    arr = inputs[delete_::kArr].Reshape(Shape1(inputs[delete_::kArr].shape().Size()));
+    ndim = 1;
+    axis = -1;
+  } else {
+    arr = inputs[delete_::kArr];
+  }
+
+  if (ndim == 0) {
+    const_cast<NDArray &>(outputs[delete_::kOut]).Init(arr.shape());
+    mxnet_op::copy(s, outputs[delete_::kOut].data(), inputs[delete_::kArr].data());
+    return;
+  }
+
+  axis = CheckAxis(axis, ndim);
+  int N = (arr.shape())[axis];
+  mxnet::TShape outshape(arr.shape());
+  // if obj is a slice, these hold its normalized arguments
+  int start = 0, stop = 0, step = 0;
+  // total number of indices to be deleted
+  size_t numtodel = 0;
+  // if obj is a scalar, index is its value
+  int index = 0;
+
+  if (param.step.has_value()) {  // obj is slice
+    SliceIndices(param.start, param.stop, param.step,
+                 N, &start, &stop, &step, &numtodel);
+    if (numtodel == 0) {
+      const_cast<NDArray &>(outputs[delete_::kOut]).Init(arr.shape());
+      mxnet_op::copy(s, outputs[delete_::kOut].data(), inputs[delete_::kArr].data());
+      return;
+    }
+    outshape[axis] -= numtodel;
+    const_cast<NDArray &>(outputs[delete_::kOut]).Init(outshape);
+  } else if (param.int_ind.has_value()) {  // obj is scalar
+    index = param.int_ind.value();
+    CHECK((index >= -1 * N) && (index < N))
+      << "index " << index
+      << " is out of bounds for axis " << axis
+      << " with size " << N << "\n";
+    index += ((index < 0) ? N : 0);
+    numtodel = static_cast<size_t>(1);
+    outshape[axis] -= 1;
+    const_cast<NDArray &>(outputs[delete_::kOut]).Init(outshape);
+  } else {  // obj is tensor
+    numtodel = inputs[delete_::kObj].shape().Size();
+  }
+
+  char* out_pos_ptr = NULL;
+  char* indices_ptr = NULL;
+  char* is_delete_ptr = NULL;
+  MSHADOW_TYPE_SWITCH(((inputs.size() == 2U) ?  // obj is tensor
+                      inputs[delete_::kObj].dtype() :
+                      mshadow::DataType<int64_t>::kFlag), IType, {
+    size_t temp_mem_size = sizeof(int64_t) * arr.shape()[axis] +
+                           sizeof(IType) * numtodel +
+                           sizeof(bool) * arr.shape()[axis];
+    Tensor<xpu, 1, char> temp_mem =
+      ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(temp_mem_size), s);
+    out_pos_ptr = temp_mem.dptr_;
+    indices_ptr = out_pos_ptr + sizeof(int64_t) * arr.shape()[axis];
+    is_delete_ptr = indices_ptr + sizeof(IType) * numtodel;
+    if (param.step.has_value()) {  // obj is slice, expand the slice into an index tensor
+      Kernel<SliceToIndices, xpu>::Launch(
+        s, numtodel, reinterpret_cast<IType*>(indices_ptr), start, step);
+    } else if (param.int_ind.has_value()) {  // obj is scalar, copy it to tensor
+      Fill(s, TBlob(reinterpret_cast<IType*>(indices_ptr),
+           Shape1(numtodel), xpu::kDevMask), kWriteTo, index);
+    } else {  // obj is tensor, copy it to a unified tensor
+      mxnet_op::copy(s,
+        TBlob(reinterpret_cast<IType*>(indices_ptr), inputs[delete_::kObj].shape(),
+              inputs[delete_::kObj].data().dev_mask()),
+        inputs[delete_::kObj].data());
+    }
+    mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(
+      s, arr.shape()[axis], reinterpret_cast<bool*>(is_delete_ptr));
+    // mark which positions need to be deleted from input arr
+    Kernel<IsDeleteCal, xpu>::Launch(
+      s, numtodel, N, reinterpret_cast<bool*>(is_delete_ptr),
+      reinterpret_cast<IType*>(indices_ptr));
+    // calculate, for every kept position of input arr, its position in the output
+    Kernel<OutPosCal, xpu>::Launch(
+      s, arr.shape()[axis], reinterpret_cast<int64_t*>(out_pos_ptr),
+      reinterpret_cast<bool*>(is_delete_ptr));
+  });
+
+  if (inputs.size() == 2U) {  // obj is tensor
+    // count the marked positions: duplicate indices in obj mark a position only once
+    #ifdef __CUDACC__
+      thrust::device_ptr<bool>is_delete_dev(reinterpret_cast<bool*>(is_delete_ptr));
+      thrust::device_vector<bool>vec_is_delete(is_delete_dev, is_delete_dev + arr.shape()[axis]);
+    #else
+      std::vector<bool>vec_is_delete(reinterpret_cast<bool*>(is_delete_ptr),
+                                     reinterpret_cast<bool*>(is_delete_ptr) + arr.shape()[axis]);
+    #endif
+    numtodel = 0;
+    for (int i = 0; i < arr.shape()[axis]; ++i) {
+      if (vec_is_delete[i]) {
+        numtodel++;
+      }
+    }
+    outshape[axis] -= numtodel;
+    const_cast<NDArray &>(outputs[delete_::kOut]).Init(outshape);
+  }
+
+  MSHADOW_TYPE_SWITCH(((inputs.size() == 2U) ?  // obj is tensor
+                      inputs[delete_::kObj].dtype() :
+                      mshadow::DataType<int64_t>::kFlag), IType, {
+    MXNET_NDIM_SWITCH(outshape.ndim(), ndim, {
+      mshadow::Shape<ndim> out_strides = mxnet_op::calc_stride(outshape.get<ndim>());
+      MSHADOW_TYPE_SWITCH(outputs[delete_::kOut].dtype(), DType, {
+        MXNET_ASSIGN_REQ_SWITCH(req[delete_::kOut], req_type, {
+          Kernel<DeleteKernel<req_type, ndim>, xpu>::Launch(
+            s, arr.shape().Size(),
+            outputs[delete_::kOut].data().dptr<DType>(),
+            arr.data().dptr<DType>(),
+            reinterpret_cast<bool*>(is_delete_ptr),
+            reinterpret_cast<int64_t*>(out_pos_ptr),
+            arr.shape().get<ndim>(),
+            out_strides, axis);
+        });
+      });
+    });
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_NP_DELETE_OP_INL_H_
diff --git a/src/operator/numpy/np_delete_op.cc b/src/operator/numpy/np_delete_op.cc
new file mode 100644
index 000000000000..48840bf9d230
--- /dev/null
+++ b/src/operator/numpy/np_delete_op.cc
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_delete_op.cc
+ * \brief CPU Implementation of numpy delete operations
+ */
+
+#include <vector>
+#include "./np_delete_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(NumpyDeleteParam);
+
+/*!
+ * \brief Type inference for _npi_delete: the output takes the dtype of 'arr' (input 0).
+ * NOTE: the dtype of a tensor 'obj' is not validated here.
+ */
+bool NumpyDeleteType(const nnvm::NodeAttrs& attrs,
+                     std::vector<int> *in_type,
+                     std::vector<int> *out_type) {
+  const NumpyDeleteParam& param = nnvm::get<NumpyDeleteParam>(attrs.parsed);
+  // 'obj' is a second tensor input only when it is neither a slice nor a single integer.
+  const size_t insize = (param.step.has_value() || param.int_ind.has_value()) ? 1U : 2U;
+  CHECK_EQ(in_type->size(), insize);
+  CHECK_EQ(out_type->size(), 1U);
+  // The former `insize == 3` index-type check could never run (insize is 1 or 2) and was dropped.
+  TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);
+  TYPE_ASSIGN_CHECK(*in_type, 0, (*out_type)[0]);
+  return (*in_type)[0] != -1;
+}
+
+/*! \brief Storage inference for _npi_delete: dense in, dense out, dispatched to FComputeEx. */
+inline bool NumpyDeleteStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  const auto& param = nnvm::get<NumpyDeleteParam>(attrs.parsed);
+  const bool obj_is_attribute = param.step.has_value() || param.int_ind.has_value();
+  const unsigned int insize = obj_is_attribute ? 1U : 2U;
+  CHECK_EQ(in_attrs->size(), insize);
+  CHECK_EQ(out_attrs->size(), 1U);
+  for (const int attr : *in_attrs) {
+    CHECK_EQ(attr, kDefaultStorage) << "Only default storage is supported";
+  }
+  for (int& out_stype : *out_attrs) out_stype = kDefaultStorage;
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  return true;
+}
+
+NNVM_REGISTER_OP(_npi_delete)
+.describe(R"code(Delete sub-arrays along the given axis at the given indices.)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<NumpyDeleteParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {  // 'obj' is an input only when it is a tensor
+  const NumpyDeleteParam& params = nnvm::get<NumpyDeleteParam>(attrs.parsed);
+  return (params.step.has_value() || params.int_ind.has_value()) ? 1U : 2U;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    const NumpyDeleteParam& params = nnvm::get<NumpyDeleteParam>(attrs.parsed);
+    return (params.step.has_value() || params.int_ind.has_value()) ?
+            std::vector<std::string>{"arr"} :
+            std::vector<std::string>{"arr", "obj"};
+})
+.set_attr<nnvm::FInferType>("FInferType", NumpyDeleteType)
+.set_attr<mxnet::FComputeEx>("FComputeEx<cpu>", NumpyDeleteCompute<cpu>)
+.set_attr<FInferStorageType>("FInferStorageType", NumpyDeleteStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {  // scratch space for the index, mask and position buffers
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.add_argument("arr", "NDArray-or-Symbol", "Input ndarray")
+.add_argument("obj", "NDArray-or-Symbol", "Indices to delete, when 'obj' is a tensor")
+.add_arguments(NumpyDeleteParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_delete_op.cu b/src/operator/numpy/np_delete_op.cu
new file mode 100644
index 000000000000..599d01788138
--- /dev/null
+++ b/src/operator/numpy/np_delete_op.cu
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_delete_op.cu
+ * \brief GPU Implementation of numpy delete operations
+ */
+
+#include "./np_delete_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_delete)
+.set_attr<FComputeEx>("FComputeEx<gpu>", NumpyDeleteCompute<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index 7ca594d21e59..9ca291391c5c 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -404,6 +404,8 @@ IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::smooth_l1_gradient);  // NO
 IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::lcm);  // NOLINT()
 IMPLEMENT_BLANK_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mxnet_op::set_to_int<0>);  // NOLINT()
 IMPLEMENT_BLANK_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mxnet_op::set_to_int<1>);  // NOLINT()
+IMPLEMENT_BLANK_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mxnet_op::set_to_bool<false>);  // NOLINT()
+IMPLEMENT_BLANK_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mxnet_op::set_to_bool<true>);  // NOLINT()
 IMPLEMENT_BLANK_WORKLOAD_FWD(mxnet::op::PopulateFullIdxRspKernel);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::ldexp);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rldexp);  // NOLINT()
diff --git a/src/operator/quantization/quantized_elemwise_mul-inl.h b/src/operator/quantization/quantized_elemwise_mul-inl.h
new file mode 100644
index 000000000000..f58db8a45eea
--- /dev/null
+++ b/src/operator/quantization/quantized_elemwise_mul-inl.h
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file quantized_elemwise_mul-inl.h
+ * \brief CPU Implementation of basic elementwise binary mul operators
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+
+#include "../tensor/elemwise_binary_op-inl.h"
+
+namespace mxnet {
+namespace op {
+/* This structure is only used for requantization when fusion is applied */
+struct QuantizeElemwiseMulParam : public dmlc::Parameter<QuantizeElemwiseMulParam> {
+  dmlc::optional<float> min_calib_range;
+  dmlc::optional<float> max_calib_range;
+  bool enable_float_output;
+  DMLC_DECLARE_PARAMETER(QuantizeElemwiseMulParam) {
+    DMLC_DECLARE_FIELD(min_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The minimum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(max_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The maximum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
+    .describe("Whether to enable float32 output");
+  }
+};
+
+namespace quantized_elemwise_mul {
+enum QuantizedElemwiseMulOpInputs {kLhs, kRhs, kLhsMin, kLhsMax, kRhsMin, kRhsMax};
+enum QuantizedElemwiseMulOpOutputs {kOut, kOutMin, kOutMax};
+enum QuantizedElemwiseMulOpResource {kTempSpace};
+}  // namespace quantized_elemwise_mul
+
+
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
diff --git a/src/operator/quantization/quantized_elemwise_mul.cc b/src/operator/quantization/quantized_elemwise_mul.cc
new file mode 100644
index 000000000000..a752c14837a6
--- /dev/null
+++ b/src/operator/quantization/quantized_elemwise_mul.cc
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file quantized_elemwise_mul.cc
+ * \brief CPU Implementation of basic elementwise binary mul operators
+ */
+#include <mxnet/op_attr_types.h>
+#include "../tensor/elemwise_binary_op-inl.h"
+#include "./quantized_elemwise_mul-inl.h"
+#include "./quantization_utils.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(QuantizeElemwiseMulParam);
+
+// Output names: only "output" with float output enabled, otherwise output plus min/max.
+static std::vector<std::string> QuantizedElemwiseMulOutputNames(const NodeAttrs &attrs) {
+  const auto &param = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  return param.enable_float_output
+             ? std::vector<std::string>{"output"}
+             : std::vector<std::string>{"output", "min_output", "max_output"};
+}
+
+// Shape inference: lhs and rhs must have identical shapes (broadcast is unsupported), every
+// min/max input and output is assigned mxnet::TShape(1, 1), and the output mirrors lhs.
+inline bool QuantizedElemwiseMulOpShape(const nnvm::NodeAttrs& attrs,
+                                        mxnet::ShapeVector *in_attrs,
+                                        mxnet::ShapeVector *out_attrs) {
+  using namespace mshadow;
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  const mxnet::TShape &lshape = (*in_attrs)[quantized_elemwise_mul::kLhs];
+  const mxnet::TShape &rshape = (*in_attrs)[quantized_elemwise_mul::kRhs];
+  if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false;
+  CHECK_EQ(lshape.ndim(), rshape.ndim())
+    << "Currently, quantized elemwise multiply doesn't support broadcast.";
+  for (int i = 0; i < lshape.ndim(); ++i) { CHECK_EQ(lshape[i], rshape[i]); }
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMin, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMax, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMin, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMax, mxnet::TShape(1, 1));
+  // out_attrs is indexed by SHAPE_ASSIGN_CHECK below, so it must not be clear()ed first.
+  CHECK_EQ(out_attrs->size(), params.enable_float_output ? 1U : 3U);
+  SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOut, lshape);
+  if (!params.enable_float_output) {
+    SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMin, mxnet::TShape(1, 1));
+    SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMax, mxnet::TShape(1, 1));
+  }
+  return true;
+}
+
+// Type inference: min/max inputs are float32; the output is float32 with float output
+// enabled, int8 when both calibration ranges are set, and int32 otherwise.
+inline bool QuantizedElemwiseMulOpType(const nnvm::NodeAttrs& attrs,
+                                       std::vector<int> *in_type,
+                                       std::vector<int> *out_type) {
+  const auto &param = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  for (int idx = 0; idx < 2; ++idx) {
+    if (in_type->at(idx) != mshadow::kInt8) {
+      LOG(ERROR) << "currently, quantized elemwise mul only support int8 inputs.";
+    } else {
+      TYPE_ASSIGN_CHECK(*in_type, idx, mshadow::kInt8);
+    }
+  }
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMin, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMax, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMin, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMax, mshadow::kFloat32);
+
+  if (param.enable_float_output) {
+    TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, mshadow::kFloat32);
+    return true;
+  }
+  const bool calibrated = param.max_calib_range.has_value() && param.min_calib_range.has_value();
+  const int out_dtype = calibrated ? mshadow::kInt8 : mshadow::kInt32;
+  TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, out_dtype);
+  TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMin, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMax, mshadow::kFloat32);
+  return true;
+}
+
+// Storage type inference: the operator always dispatches through FCompute and assigns
+// kDefaultStorage to every input and output. Since each entry is overwritten, there is
+// nothing left to validate afterwards and inference always succeeds.
+inline bool QuantizedElemwiseMulOpStorageType(const nnvm::NodeAttrs& attrs,
+                                              int dev_mask,
+                                              DispatchMode* dispatch_mode,
+                                              std::vector<int> *in_attrs,
+                                              std::vector<int> *out_attrs) {
+  using namespace common;
+  *dispatch_mode = DispatchMode::kFCompute;
+
+  // Force default storage on all outputs ...
+  for (auto &v : *out_attrs) {
+    v = kDefaultStorage;
+  }
+
+  // ... and on all inputs.
+  for (auto &v : *in_attrs) {
+    v = kDefaultStorage;
+  }
+
+  return true;
+}
+
+/*! \brief out[i] = static_cast<OType>(lhs[i] * rhs[i] * scale) for every i in [0, size). */
+template <typename OType>
+static inline void QuantizedElemwiseMulKernel(const int8_t *lhs, const int8_t *rhs,
+                                              OType *out, const size_t size,
+                                              const float scale) {
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+  for (size_t i = 0; i < size; ++i) {
+    const int8_t a = lhs[i];
+    const int8_t b = rhs[i];
+    out[i] = static_cast<OType>(a * b * scale);
+  }
+}
+
+/*!
+ * \brief Forward pass (CPU) of _contrib_quantized_elemwise_mul.
+ *
+ * Multiplies the int8 lhs/rhs tensors elementwise and scales each product by out_scale:
+ *   - enable_float_output: out_scale = 1 / lhs_scale / rhs_scale, output accessed as float_t;
+ *   - both calibration ranges set: out_scale = out_data_scale / lhs_scale / rhs_scale, the
+ *     output is int8 and the calibration range is written to the min/max outputs;
+ *   - otherwise: out_scale stays 1, the output is int32 and the min/max outputs are filled
+ *     by the QuantizationRangeForS8S8MultiplicationStruct kernel.
+ * \param attrs node attributes holding the parsed QuantizeElemwiseMulParam
+ * \param ctx operator context, used for its cpu stream
+ * \param inputs lhs, rhs, lhs_min, lhs_max, rhs_min, rhs_max
+ * \param req write requests (not consulted here)
+ * \param outputs output, followed by min/max outputs unless float output is enabled
+ */
+void QuantizedElemwiseMulOpForward(const nnvm::NodeAttrs &attrs,
+                                   const OpContext &ctx,
+                                   const std::vector<TBlob> &inputs,
+                                   const std::vector<OpReqType> &req,
+                                   const std::vector<TBlob> &outputs) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  using namespace mxnet_op;
+  const bool calibrated =
+      params.max_calib_range.has_value() && params.min_calib_range.has_value();
+
+  float lhs_min = inputs[quantized_elemwise_mul::kLhsMin].dptr<float>()[0];
+  float lhs_max = inputs[quantized_elemwise_mul::kLhsMax].dptr<float>()[0];
+  float rhs_min = inputs[quantized_elemwise_mul::kRhsMin].dptr<float>()[0];
+  float rhs_max = inputs[quantized_elemwise_mul::kRhsMax].dptr<float>()[0];
+
+  float cached_output_min_ = 0.f;
+  float cached_output_max_ = 0.f;
+  float out_scale = 1.f;
+  if (params.enable_float_output) {
+    // Float output: divide out both input scales.
+    auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
+    auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
+    out_scale = 1.0 / lhs_scale / rhs_scale;
+  } else if (calibrated) {
+    // Calibrated output: rescale the product to the calibrated range.
+    const float output_data_range =
+        outputs[quantized_elemwise_mul::kOut].type_flag_ == mshadow::kInt8 ? kInt8Range : kInt32Range;
+    cached_output_min_ = params.min_calib_range.value();
+    cached_output_max_ = params.max_calib_range.value();
+    const float out_data_scale =
+        output_data_range / MaxAbs(cached_output_min_, cached_output_max_);
+    auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
+    auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
+    out_scale = out_data_scale / lhs_scale / rhs_scale;
+  } else {
+    // No calibration: out_scale stays 1 and the kernel below produces the output range.
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    if (inputs[quantized_elemwise_mul::kLhs].type_flag_ == mshadow::kInt8 &&
+        inputs[quantized_elemwise_mul::kRhs].type_flag_ == mshadow::kInt8) {
+      mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(
+          s, 1, &cached_output_min_, &cached_output_max_, &lhs_min, &lhs_max, &rhs_min, &rhs_max);
+    } else {
+      LOG(ERROR) << "lhs and rhs only support int8 dtype.";
+    }
+  }
+
+  size_t out_size = outputs[quantized_elemwise_mul::kOut].Size();
+  auto *input_l = inputs[quantized_elemwise_mul::kLhs].dptr<int8_t>();
+  auto *input_r = inputs[quantized_elemwise_mul::kRhs].dptr<int8_t>();
+  // TODO(Xinyu): a temp solution to enable Elemwise INT8 computation,
+  // will be refactored after the DNNL primitive is done.
+  const TBlob &out = outputs[quantized_elemwise_mul::kOut];
+  if (params.enable_float_output) {
+    QuantizedElemwiseMulKernel(input_l, input_r, out.dptr<float_t>(), out_size, out_scale);
+  } else if (calibrated) {
+    QuantizedElemwiseMulKernel(input_l, input_r, out.dptr<int8_t>(), out_size, out_scale);
+  } else {
+    QuantizedElemwiseMulKernel(input_l, input_r, out.dptr<int32_t>(), out_size, out_scale);
+  }
+
+  if (!params.enable_float_output) {
+    outputs[quantized_elemwise_mul::kOutMin].dptr<float>()[0] = cached_output_min_;
+    outputs[quantized_elemwise_mul::kOutMax].dptr<float>()[0] = cached_output_max_;
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_elemwise_mul)
+.describe(R"code(Multiplies arguments int8 element-wise.
+)code" ADD_FILELINE)
+.set_num_inputs(6)  // lhs, rhs and a min/max pair for each of them
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  return (!params.enable_float_output) ? 3 : 1;  // min/max outputs exist only without float output
+})
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"lhs", "rhs", "lhs_min", "lhs_max", "rhs_min", "rhs_max"};
+  })
+.set_attr<nnvm::FListOutputNames>("FListOutputNames", QuantizedElemwiseMulOutputNames)
+.set_attr<mxnet::FInferShape>("FInferShape", QuantizedElemwiseMulOpShape)
+.set_attr<nnvm::FInferType>("FInferType", QuantizedElemwiseMulOpType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedElemwiseMulOpStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", QuantizedElemwiseMulOpForward)  // CPU implementation
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.add_argument("lhs", "NDArray-or-Symbol", "first input")
+.add_argument("rhs", "NDArray-or-Symbol", "second input")
+.add_argument("lhs_min", "NDArray-or-Symbol", "Minimum value of first input.")
+.add_argument("lhs_max", "NDArray-or-Symbol", "Maximum value of first input.")
+.add_argument("rhs_min", "NDArray-or-Symbol", "Minimum value of second input.")
+.add_argument("rhs_max", "NDArray-or-Symbol", "Maximum value of second input.")
+.set_attr_parser(ParamParser<QuantizeElemwiseMulParam>)
+.add_arguments(QuantizeElemwiseMulParam::__FIELDS__());
+
+NNVM_REGISTER_OP(elemwise_mul)  // attaches the quantized counterpart to elemwise_mul
+.set_attr<FQuantizedOp>("FQuantizedOp", [](const NodeAttrs& attrs) {
+  nnvm::NodePtr node = nnvm::Node::Create();
+  node->attrs.op = Op::Get("_contrib_quantized_elemwise_mul");
+  node->attrs.name = "quantized_" + attrs.name;
+  node->attrs.dict = attrs.dict;  // copy the source node's attribute dictionary
+  if (node->op()->attr_parser != nullptr) {
+    node->op()->attr_parser(&(node->attrs));  // fill attrs.parsed from the copied dict
+  }
+  return node;
+});
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h
new file mode 100644
index 000000000000..1469395ec169
--- /dev/null
+++ b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file mkldnn_elemwisemul_post_quantize_property.h
+ * \brief Partition graph property for MKLDNN Quantized ElemwiseMul operator
+ * \author Xinyu Chen
+*/
+
+#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
+#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
+#if MXNET_USE_MKLDNN == 1
+
+#include <string>
+#include <vector>
+#include "../../tensor/elemwise_binary_op-inl.h"
+#include "../../quantization/requantize-inl.h"
+#include "../common.h"
+#include "mkldnn_subgraph_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+#define QUANTIZED_ElemwiseMul_NAME "_contrib_quantized_elemwise_mul"
+
+/*! \brief Matches quantized_elemwise_mul followed by requantize and/or dequantize. */
+class ElemwiseMulPostQuantizeSelector : public SubgraphSelector {
+ public:
+  /*! \brief pattern match status */
+  enum SelectStatus {
+    kFail = 0,
+    kStart,
+    kRequantize,
+    kSuccess,
+  };
+
+ private:
+  bool disable_all;
+  bool disable_float_output;
+  SelectStatus status;  // kFail until Select() accepts a node and moves it to kStart
+  std::vector<const nnvm::Node *> matched_list;
+
+ public:
+  explicit ElemwiseMulPostQuantizeSelector(const bool dis_all,
+                                           const bool dis_float_output)
+      : disable_all(dis_all),
+        disable_float_output(dis_float_output), status(kFail) {}
+
+  bool Select(const nnvm::Node &n) override {
+    if ((!disable_all) && n.op() == Op::Get(QUANTIZED_ElemwiseMul_NAME)) {
+      status = kStart;  // disable_all is false on this path
+      matched_list.clear();
+      matched_list.push_back(&n);
+      return true;
+    }
+    return false;
+  }
+
+  bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    return false;
+  }
+
+  bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    if (status == kFail || status == kSuccess || new_node.is_variable())
+      return false;
+    // If n isn't the last matched node, then we encountered an internal
+    // branch, we should pop out the node behind n and stop fusion.
+    if (matched_list.back() != &n) {
+      if (std::find(matched_list.begin(), matched_list.end(), &n) != matched_list.end()) {
+        while (matched_list.back() != &n) {
+          matched_list.pop_back();
+        }
+      }
+
+      status = kSuccess;
+      return false;
+    }
+
+    switch (status) {
+      case kStart:
+        if (new_node.op() == Op::Get("_contrib_requantize")) {
+          auto const &param = nnvm::get<RequantizeParam>(new_node.attrs.parsed);
+          if (param.min_calib_range.has_value() &&
+              param.max_calib_range.has_value()) {
+            matched_list.push_back(&new_node);
+            status = kRequantize;
+            return true;
+          }
+        }
+        // falls through (no break): the dequantize test below also runs in kStart
+      case kRequantize:
+        if ((!disable_float_output) && (new_node.op() == Op::Get("_contrib_dequantize"))) {
+          matched_list.push_back(&new_node);
+          status = kSuccess;
+          return true;
+        }
+        // falls through (no break): any other node ends the match
+      default:
+        status = kSuccess;
+        return false;
+    }
+  }
+
+  std::vector<nnvm::Node *> Filter(const std::vector<nnvm::Node *> &candidates) override {
+    if ((status != kSuccess) || (matched_list.size() <= 1)) {
+      return std::vector<nnvm::Node *>(0);
+    }
+    // Keep the matched nodes that are also candidates, in match order.
+    std::vector<nnvm::Node *> ret;
+    for (auto i : matched_list) {
+      auto non_const_i = const_cast<nnvm::Node *>(i);
+      if (std::find(candidates.begin(), candidates.end(), non_const_i) != candidates.end()) {
+        ret.push_back(non_const_i);
+      }
+    }
+    return ret;
+  }
+
+  void Reset() override {
+    CHECK_GE(matched_list.size(), 1);
+    auto new_selector = ElemwiseMulPostQuantizeSelector(disable_all, disable_float_output);
+    new_selector.Select(*matched_list[0]);
+    *this = new_selector;
+  }
+};
+
+/*! \brief Subgraph property fusing quantized_elemwise_mul with requantize/dequantize. */
+class ElemwiseMulPostQuantizeProperty : public SubgraphProperty {
+ public:
+  // Both switches are read from the environment and default to false.
+  ElemwiseMulPostQuantizeProperty()
+      : disable_fuse_all(dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FUSE_ALL", false)),
+        disable_float_output(dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FLOAT_OUTPUT", false)) {}
+
+  static SubgraphPropertyPtr Create() {
+    static const std::string &name = "MKLDNN EltwiseMul post-quantization optimization pass";
+    auto prop = std::make_shared<ElemwiseMulPostQuantizeProperty>();
+    prop->SetAttr<std::string>("property_name", name);
+    prop->SetAttr<bool>("inference_only", true);
+    return prop;
+  }
+
+  // Finds the quantized_elemwise_mul, requantize and dequantize nodes of the subgraph,
+  // folds the fusion settings into the multiply node's attributes and returns that node.
+  nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                   const int subgraph_id = 0) const override {
+    nnvm::NodePtr em_node = nullptr;
+    nnvm::NodePtr requantize_node = nullptr;
+    nnvm::NodePtr dequantize_node = nullptr;
+
+    DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) {
+      if (node->is_variable()) return;
+      const auto *node_op = node->op();
+      if (node_op == Op::Get(QUANTIZED_ElemwiseMul_NAME)) {
+        em_node = node;
+      } else if (node_op == Op::Get("_contrib_requantize")) {
+        requantize_node = node;
+      } else if (node_op == Op::Get("_contrib_dequantize")) {
+        dequantize_node = node;
+      }
+    });
+
+    CHECK_NOTNULL(em_node);
+    CHECK_NOTNULL(requantize_node);
+    auto const &requantize_param = nnvm::get<RequantizeParam>(requantize_node->attrs.parsed);
+    CHECK(requantize_param.min_calib_range.has_value());
+    CHECK(requantize_param.max_calib_range.has_value());
+
+    // With only quantized_elemwise_mul + requantize fused, set min/max_calib_range;
+    // with a dequantize fused as well, switch the node to float output instead.
+    auto &attr_dict = em_node->attrs.dict;
+    if (dequantize_node == nullptr) {
+      attr_dict["min_calib_range"] = std::to_string(requantize_param.min_calib_range.value());
+      attr_dict["max_calib_range"] = std::to_string(requantize_param.max_calib_range.value());
+    } else {
+      attr_dict["enable_float_output"] = "True";
+    }
+    em_node->op()->attr_parser(&(em_node->attrs));
+    return em_node;
+  }
+
+  SubgraphSelectorPtr CreateSubgraphSelector() const override {
+    return std::make_shared<ElemwiseMulPostQuantizeSelector>(disable_fuse_all,
+                                                             disable_float_output);
+  }
+
+  void ConnectSubgraphOutputs(
+      const nnvm::NodePtr n,
+      std::vector<nnvm::NodeEntry *> *output_entries) const override {
+    // Point every output entry at node n, keeping the entry's index.
+    for (nnvm::NodeEntry *entry : *output_entries) {
+      *entry = nnvm::NodeEntry{n, entry->index, 0};
+    }
+  }
+
+ private:
+  bool disable_fuse_all;
+  bool disable_float_output;
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // if MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
index 269017ea6a03..18cd3031ef18 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
@@ -23,6 +23,7 @@
 #include "mkldnn_fc_property.h"
 #include "mkldnn_post_quantize_property.h"
 #include "mkldnn_fc_post_quantize_property.h"
+#include "mkldnn_elemwisemul_post_quantize_property.h"
 #include "mkldnn_post_quantize_align_scale_property.h"
 
 namespace mxnet {
@@ -57,6 +58,7 @@ MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeProperty);
 
 #if MXNET_USE_MKLDNN == 1
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty);
 
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty);
 #endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh
index 41940e4b1e07..6cd7dd50657a 100644
--- a/src/operator/tensor/broadcast_reduce-inl.cuh
+++ b/src/operator/tensor/broadcast_reduce-inl.cuh
@@ -634,6 +634,16 @@ void Reduce(Stream<gpu> *s, const TBlob& small, const OpReqType req,
   }
 }
 
+template<typename Reducer, int ndim, typename DType, typename OP, bool safe_acc = false>
+void ReduceBool(Stream<gpu> *s, const TBlob& small, const OpReqType req,
+                const Tensor<gpu, 1, char>& workspace, const TBlob& big) {
+  if (req == kNullOp) return;  // kNullOp: nothing is launched
+  cudaStream_t stream = Stream<gpu>::GetStream(s);  // CUDA stream obtained from s
+  ReduceImplConfig<ndim> config =
+    ConfigureReduceImpl<ndim, DType>(small.shape_, big.shape_, NULL, NULL);
+  ReduceImpl<Reducer, ndim, bool, DType, bool, OP>(stream, small, req, big, workspace, config);
+}
+
 template <typename Reducer, int ndim, typename DType, typename OP>
 void ReduceWithExtraMem(Stream<gpu>* s, const TBlob& small, const OpReqType req,
                         const Tensor<gpu, 1, char>& workspace, const TBlob& big) {};
diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h
index 0a20e1263fbf..841fbcd28a68 100644
--- a/src/operator/tensor/broadcast_reduce-inl.h
+++ b/src/operator/tensor/broadcast_reduce-inl.h
@@ -255,6 +255,18 @@ void Reduce(Stream<cpu>* s, const TBlob& small, const OpReqType req,
   }
 }
 
+template <typename Reducer, int ndim, typename DType, typename OP>
+void ReduceBool(Stream<cpu>* s, const TBlob& small, const OpReqType req,
+                const Tensor<cpu, 1, char>& workspace, const TBlob& big) {
+  if (req == kNullOp) return;  // kNullOp: nothing is computed
+  Shape<ndim> rshape, rstride;
+  diff(small.shape_.get<ndim>(), big.shape_.get<ndim>(), &rshape, &rstride);
+  size_t N = small.shape_.Size(), M = rshape.Size();  // N: size of small, M: size of rshape
+  seq_reduce_compute<Reducer, ndim, bool, DType, bool, OP>(
+    N, M, req == kAddTo, big.dptr<DType>(), small.dptr<bool>(),  // small is accessed as bool
+    big.shape_.get<ndim>(), small.shape_.get<ndim>(), rshape, rstride);
+}
+
 template <typename Reducer, int ndim, typename DType, typename OP>
 void ReduceWithExtraMem(Stream<cpu>* s, const TBlob& small, const OpReqType req,
                         const Tensor<cpu, 1, char>& workspace, const TBlob& big) {
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cc b/src/operator/tensor/broadcast_reduce_norm_value.cc
index 4cd92d44997e..9acc157f8eca 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cc
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cc
@@ -40,7 +40,7 @@ void L2NormComputeEx<cpu>(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
        param.ord == 2) {
     // l2 norm on the entire array
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cu b/src/operator/tensor/broadcast_reduce_norm_value.cu
index 188c93e61221..735c3d7faec9 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cu
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cu
@@ -39,7 +39,7 @@ void L2NormComputeEx<gpu>(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
        param.ord == 2) {
     // l2 norm on the entire array
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 27e22491ca35..608e44dcfe76 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -636,6 +636,39 @@ void ReduceAxesComputeImpl(const OpContext& ctx,
   });
 }
 
+template<typename xpu, typename reducer, bool safe_acc, bool normalize = false,
+         typename OP = op::mshadow_op::NonZero>
+void ReduceAxesComputeBoolImpl(const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs,
+                               const mxnet::TShape& small) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  // Reduces inputs[0] into outputs[0] via broadcast::ReduceBool on compacted shapes.
+  mxnet::TShape src_shape, dst_shape;
+  BroadcastReduceShapeCompact(inputs[0].shape_, small, &src_shape, &dst_shape);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH_WITH_BOOL(outputs[0].type_flag_, OType, {
+      const TBlob in_data = inputs[0].reshape(src_shape);
+      const TBlob out_data = outputs[0].reshape(dst_shape);
+      BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, {
+        size_t workspace_size = broadcast::ReduceWorkspaceSize<NDim, DType>(
+            s, out_data.shape_, req[0], in_data.shape_);
+        Tensor<xpu, 1, char> workspace =
+            ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s);
+        broadcast::ReduceBool<reducer, NDim, DType, OP>(
+            s, out_data, req[0], workspace, in_data);
+        if (normalize) {  // divide by the input/output element-count ratio
+          auto out = out_data.FlatTo2D<xpu, OType>(s);
+          out /= scalar<OType>(src_shape.Size()/dst_shape.Size());
+        }
+      });
+    });
+  });
+}
+
 template<typename xpu, typename reducer, bool normalize = false,
          typename OP = op::mshadow_op::identity>
 void ReduceAxesCompute(const nnvm::NodeAttrs& attrs,
@@ -1152,7 +1185,7 @@ inline bool LpNormStorageType(const nnvm::NodeAttrs& attrs,
                                      DispatchMode::kFCompute);
   }
   if (param.ord == 2) {
-    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
     if (!dispatched && (in_stype == kRowSparseStorage || in_stype == kCSRStorage) &&
         axis.ndim() == 0 && param.ord == 2) {
       // l2 norm: rsp/csr, axis = () -> dns
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cc b/src/operator/tensor/broadcast_reduce_op_value.cc
index 43bfc729329a..31e0dd054afd 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cc
+++ b/src/operator/tensor/broadcast_reduce_op_value.cc
@@ -32,6 +32,41 @@ DMLC_REGISTER_PARAMETER(BroadcastAxesParam);
 DMLC_REGISTER_PARAMETER(BroadcastToParam);
 DMLC_REGISTER_PARAMETER(BroadcastLikeParam);
 
+// For every i in [0, outer * size): copies the `inner` elements starting at
+// src + (i / size) * inner to dst + (i / size) * size * inner + (i % size) * inner.
+// The loop iterations are distributed over OpenMP threads.
+template<typename DType>
+void BroadcastAxisKer(DType* src, DType* dst,
+                      index_t outer, index_t inner, index_t size) {
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+  for (index_t i = 0; i < outer * size; i++) {
+    const index_t row = i / size;
+    const index_t rep = i % size;
+    DType* out_row = dst + row * size * inner + rep * inner;
+    memcpy(out_row, src + row * inner, inner * sizeof (DType));
+  }
+}
+
+// FCompute<cpu> for broadcast_axis: a single axis of extent 1 written with kWriteTo goes
+// through BroadcastAxisKer; every other case uses BroadcastComputeImpl<cpu>.
+inline void BroadcastAxisComputeCPU(
+    const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  const BroadcastAxesParam& param = nnvm::get<BroadcastAxesParam>(attrs.parsed);
+  if (param.axis.ndim() != 1 || inputs[0].shape_[param.axis[0]] != 1 || req[0] != kWriteTo) {
+    BroadcastComputeImpl<cpu>(attrs, ctx, inputs, req, outputs, inputs[0].shape_);
+    return;
+  }
+  MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    auto dst = outputs[0].dptr<DType>();
+    auto src = inputs[0].dptr<DType>();
+    index_t outer = inputs[0].shape_.ProdShape(0, param.axis[0]);
+    index_t inner = inputs[0].shape_.ProdShape(param.axis[0], inputs[0].shape_.ndim());
+    BroadcastAxisKer(src, dst, outer, inner, param.size[0]);
+  });
+}
+
 MXNET_OPERATOR_REGISTER_BROADCAST(broadcast_axis)
 .add_alias("broadcast_axes")
 .describe(R"code(Broadcasts the input array over particular axes.
@@ -59,7 +94,7 @@ Example::
 .set_attr_parser(ParamParser<BroadcastAxesParam>)
 .add_arguments(BroadcastAxesParam::__FIELDS__())
 .set_attr<mxnet::FInferShape>("FInferShape", BroadcastAxesShape)
-.set_attr<FCompute>("FCompute<cpu>", BroadcastCompute<cpu>);
+.set_attr<FCompute>("FCompute<cpu>", BroadcastAxisComputeCPU);
 
 MXNET_OPERATOR_REGISTER_BROADCAST(broadcast_to)
 .describe(R"code(Broadcasts the input array to a new shape.
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 0c501808a6c0..cd0bd8deeac3 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -1244,7 +1244,7 @@ void SliceAssignScalarOpForward(const nnvm::NodeAttrs& attrs,
       const int b = begin[i], e = end[i], s = step[i];
       SetSliceOpOutputDimSize(data.shape_, i, b, e, s, &vshape);
     }
-    MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH_WITH_BOOL(out.type_flag_, DType, {
       mxnet_op::Kernel<slice_assign_scalar<ndim>, xpu>::Launch(s, vshape.FlatTo2D()[0],
           out.dptr<DType>(), static_cast<DType>(param.scalar), req[0],
           out.shape_.get<ndim>(), vshape.get<ndim>(), begin, step);
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 527737e03cd7..0c40f32d1666 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -341,6 +341,66 @@ def check_quantized_elemwise_add(data_shape, qtype):
         check_quantized_elemwise_add((3, 4, 56, 56), qtype)
         check_quantized_elemwise_add((32, 56, 64, 11), qtype)
 
+@with_seed()
+def test_quantized_elemwise_mul():
+    def check_quantized_elemwise_mul(data_shape, qtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_elemwise_mul for native cpu since it is not supported yet')
+            return
+        elif qtype != 'int8':
+            print('skipped testing quantized_elemwise_mul for not supported data type')
+            return
+        elif is_test_for_gpu():
+            print('skipped testing quantized_elemwise_mul for gpu since it is not supported yet')
+            return
+
+        dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32')
+        dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32')
+        elemwise_mul_fp32 = mx.sym.elemwise_mul(dataA, dataB)
+        arg_names = elemwise_mul_fp32.list_arguments()
+        elemwise_mul_fp32_exe = elemwise_mul_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qtype == 'uint8':
+            data_low = 0.0
+            data_high = 255.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+
+        dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        elemwise_mul_fp32_exe.arg_dict[arg_names[0]][:] = dataA_val
+
+        elemwise_mul_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val
+
+        output = elemwise_mul_fp32_exe.forward()[0]
+
+        qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype)
+        qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype)
+        min_dataA = mx.sym.Variable(name='min_dataA')
+        max_dataA = mx.sym.Variable(name='max_dataA')
+        min_dataB = mx.sym.Variable(name='min_dataB')
+        max_dataB = mx.sym.Variable(name='max_dataB')
+        quantized_elemwise_mul = mx.sym.contrib.quantized_elemwise_mul(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB)
+        elemwise_mul_int8_exe = quantized_elemwise_mul.simple_bind(ctx=mx.current_context(), grad_req='null')
+        qarg_names = quantized_elemwise_mul.list_arguments()
+        elemwise_mul_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[0]].astype(qtype)
+        elemwise_mul_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[1]].astype(qtype)
+        quantized_range = 127.0
+        elemwise_mul_int8_exe.arg_dict[qarg_names[2]][:] = data_low
+        elemwise_mul_int8_exe.arg_dict[qarg_names[3]][:] = data_high
+        elemwise_mul_int8_exe.arg_dict[qarg_names[4]][:] = data_low
+        elemwise_mul_int8_exe.arg_dict[qarg_names[5]][:] = data_high
+        qoutput, min_range, max_range = elemwise_mul_int8_exe.forward()
+
+        fp32_rslt = output.asnumpy()
+        int8_rslt = qoutput.astype(output.dtype)
+        assert_almost_equal(fp32_rslt, int8_rslt, atol = 1e-4)
+
+    for qtype in ['int8', 'uint8']:
+        check_quantized_elemwise_mul((4, 6), qtype)
+        check_quantized_elemwise_mul((13, 74, 52), qtype)
+        check_quantized_elemwise_mul((3, 4, 56, 56), qtype)
+        check_quantized_elemwise_mul((32, 56, 64, 11), qtype)
 
 @with_seed()
 def test_quantized_pooling():
@@ -1005,7 +1065,7 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
                     else:
                         excluded_sym_names = excluded_names + optional_names
             if name == 'sym4':
-                excluded_op_names += ['elemwise_add']
+                excluded_op_names += ['elemwise_add', 'elemwise_mul']
 
             qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=s,
                                                                              arg_params=arg_params,
diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py
index a1e5128d8ac6..e7273fba35d5 100644
--- a/tests/python/unittest/test_metric.py
+++ b/tests/python/unittest/test_metric.py
@@ -18,6 +18,7 @@
 import mxnet as mx
 import numpy as np
 import scipy
+from scipy.stats import pearsonr
 import json
 import math
 from common import with_seed
@@ -267,7 +268,7 @@ def test_pearsonr():
     pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])
     label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]])
     pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1]
-    pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel())
+    pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel())
     macro_pr = mx.metric.create('pearsonr', average='macro')
     micro_pr = mx.metric.create('pearsonr', average='micro')
 
@@ -289,7 +290,7 @@ def test_pearsonr():
     label12 = mx.nd.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]])
 
     pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1]
-    pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel())
+    pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel())
 
     macro_pr.reset()
     micro_pr.update([label2], [pred2])
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index fcdf547bfbec..bb45fead7cd9 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -59,6 +59,32 @@ def get_workloads(name):
         return OpArgMngr._args.get(name, None)
 
 
+def _add_workload_all():
+    # clear exactly one element, at each position in turn, of an all-True array
+    for i in range(256-7):
+        e = np.array([True] * 256, dtype=bool)[7::]
+        e[i] = False
+        OpArgMngr.add_workload('all', e)
+    # big array test for blocked libc loops
+    for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+        e = np.array([True] * 100043, dtype=bool)
+        e[i] = False
+        OpArgMngr.add_workload('all', e)
+
+
+def _add_workload_any():
+    # set exactly one element, at each position in turn, of an all-False array
+    for i in range(256-7):
+        d = np.array([False] * 256, dtype=bool)[7::]
+        d[i] = True
+        OpArgMngr.add_workload('any', d)
+    # big array test for blocked libc loops
+    for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+        d = np.array([False] * 100043, dtype=bool)
+        d[i] = True
+        OpArgMngr.add_workload('any', d)
+
+
 def _add_workload_unravel_index():
     OpArgMngr.add_workload('unravel_index', indices=np.array([2],dtype=_np.int64), shape=(2, 2))
     OpArgMngr.add_workload('unravel_index', np.array([(2*3 + 1)*6 + 4], dtype=_np.int64), (4, 3, 6))
@@ -416,6 +442,66 @@ def _add_workload_linalg_tensorinv():
                 OpArgMngr.add_workload('linalg.tensorinv', np.array(a, dtype=dtype), ind)
 
 
+def _add_workload_linalg_tensorsolve():
+    shapes = [
+        # a_shape.ndim <= 6
+        # (a_shape, b_shape, axes)
+        ((1, 1), (1,), None),
+        ((1, 1), (1, 1, 1, 1, 1), None),
+        ((4, 4), (4,), None),
+        ((2, 3, 3, 4, 2), (3, 4), (0, 2, 4)),
+        ((1, 3, 3, 4, 4), (1, 3, 4), (1, 3)),
+        ((1, 4, 1, 12, 3), (1, 2, 1, 2, 1, 3, 1), (1, 2, 4)),
+    ]
+    dtypes = (np.float32, np.float64)
+    for dtype in dtypes:
+        for a_shape, b_shape, axes in shapes:
+            a_ndim = len(a_shape)
+            b_ndim = len(b_shape)
+            a_trans_shape = list(a_shape)
+            a_axes = list(range(0, a_ndim))
+            if axes is not None:
+                for k in axes:
+                    a_axes.remove(k)
+                    a_axes.insert(a_ndim, k)
+                for k in range(a_ndim):
+                    a_trans_shape[k] = a_shape[a_axes[k]]
+            x_shape = a_trans_shape[-(a_ndim - b_ndim):]
+            prod = 1
+            for k in x_shape:
+                prod *= k
+            if prod * prod != _np.prod(a_shape):
+                raise ValueError("a is not square")
+            if prod != _np.prod(b_shape):
+                raise ValueError("a's shape and b's shape dismatch")
+            mat_shape = (prod, prod)
+            a_trans_shape = tuple(a_trans_shape)
+            x_shape = tuple(x_shape)
+
+            a_np = _np.eye(prod)
+            shape = mat_shape
+            while 1:
+                # generate well-conditioned matrices with small eigenvalues
+                D = _np.diag(_np.random.uniform(-1.0, 1.0, shape[-1]))
+                I = _np.eye(shape[-1]).reshape(shape)
+                v = _np.random.uniform(-1., 1., shape[-1]).reshape(shape[:-1] + (1,))
+                v = v / _np.linalg.norm(v, axis=-2, keepdims=True)
+                v_T = _np.swapaxes(v, -1, -2)
+                U = I - 2 * _np.matmul(v, v_T)
+                a = _np.matmul(U, D)
+                if (_np.linalg.cond(a, 2) < 4):
+                    a_np = a.reshape(a_trans_shape)
+                    break
+            x_np = _np.random.randn(*x_shape)
+            b_np = _np.tensordot(a_np, x_np, axes=len(x_shape))
+            a_origin_axes = list(range(a_np.ndim))
+            if axes is not None:
+                for k in range(a_np.ndim):
+                    a_origin_axes[a_axes[k]] = k
+            a_np = a_np.transpose(a_origin_axes)
+            OpArgMngr.add_workload('linalg.tensorsolve', np.array(a_np, dtype=dtype), np.array(b_np, dtype=dtype), axes)
+
+
 def _add_workload_linalg_slogdet():
     OpArgMngr.add_workload('linalg.slogdet', np.array(_np.ones((2, 2)), dtype=np.float32))
     OpArgMngr.add_workload('linalg.slogdet', np.array(_np.ones((0, 1, 1)), dtype=np.float64))
@@ -594,6 +680,10 @@ def _add_workload_around():
     OpArgMngr.add_workload('around', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1)
 
 
+def _add_workload_round():
+    OpArgMngr.add_workload('round', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1)
+
+
 def _add_workload_argsort():
     for dtype in [np.int32, np.float32]:
         a = np.arange(101, dtype=dtype)
@@ -701,6 +791,20 @@ def _add_workload_max(array_pool):
     OpArgMngr.add_workload('max', array_pool['4x1'])
 
 
+def _add_workload_amax(array_pool):
+    a = np.array([3, 4, 5, 10, -3, -5, 6.0])
+    b = np.array([[3, 6.0, 9.0],
+                  [4, 10.0, 5.0],
+                  [8, 3.0, 2.0]])
+    c = np.array(1)
+    OpArgMngr.add_workload('amax', array_pool['4x1'])
+    OpArgMngr.add_workload('amax', a)
+    OpArgMngr.add_workload('amax', b, axis=0)
+    OpArgMngr.add_workload('amax', b, axis=1)
+    OpArgMngr.add_workload('amax', c)
+    OpArgMngr.add_workload('amax', c, axis=None)
+
+
 def _add_workload_min(array_pool):
     OpArgMngr.add_workload('min', array_pool['4x1'])
 
@@ -758,7 +862,7 @@ def _add_workload_reshape():
     # OpArgMngr.add_workload('reshape', b, (2, 2), order='F')  # Items are not equal with order='F'
 
     a = np.array(_np.ones((0, 2)))
-    OpArgMngr.add_workload('reshape', a, -1, 2)
+    OpArgMngr.add_workload('reshape', a, (-1, 2))
 
 
 def _add_workload_rint(array_pool):
@@ -855,6 +959,27 @@ def _add_workload_unique():
     # OpArgMngr.add_workload('unique', np.arange(10, dtype=np.uint8).reshape(-1, 2).astype(bool), axis=1)
 
 
+def _add_workload_delete():
+    a = np.arange(5)
+    nd_a = np.arange(5).repeat(2).reshape(1, 5, 2)
+    lims = [-6, -2, 0, 1, 2, 4, 5]
+    steps = [-3, -1, 1, 3]
+    for start in lims:
+        for stop in lims:
+            for step in steps:
+                s = slice(start, stop, step)
+                OpArgMngr.add_workload('delete', a, s)
+                OpArgMngr.add_workload('delete', nd_a, s, axis=1)
+    OpArgMngr.add_workload('delete', a, np.array([]), axis=0)
+    OpArgMngr.add_workload('delete', a, 0)
+    OpArgMngr.add_workload('delete', a, np.array([]))
+    OpArgMngr.add_workload('delete', a, np.array([0, 1]))
+    OpArgMngr.add_workload('delete', a, slice(1, 2))
+    OpArgMngr.add_workload('delete', a, slice(1, -2))
+    k = np.arange(10).reshape(2, 5)
+    OpArgMngr.add_workload('delete', k, slice(60, None), axis=1)
+
+
 def _add_workload_var(array_pool):
     OpArgMngr.add_workload('var', array_pool['4x1'])
     OpArgMngr.add_workload('var', np.array([np.float16(1.)]))
@@ -1439,9 +1564,12 @@ def _prepare_workloads():
         '1x1x0': np.array([[[]]])
     }
 
+    _add_workload_all()
+    _add_workload_any()
     _add_workload_argmin()
     _add_workload_argmax()
     _add_workload_around()
+    _add_workload_round()
     _add_workload_argsort()
     _add_workload_append()
     _add_workload_bincount()
@@ -1461,6 +1589,7 @@ def _prepare_workloads():
     _add_workload_fix()
     _add_workload_flip()
     _add_workload_max(array_pool)
+    _add_workload_amax(array_pool)
     _add_workload_min(array_pool)
     _add_workload_mean(array_pool)
     _add_workload_nonzero()
@@ -1482,6 +1611,7 @@ def _prepare_workloads():
     _add_workload_tile()
     _add_workload_transpose()
     _add_workload_unique()
+    _add_workload_delete()
     _add_workload_var(array_pool)
     _add_workload_zeros_like(array_pool)
     _add_workload_linalg_norm()
@@ -1490,6 +1620,7 @@ def _prepare_workloads():
     _add_workload_linalg_solve()
     _add_workload_linalg_det()
     _add_workload_linalg_tensorinv()
+    _add_workload_linalg_tensorsolve()
     _add_workload_linalg_slogdet()
     _add_workload_trace()
     _add_workload_tril()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index af9228d45991..9f746d4dd47c 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -493,6 +493,84 @@ def is_int(dtype):
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
 
 
+@with_seed()
+@use_np
+def test_np_any():
+    class TestAny(HybridBlock):
+        def __init__(self, axis=None, keepdims=False) :
+            super(TestAny, self).__init__()
+            self._axis = axis
+            self._keepdims = keepdims
+             
+        def hybrid_forward(self, F, a):
+            return F.np.any(a, axis=self._axis, keepdims=self._keepdims)
+    
+    keepdims = [True, False]
+    axes = [True, False]
+    shapes = [(), (5, ), (10, ),
+              (2, 5), (5, 5), (10, 10),
+              (4, 4, 4), (4, 6, 9), (6, 6, 6),
+              (7, 8, 9, 10), (7, 9, 11, 13)]
+    dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
+    
+    combinations = itertools.product([False, True], shapes, dtypes, axes, keepdims)
+    for hybridize, shape, dtype, axis, keepdim in combinations:
+        ndim = len(shape)
+        samples = random.randint(0, ndim)
+        axis = None if not axis else tuple(random.sample([i for i in range(0, ndim)], samples))
+        x = np.random.normal(0, 1.0, size=shape).astype(dtype)
+        test_any = TestAny(axis=axis, keepdims=keepdim)
+        if hybridize:
+            test_any.hybridize()
+        y = test_any(x)
+        expected_ret = _np.any(x.asnumpy(), axis=axis, keepdims=keepdim)
+        assert_almost_equal(y.asnumpy(), expected_ret)
+
+        # test imperative
+        mx_outs = np.any(x, axis=axis, keepdims=keepdim)
+        np_outs = _np.any(x.asnumpy(), axis=axis, keepdims=keepdim)
+        assert_almost_equal(mx_outs.asnumpy(), np_outs)
+
+
+@with_seed()
+@use_np
+def test_np_all():
+    class TestAll(HybridBlock):
+        def __init__(self, axis=None, keepdims=False) :
+            super(TestAll, self).__init__()
+            self._axis = axis
+            self._keepdims = keepdims
+             
+        def hybrid_forward(self, F, a):
+            return F.np.all(a, axis=self._axis, keepdims=self._keepdims)
+    
+    keepdims = [True, False]
+    axes = [True, False]
+    shapes = [(), (5, ), (10, ),
+              (2, 5), (5, 5), (10, 10),
+              (4, 4, 4), (4, 6, 9), (6, 6, 6),
+              (7, 8, 9, 10), (7, 9, 11, 13)]
+    dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
+    
+    combinations = itertools.product([False, True], shapes, dtypes, axes, keepdims)
+    for hybridize, shape, dtype, axis, keepdim in combinations:
+        ndim = len(shape)
+        samples = random.randint(0, ndim)
+        axis = None if not axis else tuple(random.sample([i for i in range(0, ndim)], samples))
+        x = np.random.normal(0, 1.0, size=shape).astype(dtype)
+        test_all = TestAll(axis=axis, keepdims=keepdim)
+        if hybridize:
+            test_all.hybridize()
+        y = test_all(x)
+        expected_ret = _np.all(x.asnumpy(), axis=axis, keepdims=keepdim)
+        assert_almost_equal(y.asnumpy(), expected_ret)
+
+        # test imperative
+        mx_outs = np.all(x, axis=axis, keepdims=keepdim)
+        np_outs = _np.all(x.asnumpy(), axis=axis, keepdims=keepdim)
+        assert_almost_equal(mx_outs.asnumpy(), np_outs)
+
+
 @with_seed()
 @use_np
 def test_np_max_min():
@@ -2754,6 +2832,81 @@ def hybrid_forward(self, F, x):
             assert same(ret_mx.asnumpy(), ret_np)
 
 
+@with_seed()
+@use_np
+def test_np_delete():
+    class TestDelete(HybridBlock):
+        def __init__(self, obj, axis=None):
+            super(TestDelete, self).__init__()
+            self._obj = obj
+            self._axis = axis
+
+        def hybrid_forward(self, F, a):
+            return F.np.delete(a, self._obj, axis=self._axis)
+    
+    def GetSize(shp):
+        if len(shp) == 0:
+            return 0
+        else:
+            res = 1
+            shp_list = list(shp)
+            for x in shp:
+                res *= x
+            return res
+
+    def GetDimSize(shp, axis):
+        if axis is None:
+            return GetSize(shp)
+        shp_list = list(shp)
+        return shp_list[axis]
+
+    shape = [(), (0, ), (1, ), (2, 3), (2, 1, 4, 5)]
+    config = []
+    for shp in shape:
+        for ax in range(-1 * len(shp), len(shp), 2):
+            #test slice
+            for st in [-5, -2, 0, 2, 5, None]:
+                for ed in [-5, -2, 0, 2, 5, None]:
+                    for stp in [-5, -2, 2, 5, None]:
+                        config.append(tuple([shp, slice(st, ed, stp), None]))
+                        config.append(tuple([shp, slice(st, ed, stp), ax]))
+            # test integer indices
+            for idx in range(-1 * GetDimSize(shp, ax), GetDimSize(shp, ax)):
+                config.append(tuple([shp, idx, ax]))
+            #test ndarray indices
+            idx =  _np.random.randint(-1 * shp[ax], shp[ax] + 1, size = (4)).tolist()
+            config.append(tuple([shp, idx, ax]))
+
+    for arr_shape, obj, axis in config:
+        for objtype in ['int32', 'int64']:
+            if type(obj) == list:
+                obj_mxnp = np.array(obj, dtype=objtype)
+                obj_onp = _np.array(obj, dtype=objtype)
+            elif type(obj) == slice:
+                obj_mxnp = obj
+                obj_onp = obj
+            else:
+                obj_mxnp = (_np.int32(obj) if objtype == 'int32' else _np.int64(obj)) 
+                obj_onp = (_np.int32(obj) if objtype == 'int32' else _np.int64(obj)) 
+            test_delete = TestDelete(obj=obj_mxnp, axis=axis)
+
+            a = mx.nd.random.uniform(-1.0, 1.0, shape=arr_shape).as_np_ndarray()
+            a.attach_grad()
+            expected_ret = _np.delete(a.asnumpy(), obj_onp, axis=axis)
+
+            with mx.autograd.record():
+                y = test_delete(a)
+
+            assert y.shape == expected_ret.shape
+            assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
+
+            #test imperative
+            mx_out = np.delete(a, obj_mxnp, axis=axis)
+            np_out = _np.delete(a.asnumpy(), obj_onp, axis=axis)
+
+            assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+
+
 @with_seed()
 @use_np
 def test_np_argmin_argmax():
@@ -3089,6 +3242,28 @@ def hybrid_forward(self, F, x):
                 assert out.shape == expected_shape
 
 
+@with_seed()
+@use_np
+def test_np_randn():
+    # Test shapes.
+    shapes = [
+        (3, 3),
+        (3, 4),
+        (0, 0),
+        (3, 3, 3),
+        (0, 0, 0),
+        (2, 2, 4, 3),
+        (2, 2, 4, 3),
+        (2, 0, 3, 0),
+        (2, 0, 2, 3)
+    ]
+    dtypes = ['float16', 'float32', 'float64']
+    for dtype in dtypes:
+        for shape in shapes:
+            data_mx = np.random.randn(*shape, dtype=dtype)
+            assert data_mx.shape == shape
+
+
 @with_seed()
 @use_np
 def test_random_seed():
@@ -3962,6 +4137,160 @@ def get_grad_A(A, ind):
     check_tensorinv(mx_out, a, ind)
 
 
+@with_seed()
+@use_np
+def test_np_linalg_tensorsolve():
+    class TestTensorsolve(HybridBlock):
+        def __init__(self, axes):
+            super(TestTensorsolve, self).__init__()
+            self._axes = axes
+
+        def hybrid_forward(self, F, a, b):
+            return F.np.linalg.tensorsolve(a, b, axes=self._axes)
+
+    def get_tensorsolve_backward(a_np, b_np, mx_out_np, a_axes, a_origin_axes, a_trans_shape):
+        if (a_np.ndim == 0 or b_np.ndim == 0) or (a_np.ndim == b_np.ndim):
+            a_shape = a_np.shape
+            b_shape = b_np.shape
+            a_np = a_np.reshape((1, 1))
+            b_np = b_np.reshape((1,))
+            mx_out_np = mx_out_np.reshape((1,))
+            dx = _np.ones_like(mx_out_np)
+            inv_a_temp_np = _np.linalg.inv(a_np)
+            grad_b = inv_a_temp_np[0][0] * dx[0]
+            grad_a = -grad_b * mx_out_np[0]
+            return grad_a.reshape(a_shape), grad_b.reshape(b_shape)
+        else:
+            dx = _np.ones_like(mx_out_np)
+            a_np = a_np.transpose(a_axes)
+            ind = a_np.ndim - mx_out_np.ndim
+            tensorinv_a_np = _np.linalg.tensorinv(a_np, ind=ind)
+            a_trans_axes = list(range(a_np.ndim))[a_np.ndim - ind:] + list(range(a_np.ndim))[:a_np.ndim - ind]
+            trans_tensorinv_a_np = tensorinv_a_np.transpose(a_trans_axes)
+            grad_b = _np.tensordot(trans_tensorinv_a_np, dx, axes=dx.ndim)
+            grad_a = _np.tensordot(grad_b, mx_out_np, axes=0)
+            grad_a = grad_a.transpose(a_origin_axes)
+            return -grad_a, grad_b.reshape(b_np.shape)
+
+    def check_tensorsolve(x, a_np, b_np, axes):
+        try:
+            x_expected = _np.linalg.tensorsolve(a_np, b_np, axes=axes)
+        except Exception as e:
+            print("a:", a_np)
+            print("a shape:", a_np.shape)
+            print("b", b_np)
+            print("b shape:", b_np.shape)
+            print(e)
+        else:
+            assert x.shape == x_expected.shape
+            assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
+
+    def shapeInfer(a_shape, b_shape, axes=None):
+        # b_shape - Right-hand tensor shape, which can be of any shape.
+        a_ndim = len(a_shape)
+        b_ndim = len(b_shape)
+        a_trans_shape = list(a_shape)
+        a_axes = list(range(0, a_ndim))
+        if axes is not None:
+            for k in axes:
+                a_axes.remove(k)
+                a_axes.insert(a_ndim, k)
+            for k in range(a_ndim):
+                a_trans_shape[k] = a_shape[a_axes[k]]
+        x_shape = a_trans_shape[-(a_ndim - b_ndim):]
+        prod = 1
+        for k in x_shape:
+            prod *= k
+        if prod * prod != _np.prod(a_shape):
+            raise ValueError("a is not square")
+        if prod != _np.prod(b_shape):
+            raise ValueError("a's shape and b's shape dismatch")
+        return a_axes, (prod, prod), tuple(a_trans_shape), tuple(x_shape)
+
+    def newInvertibleMatrix_2D(shape, max_cond=4):
+        while 1:
+            # generate well-conditioned matrices with small eigenvalues
+            D = _np.diag(_np.random.uniform(-1.0, 1.0, shape[-1]))
+            I = _np.eye(shape[-1]).reshape(shape)
+            v = _np.random.uniform(-1., 1., shape[-1]).reshape(shape[:-1] + (1,))
+            v = v / _np.linalg.norm(v, axis=-2, keepdims=True)
+            v_T = _np.swapaxes(v, -1, -2)
+            U = I - 2 * _np.matmul(v, v_T)
+            a = _np.matmul(U, D)
+            if (_np.linalg.cond(a, 2) < max_cond):
+                return a
+
+    shapes = [
+        # a_shape.ndim <= 6,
+        # (a_shape, b_shape, axes)
+        ((), (), None),                     # a.ndim == 0, b.ndim == 0, with axes must be None
+        ((), (1, 1, 1), None),              # a.ndim == 0, b.ndim != 0, with axes must be None
+        ((1, 1, 1), (), None),              # a.ndim != 0, b.ndim == 0, with axes == None
+        ((1, 1, 1), (), (0, 1, 2)),         # a.ndim != 0, b.ndim == 0, with axes != None
+        ((1, 1, 1), (1, 1, 1), None),       # a.ndim != 0, b.ndim != 0, a.ndim == b.ndim with axes == None
+        ((1, 1, 1), (1, 1, 1), (2, 0, 1)),  # a.ndim != 0, b.ndim != 0, a.ndim == b.ndim with axes != None
+        ((1, 1), (1,), None),               # a.ndim != 0, b.ndim != 0, a.ndim > b.ndim
+        ((1, 1), (1, 1, 1, 1, 1), None),    # a.ndim != 0, b.ndim != 0, a.ndim < b.ndim - a.ndim
+        ((4, 4), (4,), None),
+        ((6, 2, 3), (6,), None),
+        ((2, 3, 6), (6,), (0, 1)),
+        ((3, 4, 2, 3, 2), (3, 4), None),
+        ((2, 1, 4, 2, 4), (2, 4), (0, 1, 2)),
+        ((2, 3, 3, 4, 2), (3, 4), (0, 2, 4)),
+        ((1, 3, 3, 4, 4), (1, 3, 4), (1, 3)),
+        ((1, 12, 4, 1, 3), (1, 2, 1, 2, 1, 3, 1), None),
+        ((1, 4, 1, 12, 3), (1, 2, 1, 2, 1, 3, 1), (1, 2, 4)),
+    ]
+    dtypes = ['float32', 'float64']
+    for hybridize in [True, False]:
+        for dtype in dtypes:
+            for a_shape, b_shape, axes in shapes:
+                rtol = 1e-2 if dtype == 'float32' else 1e-3
+                atol = 1e-4 if dtype == 'float32' else 1e-5
+                test_tensorsolve = TestTensorsolve(axes)
+                if hybridize:
+                    test_tensorsolve.hybridize()
+
+                a_axes, mat_shape, a_trans_shape, x_shape = shapeInfer(a_shape, b_shape, axes)
+                # generate coefficient tensor a and right side tensor b
+                if (len(a_shape) == 0 or len(b_shape) == 0) or (len(a_shape) == len(b_shape)):
+                    a_np = _np.asarray(1).astype(dtype).reshape(a_shape)
+                    b_np = _np.asarray(2).astype(dtype).reshape(b_shape)
+                else:
+                    a_np = newInvertibleMatrix_2D(mat_shape, max_cond=3).reshape(a_trans_shape)
+                    x_np = _np.random.randn(*x_shape)
+                    b_np = _np.tensordot(a_np, x_np, axes=len(x_shape))
+
+                # restore the original axis order of tensor a (inverse of the a_axes permutation)
+                a_origin_axes = list(range(a_np.ndim))
+                if axes is not None:
+                    for k in range(a_np.ndim):
+                        a_origin_axes[a_axes[k]] = k
+                a_np = a_np.transpose(a_origin_axes)
+                a = np.array(a_np, dtype=dtype).reshape(a_shape)
+                b = np.array(b_np, dtype=dtype).reshape(b_shape)
+                a.attach_grad()
+                b.attach_grad()
+
+                with mx.autograd.record():
+                    mx_out = test_tensorsolve(a, b)
+                # check tensorsolve validity
+                assert mx_out.shape == x_shape
+                check_tensorsolve(mx_out, a.asnumpy(), b.asnumpy(), axes)
+
+                # check backward
+                if len(a_shape) != 0 and len(b_shape) != 0:
+                    mx.autograd.backward(mx_out)
+                    grad_a_expected, grad_b_expected = get_tensorsolve_backward(
+                        a.asnumpy(), b.asnumpy(), mx_out.asnumpy(), a_axes, a_origin_axes, a_trans_shape)
+                    assert_almost_equal(a.grad.asnumpy(), grad_a_expected, rtol=rtol, atol=atol)
+                    assert_almost_equal(b.grad.asnumpy(), grad_b_expected, rtol=rtol, atol=atol)
+
+                # check imperative once again
+                mx_out = test_tensorsolve(a, b)
+                check_tensorsolve(mx_out, a.asnumpy(), b.asnumpy(), axes)
+
+
 @with_seed()
 @use_np
 def test_np_linalg_det():
@@ -4413,6 +4742,38 @@ def hybrid_forward(self, F, x):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
 
 
+@with_seed()
+@use_np
+def test_np_round():
+    class TestRound(HybridBlock):
+        def __init__(self, decimals):
+            super(TestRound, self).__init__()
+            self.decimals = decimals
+
+        def hybrid_forward(self, F, x):
+            return F.np.round(x, self.decimals)
+
+    shapes = [(), (1, 2, 3), (1, 0)]
+    types = ['int32', 'int64', 'float32', 'float64']
+    for hybridize in [True, False]:
+        for oneType in types:
+            rtol, atol = 1e-3, 1e-5
+            for shape in shapes:
+                for d in range(-5, 6):
+                    test_round = TestRound(d)
+                    if hybridize:
+                        test_round.hybridize()
+                    x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
+                    np_out = _np.round(x.asnumpy(), d)
+                    mx_out = test_round(x)
+                    assert mx_out.shape == np_out.shape
+                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+                    mx_out = np.round(x, d)
+                    np_out = _np.round(x.asnumpy(), d)
+                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+
 @with_seed()
 @use_np
 def test_np_nonzero():