diff --git a/tests/nightly/test_all.sh b/tests/nightly/test_all.sh
index ba8fd21cb29e..73f0f588fe90 100755
--- a/tests/nightly/test_all.sh
+++ b/tests/nightly/test_all.sh
@@ -124,6 +124,5 @@ juLog -name=Python.Multi.Lenet.Mnist -error=Error python multi_lenet.py
 
 # python: large tensor
 juLog -name=Python.LargeTensor -error=Fail python test_large_array.py
-juLog -name=Python.LargeTensor -error=Fail python test_large_array_mkldnn.py
 
 exit $errors
diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py
index e018462feaed..09e9864e650a 100644
--- a/tests/nightly/test_large_array.py
+++ b/tests/nightly/test_large_array.py
@@ -209,9 +209,13 @@ def test_dot():
 def test_FullyConnected():
     a = nd.ones(shape=(LARGE_X, SMALL_Y))
     b = nd.ones(shape=(SMALL_Y, SMALL_Y))
+    c = nd.ones(shape=(b.shape[0],))
     res = nd.FullyConnected(a, b, num_hidden=b.shape[0], no_bias=True)
     assert np.sum(res[-1].asnumpy() == a.shape[1]) == b.shape[0]
 
+    res = nd.FullyConnected(a, b, c, num_hidden=b.shape[0], no_bias=False)
+    assert np.sum(res[-1].asnumpy() == a.shape[1] + 1) == b.shape[0]
+
 
 def test_broadcast():
     a = nd.ones(shape=(LARGE_X, SMALL_Y))
@@ -397,10 +401,14 @@ def test_unravel_index():
 
 
 def test_transpose():
-    b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y)
-    t = b.T
-    assert np.sum(t[:, -1].asnumpy() == (LARGE_X - 1)) == b.shape[1]
-    assert t.shape == (SMALL_Y, LARGE_X)
+    test_dtypes = [np.float32, np.int64]
+    for dtype in test_dtypes:
+        b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y, dtype=dtype)
+        t = b.T
+        assert t.shape == (SMALL_Y, LARGE_X)
+        ref_out = np.transpose(b.asnumpy())
+        assert_almost_equal(t.asnumpy(), ref_out, rtol=1e-10)
+
 
 
 def test_swapaxes():
@@ -419,9 +427,10 @@ def test_flip():
 
 def test_softmax():
     input_data = mx.nd.ones((SMALL_Y, LARGE_X))
-    true_output = np.full((SMALL_Y, LARGE_X), (1 / SMALL_Y))
-    output = nd.softmax(input_data, axis=0)
-    assert_almost_equal(output.asnumpy(), true_output, rtol=1e-5, atol=1e-5)
+    for axis in [0, 1]:
+        true_output = np.full((SMALL_Y, LARGE_X), (1 / input_data.shape[axis]))
+        output = nd.softmax(input_data, axis=axis)
+        assert_almost_equal(output.asnumpy(), true_output, rtol=1e-5, atol=1e-5)
 
 
 def test_argsort():
@@ -615,9 +624,16 @@ def testSoftmaxOutput():
 
     sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0,
                                use_ignore=False)
+
     ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd},
-                  args_grad={'x': grad_x})
+                  args_grad=None)
+    ex.forward(is_train=False)
+    softmax_out = ex.outputs[0][0].asnumpy()
+    expected_softmax_out = (1/SMALL_Y)*mx.nd.ones((SMALL_Y)).asnumpy()
+    assert np.isclose(softmax_out, expected_softmax_out).all()
 
+    ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd},
+                  args_grad={'x': grad_x})
     ex.forward(is_train=True)
     softmax_out = ex.outputs[0][0].asnumpy()
     expected_softmax_out = (1/SMALL_Y)*mx.nd.ones((SMALL_Y)).asnumpy()
@@ -666,7 +682,7 @@ def test_pooling():
 
     def test_avg_pooling():
         res = mx.nd.Pooling(a, kernel=(5, 5), pool_type='avg')
-        assert res[-1][-1][-1][-1] == 1.0000001
+        assert round(float(res[-1][-1][-1][-1].asscalar())) == 1
         assert res.shape[-1] == SMALL_Y - 5 + 1
 
     def test_max_pooling():
@@ -777,8 +793,29 @@ def test_activation():
 # in future, we could test if mean, var of output
 # matches target output's mean, var
 def test_batchnorm():
-    shape = (LARGE_X, SMALL_Y)
+    def get_ref_mean_var(data, running_mean, running_var, eps, use_global_stats=True):
+        if not use_global_stats:
+            # train mode: compute the batch mean and variance
+            mean = nd.mean(data, axis=1, exclude=True)
+            mean_broad = nd.expand_dims(mean, axis=0)
+            mean_broad = nd.expand_dims(mean_broad, axis=2)
+            mean_broad = nd.expand_dims(mean_broad, axis=3)
+            mean_broad = nd.broadcast_like(mean_broad, data)
+            var = nd.square(data - mean_broad)
+            var = nd.mean(var, axis=1, exclude=True)
+        else:
+            # inference mode: use the running statistics instead
+            mean = running_mean.copy()
+            var = running_var.copy()
+
+        # calculate the inverse of the standard deviation
+        stdvar = 1. / nd.sqrt(var + eps)
+        nd.waitall()
+        return mean, stdvar
+
+    shape = (MEDIUM_X, MEDIUM_X, SMALL_Y, SMALL_Y)
     axis = 1  # default
+    eps = 1e-3
 
     nch = shape[axis]
     data = mx.nd.ones(shape=shape)
@@ -788,8 +825,20 @@ def test_batchnorm():
     bn_running_var = mx.nd.ones(nch)
 
     output = mx.nd.BatchNorm(data, bn_gamma, bn_beta,
-                             bn_running_mean, bn_running_var)
-    assert output.shape == shape
+                             bn_running_mean, bn_running_var, output_mean_var=True)
+    assert output[0].shape == shape
+    mean, stdvar = output[1], output[2]
+
+    ref_mean, ref_stdvar = get_ref_mean_var(data, bn_running_mean, bn_running_var, eps)
+    assert_almost_equal(mean.asnumpy(), ref_mean.asnumpy())
+    assert_almost_equal(stdvar.asnumpy(), ref_stdvar.asnumpy())
+
+
+def test_elemwise_add():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    b = nd.ones(shape=(LARGE_X, SMALL_Y))
+    res = nd.elemwise_add(a, b)
+    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
 
 
 def test_add():
@@ -950,25 +999,30 @@ def test_reshape_like():
 
 
 def test_flatten():
-    a = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y).reshape((LARGE_X//2, 2, SMALL_Y))
-    b = nd.flatten(a)
-    assert b[-1][-1] == (LARGE_X-1)
-    assert b[-1][0] == (LARGE_X-2)
-    assert b.shape == (LARGE_X//2, SMALL_Y*2)
+    test_dtypes = [np.float32, np.int64]
+    for dtype in test_dtypes:
+        a = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y, dtype=dtype).reshape((LARGE_X//2, 2, SMALL_Y))
+        b = nd.flatten(a)
+        assert b.shape == (LARGE_X//2, SMALL_Y*2)
 
 
 def test_expand_dims():
     a = nd.array(np.ones((SMALL_Y, LARGE_X)))
     b = nd.expand_dims(a, axis=1)
     nd.waitall()
+    ref_out = np.expand_dims(a.asnumpy(), axis=1)
     assert b.shape == (SMALL_Y, 1, LARGE_X)
+    assert_almost_equal(b.asnumpy(), ref_out, rtol=1e-10)
 
 
 def test_concat():
     a = nd.array(np.ones((SMALL_Y, LARGE_X)))
     b = nd.array(np.zeros((SMALL_Y, LARGE_X)))
-    c = nd.concat(a,b, dim=0)
-    assert c.shape == (b.shape[0]*2, LARGE_X)
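+    # concat along each axis: the joined axis doubles, the other stays the same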
+    for axis in [0, 1]:
+        c = nd.concat(a, b, dim=axis)
+        assert c.shape[axis] == b.shape[axis] * 2
+        assert c.shape[1-axis] == b.shape[1-axis]
 
 
 def test_stack():
diff --git a/tests/nightly/test_large_array_mkldnn.py b/tests/nightly/test_large_array_mkldnn.py
deleted file mode 100644
index d7027e0f09b0..000000000000
--- a/tests/nightly/test_large_array_mkldnn.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import math
-import numpy as np
-import mxnet as mx
-
-from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, default_context, check_symbolic_forward, create_2d_tensor
-from mxnet import gluon, nd
-from tests.python.unittest.common import with_seed
-
-# dimension constants
-MEDIUM_X = 10000
-LARGE_X = 100000000
-SMALL_X = 100
-SMALL_Y = 50
-LARGE_SIZE = LARGE_X * SMALL_Y
-
-def test_flatten():
-    a = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y, dtype=np.float32).reshape((LARGE_X//2, 2, SMALL_Y))
-    b = nd.flatten(a)
-    assert b.shape == (LARGE_X//2, SMALL_Y*2)
-
-
-def test_expand_dims():
-    a = nd.array(np.ones((SMALL_Y, LARGE_X), dtype=np.float32))
-    b = nd.expand_dims(a, axis=1)
-    nd.waitall()
-    ref_out = np.expand_dims(a.asnumpy(), axis=1)
-    assert b.shape == (SMALL_Y, 1, LARGE_X)
-    assert_almost_equal(b.asnumpy(), ref_out, rtol=1e-10)
-
-
-def test_concat():
-    def ref_concat(a, b, axis):
-      return np.concatenate((a, b), axis=axis)
-
-    a_sym = mx.sym.Variable("a")
-    b_sym = mx.sym.Variable("b")
-    dshape = (SMALL_Y, LARGE_X)
-    a_shape = tuple(dshape)
-    b_shape = tuple(dshape)
-
-    for axis in range(0, 2):
-      z = mx.sym.concat(a_sym, b_sym, dim=axis)
-      a = np.random.uniform(-1, 1, a_shape)
-      b = np.random.uniform(-1, 1, b_shape)
-      exe = z.simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape)
-      out = exe.forward(is_train=False, a=a, b=b)
-      ref_out = ref_concat(a, b, axis=axis)
-      out = out[0].asnumpy()
-      assert_almost_equal(out, ref_out)
-
-
-def test_softmax():
-    input_data = mx.nd.ones((SMALL_Y, LARGE_X), dtype=np.float32)
-    true_output = np.full((SMALL_Y, LARGE_X), (1 / LARGE_X))
-    output = nd.softmax(input_data, axis=1)
-    assert_almost_equal(output.asnumpy(), true_output, rtol=1e-5, atol=1e-5)
-
-
-def test_softmaxOutput():
-    x = mx.sym.Variable('x')
-    label = mx.sym.Variable('label')
-    x_nd = mx.nd.ones((LARGE_X, SMALL_Y))
-    grad_x = mx.nd.zeros((LARGE_X, SMALL_Y))
-    label_nd = mx.nd.ones((LARGE_X))
-
-    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0,
-                               use_ignore=False)
-    ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd},
-                  args_grad={'x': grad_x})
-
-    ex.forward(is_train=False)
-    softmax_out = ex.outputs[0][0].asnumpy()
-    expected_softmax_out = (1/SMALL_Y)*mx.nd.ones((SMALL_Y)).asnumpy()
-    assert np.isclose(softmax_out, expected_softmax_out).all()
-
-
-def test_transpose():
-    b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y, dtype=np.float32)
-    t = b.T
-    assert t.shape == (SMALL_Y, LARGE_X)
-    ref_out = np.transpose(b.asnumpy())
-    assert_almost_equal(t.asnumpy(), ref_out, rtol=1e-10)
-
-
-def test_elemwise_add():
-    a = nd.ones(shape=(LARGE_X, SMALL_Y))
-    b = nd.ones(shape=(LARGE_X, SMALL_Y))
-    res = nd.elemwise_add(a, b)
-    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
-
-
-@with_seed()
-def test_fully_connected(ctx=mx.cpu(0)):
-  data_shape = (100, 50*1000*1000)
-  weight_shape = (50, data_shape[1])
-  bias_shape = (weight_shape[0],)
-  data = nd.random.uniform(shape=data_shape)
-  weight = nd.random.uniform(shape=weight_shape)
-  bias = nd.random.uniform(shape=bias_shape)
-  fc_out = nd.FullyConnected(data=data, weight=weight, bias=bias, num_hidden=weight_shape[0], no_bias=False)
-
-  data_npy = data.asnumpy()
-  weight_npy = np.transpose(weight.asnumpy())
-  bias_npy = bias.asnumpy()
-  ref_out = np.dot(data_npy, weight_npy) + bias_npy
-  assert_almost_equal(fc_out.asnumpy(), ref_out, rtol=1e-4)
-
-
-def test_pooling():
-  a = mx.nd.ones((MEDIUM_X, MEDIUM_X, SMALL_Y, SMALL_Y))
-
-  def test_avg_pooling():
-    res = mx.nd.Pooling(a, kernel=(5, 5), pool_type='avg')
-    assert nd.equal(res[-1][-1][-1][-1].asscalar(), 1.0)
-    assert res.shape[-1] == SMALL_Y - 5 + 1
-
-  def test_max_pooling():
-    res = mx.nd.Pooling(a, kernel=(5, 5), pool_type='max')
-    assert res[-1][-1][-1][-1] == 1.
-    assert res.shape[-1] == SMALL_Y - 5 + 1
-
-  test_avg_pooling()
-  test_max_pooling()
-
-
-def test_batchnorm():
-  data_shape = (MEDIUM_X, MEDIUM_X, SMALL_Y, SMALL_Y)
-  data = mx.symbol.Variable('data', shape=data_shape, dtype='float32')
-  weight = mx.symbol.Variable('weight')
-  bias = mx.symbol.Variable('bias')
-  conv1= mx.symbol.Convolution(data=data, weight=weight, bias=bias, name='conv1', num_filter=64, kernel=(3,3), stride=(1,1))
-  bn1 = mx.symbol.BatchNorm(data=conv1, name='bn1', use_global_stats=True, fix_gamma=False)
-  arg_shapes, out_shapes, aux_shapes = bn1.infer_shape(data=data_shape)
-  arg_names = bn1.list_arguments()
-  aux_names = bn1.list_auxiliary_states()
-  bn_exe = bn1.simple_bind(mx.current_context(), shape=data_shape)
-  # args
-  data = mx.nd.random.uniform(shape=arg_shapes[0])
-  weight = mx.nd.random.uniform(shape=arg_shapes[1])
-  bias = mx.nd.random.uniform(shape=arg_shapes[2])
-  gamma = mx.nd.random.uniform(shape=arg_shapes[3])
-  beta = mx.nd.random.uniform(shape=arg_shapes[4])
-  # auxs
-  moving_mean = mx.nd.zeros(shape=aux_shapes[0])
-  moving_var = mx.nd.ones(shape=aux_shapes[1])
-
-  bn_exe.arg_dict[arg_names[0]][:] = data
-  bn_exe.arg_dict[arg_names[1]][:] = weight
-  bn_exe.arg_dict[arg_names[2]][:] = bias
-  bn_exe.arg_dict[arg_names[3]][:] = gamma
-  bn_exe.arg_dict[arg_names[4]][:] = beta
-  bn_exe.aux_dict[aux_names[0]][:] = moving_mean
-  bn_exe.aux_dict[aux_names[1]][:] = moving_var
-
-  p = bn_exe.forward(is_train=False)
-  p[0].wait_to_read()
-
-
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule()