diff --git a/keras/backend/cntk_backend.py b/keras/backend/cntk_backend.py
index fe89045a941..35f1dd2a7d6 100644
--- a/keras/backend/cntk_backend.py
+++ b/keras/backend/cntk_backend.py
@@ -953,6 +953,9 @@ def normalize_batch_in_training(x, gamma, beta,
         for axis in range(1, ndim(x)):
             if axis in reduction_axes:
                 target_shape.append(1)
+                if ndim(gamma) > axis:
+                    gamma = C.reduce_mean(gamma, axis - 1)
+                    beta = C.reduce_mean(beta, axis - 1)
             else:
                 target_shape.append(x_shape[axis])
 
diff --git a/tests/keras/backend/backend_test.py b/tests/keras/backend/backend_test.py
index f54d7c54328..e7a8459d822 100644
--- a/tests/keras/backend/backend_test.py
+++ b/tests/keras/backend/backend_test.py
@@ -1089,15 +1089,21 @@ def test_batchnorm(self):
                 x_shape = (1, 4) + shape
             else:
                 x_shape = (1,) + shape + (4,)
-            xth = KTH.variable(np.random.random(x_shape))
-            xtf = KTF.variable(np.random.random(x_shape))
+            x_val = np.random.random(x_shape).astype(np.float32)
+            xth = KTH.variable(x_val)
+            xtf = KTF.variable(x_val)
+            xc = KC.placeholder(x_shape)
             zth, _, _ = KTH.normalize_batch_in_training(xth, None, None,
                                                         reduction_axes='per-activation')
             ztf, _, _ = KTF.normalize_batch_in_training(xtf, None, None,
                                                         reduction_axes=[0, 1, 2, 3])
+            zc, _, _ = KC.normalize_batch_in_training(xc, None, None,
+                                                      reduction_axes=[0, 1, 2, 3])
             zth = KTH.eval(zth)
             ztf = KTF.eval(ztf)
+            zc = KC.function([xc], [zc])([x_val])[0]
             assert zth.shape == ztf.shape
+            assert zth.shape == zc.shape
 
     def test_ctc(self):
         # simplified version of TensorFlow's test