diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 911dd84e2f05..460584bc8b78 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -457,15 +457,15 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")
 
-    max_threads = int(math.sqrt(
-        tvm.target.current_target(allow_none=False).max_num_threads))
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = num_anchors // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
-    k = bx * max_threads + tx
+    j = bx * max_threads + tx
 
     iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
     top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
@@ -480,22 +480,22 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             nkeep = if_then_else( \
                     tvm.all(top_k > 0, top_k < valid_count[i]),
                     top_k, valid_count[i])
-            with ib.for_range(0, nkeep) as j:
-                with ib.if_scope(k < box_data_length):
+            with ib.if_scope(j < nkeep):
+                with ib.for_range(0, box_data_length) as k:
                     out[(base_idx + j * box_data_length + k)] = \
                     data[(base_idx + sorted_index[i * num_anchors + j] \
                     * box_data_length + k)]
                 box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
             with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
-                with ib.for_range(0, valid_count[i] - nkeep) as j:
-                    with ib.if_scope(k < box_data_length):
+                with ib.if_scope(j < valid_count[i] - nkeep):
+                    with ib.for_range(0, box_data_length) as k:
                         out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                     box_indices[i * num_anchors + (j + nkeep)] = -1
             # Apply nms
-            with ib.for_range(0, valid_count[i]) as j:
+            with ib.if_scope(j < valid_count[i]):
                 offset_j = j * box_data_length
                 with ib.if_scope(out[base_idx + offset_j] >= 0):
-                    with ib.if_scope(k < valid_count[i]):
+                    with ib.for_range(0, valid_count[i]) as k:
                         offset_k = k * box_data_length
                         with ib.if_scope(tvm.all(k > j, out[base_idx + offset_k] >= 0, \
 						 tvm.any(force_suppress > 0, id_index < 0, \
@@ -506,35 +506,29 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                             with ib.if_scope(iou >= iou_threshold):
                                 out[base_idx + offset_k] = -1.0
                                 box_indices[i * num_anchors + k] = -1
-                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
-                                      tvm.convert(['shared']),
-                                      tvm.expr.Call.Intrinsic, None, 0))
         with ib.else_scope():
-            with ib.for_range(0, valid_count[i]) as j:
+            with ib.if_scope(j < valid_count[i]):
                 offset_j = j * box_data_length
-                with ib.if_scope(k < box_data_length):
+                with ib.for_range(0, box_data_length) as k:
                     out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                 box_indices[i * num_anchors + j] = j
         # Set invalid entry to be -1
-        with ib.for_range(0, num_anchors - valid_count[i]) as j:
-            with ib.if_scope(k < box_data_length):
+        with ib.if_scope(j < num_anchors - valid_count[i]):
+            with ib.for_range(0, box_data_length) as k:
                 out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
             box_indices[i * num_anchors + j + valid_count[i]] = -1
         # Only return max_output_size number of valid boxes
         num_valid_boxes[0] = 0
         with ib.if_scope(max_output_size > 0):
-            with ib.for_range(0, valid_count[i]) as j:
+            with ib.if_scope(j < valid_count[i]):
                 offset_j = j * box_data_length
                 with ib.if_scope(out[base_idx + offset_j] >= 0):
                     with ib.if_scope(num_valid_boxes[0] == max_output_size):
-                        with ib.if_scope(k < box_data_length):
+                        with ib.for_range(0, box_data_length) as k:
                             out[base_idx + offset_j + k] = -1.0
                         box_indices[i * num_anchors + j] = -1
                     with ib.else_scope():
                         num_valid_boxes[0] += 1
-                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
-                                      tvm.convert(['shared']),
-                                      tvm.expr.Call.Intrinsic, None, 0))
 
     return ib.get()