diff --git a/cub/agent/agent_merge_sort.cuh b/cub/agent/agent_merge_sort.cuh
new file mode 100644
index 0000000000..037dd6f70c
--- /dev/null
+++ b/cub/agent/agent_merge_sort.cuh
@@ -0,0 +1,733 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_merge_sort.cuh"
+
+#include <thrust/system/cuda/detail/core/util.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <
+  int                      _BLOCK_THREADS,
+  int                      _ITEMS_PER_THREAD = 1,
+  cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+  cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+  cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+struct AgentMergeSortPolicy
+{
+  static constexpr int BLOCK_THREADS    = _BLOCK_THREADS;
+  static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
+  static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+  static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM   = _LOAD_ALGORITHM;
+  static constexpr cub::CacheLoadModifier LOAD_MODIFIER     = _LOAD_MODIFIER;
+  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+};
+
+template <class Policy,
+          class It,
+          class T = typename std::iterator_traits<It>::value_type>
+struct BlockLoadType
+{
+  using type = cub::BlockLoad<T,
+                              Policy::BLOCK_THREADS,
+                              Policy::ITEMS_PER_THREAD,
+                              Policy::LOAD_ALGORITHM>;
+};
+
+template <class Policy,
+          class It,
+          class T = typename std::iterator_traits<It>::value_type>
+struct BlockStoreType
+{
+  using type = cub::BlockStore<T,
+                               Policy::BLOCK_THREADS,
+                               Policy::ITEMS_PER_THREAD,
+                               Policy::STORE_ALGORITHM>;
+};
+
+template <
+  typename Policy,
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename KeyT,
+  typename ValueT>
+struct AgentBlockSort
+{
+  //---------------------------------------------------------------------
+  // Types and constants
+  //---------------------------------------------------------------------
+
+  static constexpr bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+  /// The input value type
+  using BlockMergeSortT =
+    BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;
+
+  using KeysLoadIt  = typename thrust::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
+  using ItemsLoadIt = typename thrust::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
+
+  using BlockLoadKeys  = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
+  using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
+
+  using BlockStoreKeysIt   = typename cub::BlockStoreType<Policy, KeyIteratorT>::type;
+  using BlockStoreItemsIt  = typename cub::BlockStoreType<Policy, ValueIteratorT>::type;
+  using BlockStoreKeysRaw  = typename cub::BlockStoreType<Policy, KeyT *>::type;
+  using BlockStoreItemsRaw = typename cub::BlockStoreType<Policy, ValueT *>::type;
+
+  union _TempStorage
+  {
+    typename BlockLoadKeys::TempStorage load_keys;
+    typename BlockLoadItems::TempStorage load_items;
+    typename BlockStoreKeysIt::TempStorage store_keys_it;
+    typename BlockStoreItemsIt::TempStorage store_items_it;
+    typename BlockStoreKeysRaw::TempStorage store_keys_raw;
+    typename BlockStoreItemsRaw::TempStorage store_items_raw;
+    typename BlockMergeSortT::TempStorage block_merge;
+  };
+
+  /// Alias wrapper allowing storage to be unioned
+  struct TempStorage : Uninitialized<_TempStorage> {};
+
+  static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
+  static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
+  static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
+  static constexpr int SHARED_MEMORY_SIZE = sizeof(TempStorage);
+
+  //---------------------------------------------------------------------
+  // Per thread data
+  //---------------------------------------------------------------------
+
+  bool ping;
+  _TempStorage &storage;
+  KeysLoadIt keys_in;
+  ItemsLoadIt items_in;
+  OffsetT keys_count;
+  KeyIteratorT keys_out_it;
+  ValueIteratorT items_out_it;
+  KeyT *keys_out_raw;
+  ValueT *items_out_raw;
+  CompareOpT compare_op;
+
+  __device__ __forceinline__ AgentBlockSort(bool ping_,
+                                            TempStorage &storage_,
+                                            KeysLoadIt keys_in_,
+                                            ItemsLoadIt items_in_,
+                                            OffsetT keys_count_,
+                                            KeyIteratorT keys_out_it_,
+                                            ValueIteratorT items_out_it_,
+                                            KeyT *keys_out_raw_,
+                                            ValueT *items_out_raw_,
+                                            CompareOpT compare_op_)
+      : ping(ping_)
+      , storage(storage_.Alias())
+      , keys_in(keys_in_)
+      , items_in(items_in_)
+      , keys_count(keys_count_)
+      , keys_out_it(keys_out_it_)
+      , items_out_it(items_out_it_)
+      , keys_out_raw(keys_out_raw_)
+      , items_out_raw(items_out_raw_)
+      , compare_op(compare_op_)
+  {
+  }
+
+  __device__ __forceinline__ void Process()
+  {
+    int tid = threadIdx.x;
+    OffsetT tile_idx = blockIdx.x;
+    OffsetT num_tiles = gridDim.x;
+    OffsetT tile_base = tile_idx * ITEMS_PER_TILE;
+    int items_in_tile = cub::min<int>(keys_count - tile_base,
+                                      int(ITEMS_PER_TILE));
+
+    if (tile_idx < num_tiles - 1)
+    {
+      consume_tile<false>(tid, tile_base, ITEMS_PER_TILE);
+    }
+    else
+    {
+      consume_tile<true>(tid, tile_base, items_in_tile);
+    }
+  }
+
+  template <bool IS_LAST_TILE>
+  __device__ __forceinline__ void consume_tile(int tid,
+                                               OffsetT tile_base,
+                                               int num_remaining)
+  {
+    ValueT items_loc[ITEMS_PER_THREAD];
+    if (!KEYS_ONLY)
+    {
+      BlockLoadItems(storage.load_items)
+        .Load(items_in + tile_base,
+              items_loc,
+              num_remaining,
+              *(items_in + tile_base));
+
+      CTA_SYNC();
+    }
+
+    KeyT keys_loc[ITEMS_PER_THREAD];
+    if (IS_LAST_TILE)
+    {
+      BlockLoadKeys(storage.load_keys)
+        .Load(keys_in + tile_base,
+              keys_loc,
+              num_remaining,
+              *(keys_in + tile_base));
+    }
+    else
+    {
+      BlockLoadKeys(storage.load_keys)
+        .Load(keys_in + tile_base, keys_loc);
+    }
+
+    if (IS_LAST_TILE)
+    {
+      // if last tile, find valid max_key
+      // and fill the remainig keys with it
+      //
+      KeyT max_key = keys_loc[0];
+#pragma unroll
+      for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
+        {
+          max_key = compare_op(max_key, keys_loc[ITEM])
+                  ? keys_loc[ITEM]
+                  : max_key;
+        }
+        else
+        {
+          keys_loc[ITEM] = max_key;
+        }
+      }
+    }
+
+    CTA_SYNC();
+
+    if (IS_LAST_TILE)
+    {
+      BlockMergeSortT(storage.block_merge).Sort(num_remaining, keys_loc, items_loc, compare_op);
+    }
+    else
+    {
+      BlockMergeSortT(storage.block_merge).Sort(keys_loc, items_loc, compare_op);
+    }
+
+    CTA_SYNC();
+
+    if (ping)
+    {
+      if (IS_LAST_TILE)
+      {
+        BlockStoreKeysIt(storage.store_keys_it)
+          .Store(keys_out_it + tile_base, keys_loc, num_remaining);
+      }
+      else
+      {
+        BlockStoreKeysIt(storage.store_keys_it)
+          .Store(keys_out_it + tile_base, keys_loc);
+      }
+
+      if (!KEYS_ONLY)
+      {
+        CTA_SYNC();
+
+        BlockStoreItemsIt(storage.store_items_it)
+          .Store(items_out_it + tile_base, items_loc, num_remaining);
+      }
+    }
+    else
+    {
+      if (IS_LAST_TILE)
+      {
+        BlockStoreKeysRaw(storage.store_keys_raw)
+          .Store(keys_out_raw + tile_base, keys_loc, num_remaining);
+      }
+      else
+      {
+        BlockStoreKeysRaw(storage.store_keys_raw)
+          .Store(keys_out_raw + tile_base, keys_loc);
+      }
+
+      if (!KEYS_ONLY)
+      {
+        CTA_SYNC();
+
+        BlockStoreItemsRaw(storage.store_items_raw)
+          .Store(items_out_raw + tile_base, items_loc, num_remaining);
+      }
+    }
+  }
+};
+
+template <
+  typename KeyIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename KeyT>
+struct AgentPartition
+{
+  bool ping;
+  KeyIteratorT keys_ping;
+  KeyT *keys_pong;
+  OffsetT keys_count;
+  OffsetT partition_idx;
+  OffsetT *merge_partitions;
+  CompareOpT compare_op;
+  OffsetT coop;
+  int items_per_tile;
+
+  __device__ __forceinline__ AgentPartition(bool ping,
+                                            KeyIteratorT keys_ping,
+                                            KeyT *keys_pong,
+                                            OffsetT keys_count,
+                                            OffsetT partition_idx,
+                                            OffsetT *merge_partitions,
+                                            CompareOpT compare_op,
+                                            OffsetT coop,
+                                            int items_per_tile)
+      : ping(ping)
+      , keys_ping(keys_ping)
+      , keys_pong(keys_pong)
+      , keys_count(keys_count)
+      , partition_idx(partition_idx)
+      , merge_partitions(merge_partitions)
+      , compare_op(compare_op)
+      , coop(coop)
+      , items_per_tile(items_per_tile)
+  {}
+
+  __device__ __forceinline__ void Process()
+  {
+    OffsetT list  = ~(coop - 1) & partition_idx;
+    OffsetT start = items_per_tile * list;
+    OffsetT size  = items_per_tile * (coop >> 1);
+
+    OffsetT keys1_beg = cub::min<OffsetT>(keys_count, start);
+    OffsetT keys1_end = cub::min<OffsetT>(keys_count, start + size);
+    OffsetT keys2_beg = keys1_end;
+    OffsetT keys2_end = cub::min<OffsetT>(keys_count, keys2_beg + size);
+
+    OffsetT partition_at =
+      cub::min<OffsetT>(keys2_end - keys1_beg,
+                        items_per_tile * ((coop - 1) & partition_idx));
+
+    OffsetT partition_diag = ping ? MergePath<KeyT>(keys_ping + keys1_beg,
+                                                    keys_ping + keys2_beg,
+                                                    keys1_end - keys1_beg,
+                                                    keys2_end - keys2_beg,
+                                                    partition_at,
+                                                    compare_op)
+                                  : MergePath<KeyT>(keys_pong + keys1_beg,
+                                                    keys_pong + keys2_beg,
+                                                    keys1_end - keys1_beg,
+                                                    keys2_end - keys2_beg,
+                                                    partition_at,
+                                                    compare_op);
+
+    merge_partitions[partition_idx] = keys1_beg + partition_diag;
+  }
+};
+
+template <
+  typename Policy,
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename KeyT,
+  typename ValueT>
+struct AgentMerge
+{
+
+  //---------------------------------------------------------------------
+  // Types and constants
+  //---------------------------------------------------------------------
+  using KeysLoadPingIt  = typename thrust::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
+  using ItemsLoadPingIt = typename thrust::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
+  using KeysLoadPongIt  = typename thrust::cuda_cub::core::LoadIterator<Policy, KeyT *>::type;
+  using ItemsLoadPongIt = typename thrust::cuda_cub::core::LoadIterator<Policy, ValueT *>::type;
+
+  using KeysOutputPongIt  = KeyIteratorT;
+  using ItemsOutputPongIt = ValueIteratorT;
+  using KeysOutputPingIt  = KeyT*;
+  using ItemsOutputPingIt = ValueT*;
+
+  using BlockStoreKeysPong  = typename BlockStoreType<Policy, KeysOutputPongIt>::type;
+  using BlockStoreItemsPong = typename BlockStoreType<Policy, ItemsOutputPongIt>::type;
+  using BlockStoreKeysPing  = typename BlockStoreType<Policy, KeysOutputPingIt>::type;
+  using BlockStoreItemsPing = typename BlockStoreType<Policy, ItemsOutputPingIt>::type;
+
+  /// Parameterized BlockReduce primitive
+
+  union _TempStorage
+  {
+    typename BlockStoreKeysPing::TempStorage  store_keys_ping;
+    typename BlockStoreItemsPing::TempStorage store_items_ping;
+    typename BlockStoreKeysPong::TempStorage  store_keys_pong;
+    typename BlockStoreItemsPong::TempStorage store_items_pong;
+
+    KeyT keys_shared[Policy::ITEMS_PER_TILE + 1];
+    ValueT items_shared[Policy::ITEMS_PER_TILE + 1];
+  };
+
+  /// Alias wrapper allowing storage to be unioned
+  struct TempStorage : Uninitialized<_TempStorage> {};
+
+  static constexpr bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+  static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
+  static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
+  static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
+  static constexpr int SHARED_MEMORY_SIZE = sizeof(TempStorage);
+
+  //---------------------------------------------------------------------
+  // Per thread data
+  //---------------------------------------------------------------------
+
+  bool            ping;
+  _TempStorage&   storage;
+
+  KeysLoadPingIt  keys_in_ping;
+  ItemsLoadPingIt items_in_ping;
+  KeysLoadPongIt  keys_in_pong;
+  ItemsLoadPongIt items_in_pong;
+
+  OffsetT keys_count;
+
+  KeysOutputPongIt  keys_out_pong;
+  ItemsOutputPongIt items_out_pong;
+  KeysOutputPingIt  keys_out_ping;
+  ItemsOutputPingIt items_out_ping;
+
+  CompareOpT compare_op;
+  OffsetT *merge_partitions;
+  OffsetT coop;
+
+  //---------------------------------------------------------------------
+  // Utility functions
+  //---------------------------------------------------------------------
+
+  template <bool IS_FULL_TILE, class T, class It1, class It2>
+  __device__ __forceinline__ void
+  gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+              It1 input1,
+              It2 input2,
+              int count1,
+              int count2)
+  {
+    if (IS_FULL_TILE)
+    {
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+        output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
+      }
+    }
+    else
+    {
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+        if (idx < count1 + count2)
+        {
+          output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
+        }
+      }
+    }
+  }
+
+  template <class T, class It>
+  __device__ __forceinline__ void
+  reg_to_shared(It output,
+                T (&input)[ITEMS_PER_THREAD])
+  {
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+      output[idx] = input[ITEM];
+    }
+  }
+
+  template <bool IS_FULL_TILE>
+  __device__ __forceinline__ void
+  consume_tile(int tid, OffsetT tile_idx, OffsetT tile_base, int count)
+  {
+    OffsetT partition_beg = merge_partitions[tile_idx + 0];
+    OffsetT partition_end = merge_partitions[tile_idx + 1];
+
+    OffsetT list  = ~(coop - 1) & tile_idx;
+    OffsetT start = ITEMS_PER_TILE * list;
+    OffsetT size  = ITEMS_PER_TILE * (coop >> 1);
+
+    OffsetT diag = ITEMS_PER_TILE * tile_idx - start;
+
+    OffsetT keys1_beg = partition_beg;
+    OffsetT keys1_end = partition_end;
+    OffsetT keys2_beg = cub::min<OffsetT>(keys_count, 2 * start + size + diag - partition_beg);
+    OffsetT keys2_end = cub::min<OffsetT>(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end);
+
+    if (coop - 1 == ((coop - 1) & tile_idx))
+    {
+      keys1_end = cub::min<OffsetT>(keys_count, start + size);
+      keys2_end = cub::min<OffsetT>(keys_count, start + size * 2);
+    }
+
+    // number of keys per tile
+    //
+    int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+    int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+    // load keys1 & keys2
+    KeyT keys_loc[ITEMS_PER_THREAD];
+    if (ping)
+    {
+      gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                keys_in_ping + keys1_beg,
+                                keys_in_ping + keys2_beg,
+                                num_keys1,
+                                num_keys2);
+    }
+    else
+    {
+      gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                keys_in_pong + keys1_beg,
+                                keys_in_pong + keys2_beg,
+                                num_keys1,
+                                num_keys2);
+    }
+    reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+    // preload items into registers already
+    //
+    ValueT items_loc[ITEMS_PER_THREAD];
+    if (!KEYS_ONLY)
+    {
+      if (ping)
+      {
+        gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                  items_in_ping + keys1_beg,
+                                  items_in_ping + keys2_beg,
+                                  num_keys1,
+                                  num_keys2);
+      }
+      else
+      {
+        gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                  items_in_pong + keys1_beg,
+                                  items_in_pong + keys2_beg,
+                                  num_keys1,
+                                  num_keys2);
+      }
+    }
+
+    CTA_SYNC();
+
+    // use binary search in shared memory
+    // to find merge path for each of thread
+    // we can use int type here, because the number of
+    // items in shared memory is limited
+    //
+    int diag0_loc = cub::min<OffsetT>(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid);
+
+    int keys1_beg_loc = MergePath<KeyT>(&storage.keys_shared[0],
+                                        &storage.keys_shared[num_keys1],
+                                        num_keys1,
+                                        num_keys2,
+                                        diag0_loc,
+                                        compare_op);
+    int keys1_end_loc = num_keys1;
+    int keys2_beg_loc = diag0_loc - keys1_beg_loc;
+    int keys2_end_loc = num_keys2;
+
+    int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+    int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+    // perform serial merge
+    //
+    int indices[ITEMS_PER_THREAD];
+
+    SerialMerge(&storage.keys_shared[0],
+                keys1_beg_loc,
+                keys2_beg_loc + num_keys1,
+                num_keys1_loc,
+                num_keys2_loc,
+                keys_loc,
+                indices,
+                compare_op);
+
+    CTA_SYNC();
+
+    // write keys
+    //
+    if (ping)
+    {
+      if (IS_FULL_TILE)
+      {
+        BlockStoreKeysPing(storage.store_keys_ping)
+          .Store(keys_out_ping + tile_base, keys_loc);
+      }
+      else
+      {
+        BlockStoreKeysPing(storage.store_keys_ping)
+          .Store(keys_out_ping + tile_base, keys_loc, num_keys1 + num_keys2);
+      }
+    }
+    else
+    {
+      if (IS_FULL_TILE)
+      {
+        BlockStoreKeysPong(storage.store_keys_pong)
+          .Store(keys_out_pong + tile_base, keys_loc);
+      }
+      else
+      {
+        BlockStoreKeysPong(storage.store_keys_pong)
+          .Store(keys_out_pong + tile_base, keys_loc, num_keys1 + num_keys2);
+      }
+    }
+
+    // if items are provided, merge them
+    if (!KEYS_ONLY)
+    {
+      CTA_SYNC();
+
+      reg_to_shared(&storage.items_shared[0], items_loc);
+
+      CTA_SYNC();
+
+      // gather items from shared mem
+      //
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+      }
+
+      CTA_SYNC();
+
+      // write from reg to gmem
+      //
+      if (ping)
+      {
+        if (IS_FULL_TILE)
+        {
+          BlockStoreItemsPing(storage.store_items_ping)
+            .Store(items_out_ping + tile_base, items_loc);
+        }
+        else
+        {
+          BlockStoreItemsPing(storage.store_items_ping)
+            .Store(items_out_ping + tile_base, items_loc, count);
+        }
+      }
+      else
+      {
+        if (IS_FULL_TILE)
+        {
+          BlockStoreItemsPong(storage.store_items_pong)
+            .Store(items_out_pong + tile_base, items_loc);
+        }
+        else
+        {
+          BlockStoreItemsPong(storage.store_items_pong)
+            .Store(items_out_pong + tile_base, items_loc, count);
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ AgentMerge(bool ping_,
+                                        TempStorage &storage_,
+                                        KeysLoadPingIt keys_in_ping_,
+                                        ItemsLoadPingIt items_in_ping_,
+                                        KeysLoadPongIt keys_in_pong_,
+                                        ItemsLoadPongIt items_in_pong_,
+                                        OffsetT keys_count_,
+                                        KeysOutputPingIt keys_out_ping_,
+                                        ItemsOutputPingIt items_out_ping_,
+                                        KeysOutputPongIt keys_out_pong_,
+                                        ItemsOutputPongIt items_out_pong_,
+                                        CompareOpT compare_op_,
+                                        OffsetT *merge_partitions_,
+                                        OffsetT coop_)
+      : ping(ping_)
+      , storage(storage_.Alias())
+      , keys_in_ping(keys_in_ping_)
+      , items_in_ping(items_in_ping_)
+      , keys_in_pong(keys_in_pong_)
+      , items_in_pong(items_in_pong_)
+      , keys_count(keys_count_)
+      , keys_out_pong(keys_out_pong_)
+      , items_out_pong(items_out_pong_)
+      , keys_out_ping(keys_out_ping_)
+      , items_out_ping(items_out_ping_)
+      , compare_op(compare_op_)
+      , merge_partitions(merge_partitions_)
+      , coop(coop_)
+  {}
+
+  __device__ __forceinline__ void Process()
+  {
+    // XXX with 8.5 chaging type to Size (or long long) results in error!
+    int tile_idx      = blockIdx.x;
+    OffsetT num_tiles = gridDim.x;
+    OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE;
+    int tid           = threadIdx.x;
+    int items_in_tile = static_cast<int>(
+      cub::min<OffsetT>((OffsetT)ITEMS_PER_TILE, keys_count - tile_base));
+
+    if (tile_idx < num_tiles - 1)
+    {
+      consume_tile<true>(tid, tile_idx, tile_base, ITEMS_PER_TILE);
+    }
+    else
+    {
+      consume_tile<false>(tid, tile_idx, tile_base, items_in_tile);
+    }
+  }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh
new file mode 100644
index 0000000000..8fd88e7699
--- /dev/null
+++ b/cub/block/block_merge_sort.cuh
@@ -0,0 +1,468 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <typename KeyT,
+          typename KeyIteratorT,
+          typename OffsetT,
+          typename BinaryPred>
+__device__ __forceinline__ OffsetT MergePath(KeyIteratorT keys1,
+                                             KeyIteratorT keys2,
+                                             OffsetT keys1_count,
+                                             OffsetT keys2_count,
+                                             OffsetT diag,
+                                             BinaryPred binary_pred)
+{
+  OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count;
+  OffsetT keys1_end   = cub::min<OffsetT>(diag, keys1_count);
+
+  while (keys1_begin < keys1_end)
+  {
+    OffsetT mid = (keys1_begin + keys1_end) >> 1;
+    KeyT key1   = keys1[mid];
+    KeyT key2   = keys2[diag - 1 - mid];
+    bool pred   = binary_pred(key2, key1);
+
+    if (pred)
+    {
+      keys1_end = mid;
+    }
+    else
+    {
+      keys1_begin = mid + 1;
+    }
+  }
+  return keys1_begin;
+}
+
+template <typename KeyT, typename CompareOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
+                                            int keys1_beg,
+                                            int keys2_beg,
+                                            int keys1_count,
+                                            int keys2_count,
+                                            KeyT (&output)[ITEMS_PER_THREAD],
+                                            int (&indices)[ITEMS_PER_THREAD],
+                                            CompareOp compare_op)
+{
+  int keys1_end = keys1_beg + keys1_count;
+  int keys2_end = keys2_beg + keys2_count;
+
+  KeyT key1 = keys_shared[keys1_beg];
+  KeyT key2 = keys_shared[keys2_beg];
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    bool p = (keys2_beg < keys2_end) &&
+             ((keys1_beg >= keys1_end)
+              || compare_op(key2, key1));
+
+    output[ITEM]  = p ? key2 : key1;
+    indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
+
+    if (p)
+    {
+      key2 = keys_shared[keys2_beg];
+    }
+    else
+    {
+      key1 = keys_shared[keys1_beg];
+    }
+  }
+}
+
+/**
+ * \brief The BlockMergeSort class provides methods for sorting items partitioned across a CUDA thread block using a merge sorting method.
+ * \ingroup BlockModule
+ *
+ * \tparam KeyT                 KeyT type
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ *
+ * \par Overview
+ * - The merge sorting method arranges items into ascending order.
+ *   Prefer using cub::BlockRadixSort for arithmetic types sorting if you
+ *   don't have to use custom comparators.
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockMergeSort}
+ * \par
+ * The code snippet below illustrates a sort of 512 integer keys that are
+ * partitioned across 128 threads * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>  // or equivalently <cub/block/block_merge_sort.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockMergeSort<int, 128, 4> BlockMergeSort;
+ *
+ *     // Allocate shared memory for BlockMergeSort
+ *     __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_keys[4];
+ *     ...
+ *
+ *     BlockMergeSort(temp_storage_shuffle).Sort(thread_data, CustomLess());
+ *     ...
+ * }
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_keys across the block of threads is
+ * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+ * The corresponding output \p thread_keys in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+  typename  KeyT,
+  int       BLOCK_DIM_X,
+  int       ITEMS_PER_THREAD,
+  typename  ValueT            = NullType,
+  int       BLOCK_DIM_Y       = 1,
+  int       BLOCK_DIM_Z       = 1>
+class BlockMergeSort
+{
+private:
+
+  // The thread block size in threads
+  static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+  static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * BLOCK_THREADS;
+
+  // Whether or not there are values to be trucked along with keys
+  static constexpr bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+  /// Shared memory type required by this thread block
+  union _TempStorage
+  {
+    KeyT keys_shared[ITEMS_PER_TILE + 1];
+    ValueT items_shared[ITEMS_PER_TILE + 1];
+  }; // union TempStorage
+
+  /// Internal storage allocator
+  __device__ __forceinline__ _TempStorage& PrivateStorage()
+  {
+    __shared__ _TempStorage private_storage;
+    return private_storage;
+  }
+
+  /// Shared storage reference
+  _TempStorage &temp_storage;
+
+  /// Linear thread-id
+  unsigned int linear_tid;
+
+public:
+
+  /// \smemstorage{BlockMergeSort}
+  struct TempStorage : Uninitialized<_TempStorage> {};
+
+  __device__ __forceinline__ BlockMergeSort()
+      : temp_storage(PrivateStorage())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {}
+
+  __device__ __forceinline__ BlockMergeSort(TempStorage &temp_storage)
+      : temp_storage(temp_storage.Alias())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {}
+
+private:
+
+  template <typename T>
+  __device__ __forceinline__ void Swap(T &lhs, T &rhs)
+  {
+    T temp = lhs;
+    lhs    = rhs;
+    rhs    = temp;
+  }
+
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
+                    ValueT (&items)[ITEMS_PER_THREAD],
+                    CompareOp compare_op)
+  {
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+#pragma unroll
+      for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
+      {
+        if (compare_op(keys[j + 1], keys[j]))
+        {
+          Swap(keys[j], keys[j + 1]);
+          if (!KEYS_ONLY)
+          {
+            Swap(items[j], items[j + 1]);
+          }
+        }
+      } // inner loop
+    }   // outer loop
+  }
+
+public:
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - Sort is not guaranteed to be stable. That is, suppose that i and j are
+   *   equivalent: neither one is less than the other. It is not guaranteed
+   *   that the relative order of these two elements will be preserved by sort.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  Sort(KeyT (&keys_loc)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort
+       CompareOp compare_op)               ///< [in] Comparison function object which returns
+                                           ///< true if the first argument is ordered before
+                                           ///< the second
+  {
+    ValueT items_loc[ITEMS_PER_THREAD];
+    Sort<CompareOp, false>(ITEMS_PER_TILE, keys_loc, items_loc, compare_op);
+  }
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - Sort is not guaranteed to be stable. That is, suppose that i and j are
+   *   equivalent: neither one is less than the other. It is not guaranteed
+   *   that the relative order of these two elements will be preserved by sort.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  Sort(KeyT (&keys_loc)[ITEMS_PER_THREAD],       ///< [in-out] Keys to sort
+       ValueT (&items_loc)[ITEMS_PER_THREAD],    ///< [in-out] Values to sort
+       CompareOp compare_op)                     ///< [in] Comparison function object which returns
+                                                 ///< true if the first argument is ordered before
+                                                 ///< the second
+  {
+    Sort<CompareOp, false>(ITEMS_PER_TILE, keys_loc, items_loc, compare_op);
+  }
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - Sort is not guaranteed to be stable. That is, suppose that i and j are
+   *   equivalent: neither one is less than the other. It is not guaranteed
+   *   that the relative order of these two elements will be preserved by sort.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   * \tparam IS_LAST_TILE True if valid_items isn't equal to the ITEMS_PER_TILE
+   */
+  template <typename CompareOp,
+            bool IS_LAST_TILE = true>
+  __device__ __forceinline__ void
+  Sort(int valid_items,                        ///< [in] Number of valid items to sort
+       KeyT (&keys_loc)[ITEMS_PER_THREAD],     ///< [in-out] Keys to sort
+       ValueT (&items_loc)[ITEMS_PER_THREAD],  ///< [in-out] Values to sort
+       CompareOp compare_op)                   ///< [in] Comparison function object which returns
+                                               ///< true if the first argument is ordered before
+                                               ///< the second
+  {
+    // if first element of thread is in input range, stable sort items
+    //
+    if (!IS_LAST_TILE || ITEMS_PER_THREAD * linear_tid < valid_items)
+    {
+      StableOddEvenSort(keys_loc, items_loc, compare_op);
+    }
+
+    // each thread has  sorted keys_loc
+    // merge sort keys_loc in shared memory
+    //
+#pragma unroll
+    for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
+    {
+      CTA_SYNC();
+
+      // store keys in shmem
+      //
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        int idx                       = ITEMS_PER_THREAD * linear_tid + ITEM;
+        temp_storage.keys_shared[idx] = keys_loc[ITEM];
+      }
+
+      CTA_SYNC();
+
+      int indices[ITEMS_PER_THREAD];
+
+      int list  = ~(coop - 1) & linear_tid;
+      int start = ITEMS_PER_THREAD * list;
+      int size  = ITEMS_PER_THREAD * (coop >> 1);
+
+      int diag = cub::min<int>(valid_items,
+                               ITEMS_PER_THREAD * ((coop - 1) & linear_tid));
+
+      int keys1_beg = cub::min<int>(valid_items, start);
+      int keys1_end = cub::min<int>(valid_items, keys1_beg + size);
+      int keys2_beg = keys1_end;
+      int keys2_end = cub::min<int>(valid_items, keys2_beg + size);
+
+      int keys1_count = keys1_end - keys1_beg;
+      int keys2_count = keys2_end - keys2_beg;
+
+      int partition_diag = MergePath<KeyT>(&temp_storage.keys_shared[keys1_beg],
+                                           &temp_storage.keys_shared[keys2_beg],
+                                           keys1_count,
+                                           keys2_count,
+                                           diag,
+                                           compare_op);
+
+      int keys1_beg_loc   = keys1_beg + partition_diag;
+      int keys1_end_loc   = keys1_end;
+      int keys2_beg_loc   = keys2_beg + diag - partition_diag;
+      int keys2_end_loc   = keys2_end;
+      int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
+      int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
+      SerialMerge(&temp_storage.keys_shared[0],
+                  keys1_beg_loc,
+                  keys2_beg_loc,
+                  keys1_count_loc,
+                  keys2_count_loc,
+                  keys_loc,
+                  indices,
+                  compare_op);
+
+      if (!KEYS_ONLY)
+      {
+        CTA_SYNC();
+
+        // store keys in shmem
+        //
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = ITEMS_PER_THREAD * linear_tid + ITEM;
+          temp_storage.items_shared[idx] = items_loc[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // gather items from shmem
+        //
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          items_loc[ITEM] = temp_storage.items_shared[indices[ITEM]];
+        }
+      }
+    }
+  } // func block_merge_sort
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - StableSort is stable: it preserves the relative ordering of equivalent
+   *   elements. That is, if x and y are elements such that x precedes y,
+   *   and if the two elements are equivalent (neither x < y nor y < x) then
+   *   a postcondition of stable_sort is that x still precedes y.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  StableSort(KeyT (&keys_loc)[ITEMS_PER_THREAD],   ///< [in-out] Keys to sort
+             CompareOp compare_op)                 ///< [in] Comparison function object which
+                                                   ///< returns true if the first argument is
+                                                   ///< ordered before the second
+  {
+    Sort(keys_loc, compare_op);
+  }
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - StableSort is stable: it preserves the relative ordering of equivalent
+   *   elements. That is, if x and y are elements such that x precedes y,
+   *   and if the two elements are equivalent (neither x < y nor y < x) then
+   *   a postcondition of stable_sort is that x still precedes y.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  StableSort(KeyT (&keys_loc)[ITEMS_PER_THREAD],    ///< [in-out] Keys to sort
+             ValueT (&items_loc)[ITEMS_PER_THREAD], ///< [in-out] Values to sort
+             CompareOp compare_op)                  ///< [in] Comparison function object which
+                                                    ///< returns true if the first argument is
+                                                    ///< ordered before the second
+  {
+    Sort(keys_loc, items_loc, compare_op);
+  }
+
+  /**
+   * \brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
+   *
+   * \par
+   * - StableSort is stable: it preserves the relative ordering of equivalent
+   *   elements. That is, if x and y are elements such that x precedes y,
+   *   and if the two elements are equivalent (neither x < y nor y < x) then
+   *   a postcondition of stable_sort is that x still precedes y.
+   *
+   * \tparam CompareOp functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename CompareOp>
+  __device__ __forceinline__ void
+  StableSort(int valid_items,                          ///< [in] Number of valid items to sort
+             KeyT (&keys_loc)[ITEMS_PER_THREAD],       ///< [in-out] Keys to sort
+             ValueT (&items_loc)[ITEMS_PER_THREAD],    ///< [in-out] Values to sort
+             CompareOp compare_op)                     ///< [in] Comparison function object which
+                                                       ///< returns true if the first argument is
+                                                       ///< ordered before the second
+  {
+    Sort(valid_items, keys_loc, items_loc, compare_op);
+  }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/cub/device/device_merge_sort.cuh b/cub/device/device_merge_sort.cuh
new file mode 100644
index 0000000000..240d37c72e
--- /dev/null
+++ b/cub/device/device_merge_sort.cuh
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "dispatch/dispatch_merge_sort.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief DeviceMergeSort provides device-wide, parallel operations for computing a merge sort across a sequence of data items residing within device-accessible memory.
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * - The merge sorting method arranges items into ascending order.
+ *   Prefer using cub::DeviceRadixSort for arithmetic types sorting if you
+ *   don't have to use custom comparators.
+ */
+struct DeviceMergeSort
+{
+
+  /**
+   * \brief Sorts items using a merge sorting method.
+   *
+   * \par
+   * - SortPairs is not guaranteed to be stable. That is, suppose that i and j are
+   *   equivalent: neither one is less than the other. It is not guaranteed
+   *   that the relative order of these two elements will be preserved by sort.
+   *
+   * \par Snippet
+   * The code snippet below illustrates the sorting of a device vector of \p int keys
+   * with associated vector of \p int values.
+   * \par
+   * \code
+   * #include <cub/cub.cuh>   // or equivalently <cub/device/device_merge_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for sorting data
+   * int  num_items;       // e.g., 7
+   * int  *d_keys;         // e.g., [8, 6, 6, 5, 3, 0, 9]
+   * int  *d_values;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * ...
+   *
+   * // Initialize comparator
+   * CustomOpT custom_op;
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceMergeSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, custom_op);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceMergeSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, custom_op);
+   *
+   * // d_keys      <-- [0, 3, 5, 6, 6, 8, 9]
+   * // d_values    <-- [5, 4, 3, 2, 1, 0, 6]
+   *
+   * \endcode
+   *
+   * \tparam CompareOpT functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename KeyIteratorT,
+            typename ValueIteratorT,
+            typename OffsetT,
+            typename CompareOpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+                                                    size_t &temp_storage_bytes,      ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+                                                    KeyIteratorT d_keys,             ///< [in,out] Pointer to the input sequence of unsorted input keys
+                                                    ValueIteratorT d_items,          ///< [in,out] Pointer to the input sequence of unsorted input values
+                                                    OffsetT num_items,               ///< [in] Number of items to sort
+                                                    CompareOpT compare_op,           ///< [in] Comparison function object which returns true if the first argument is ordered before the second
+                                                    cudaStream_t stream = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+                                                    bool debug_synchronous = false)  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+  {
+    using DispatchMergeSortT =
+      DispatchMergeSort<KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT>;
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        d_keys,
+                                        d_items,
+                                        num_items,
+                                        compare_op,
+                                        stream,
+                                        debug_synchronous);
+  }
+
+  /**
+   * \brief Sorts items using a merge sorting method.
+   *
+   * \par
+   * - SortKeys is not guaranteed to be stable. That is, suppose that i and j are
+   *   equivalent: neither one is less than the other. It is not guaranteed
+   *   that the relative order of these two elements will be preserved by sort.
+   *
+   * \par Snippet
+   * The code snippet below illustrates the sorting of a device vector of \p int keys.
+   * \par
+   * \code
+   * #include <cub/cub.cuh>   // or equivalently <cub/device/device_merge_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for sorting data
+   * int  num_items;       // e.g., 7
+   * int  *d_keys;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * ...
+   *
+   * // Initialize comparator
+   * CustomOpT custom_op;
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceMergeSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, custom_op);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceMergeSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, custom_op);
+   *
+   * // d_keys      <-- [0, 3, 5, 6, 7, 8, 9]
+   *
+   * \endcode
+   *
+   * \tparam CompareOpT functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename KeyIteratorT, typename OffsetT, typename CompareOpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+                                                   size_t &temp_storage_bytes,      ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+                                                   KeyIteratorT d_keys,             ///< [in,out] Pointer to the input sequence of unsorted input keys
+                                                   OffsetT num_items,               ///< [in] Number of items to sort
+                                                   CompareOpT compare_op,           ///< [in] Comparison function object which returns true if the first argument is ordered before the second
+                                                   cudaStream_t stream = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+                                                   bool debug_synchronous = false)  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+  {
+    using DispatchMergeSortT =
+      DispatchMergeSort<KeyIteratorT, NullType *, OffsetT, CompareOpT>;
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        d_keys,
+                                        (NullType *)NULL,
+                                        num_items,
+                                        compare_op,
+                                        stream,
+                                        debug_synchronous);
+  }
+
+  /**
+   * \brief Sorts items using a merge sorting method.
+   *
+   * \par
+   * - StableSortPairs is stable: it preserves the relative ordering of equivalent
+   *   elements. That is, if x and y are elements such that x precedes y,
+   *   and if the two elements are equivalent (neither x < y nor y < x) then
+   *   a postcondition of stable_sort is that x still precedes y.
+   *
+   * \par Snippet
+   * The code snippet below illustrates the sorting of a device vector of \p int keys
+   * with associated vector of \p int values.
+   * \par
+   * \code
+   * #include <cub/cub.cuh>   // or equivalently <cub/device/device_merge_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for sorting data
+   * int  num_items;       // e.g., 7
+   * int  *d_keys;         // e.g., [8, 6, 6, 5, 3, 0, 9]
+   * int  *d_values;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * ...
+   *
+   * // Initialize comparator
+   * CustomOpT custom_op;
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceMergeSort::StableSortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, custom_op);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceMergeSort::StableSortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, custom_op);
+   *
+   * // d_keys      <-- [0, 3, 5, 6, 6, 8, 9]
+   * // d_values    <-- [5, 4, 3, 1, 2, 0, 6]
+   *
+   * \endcode
+   *
+   * \tparam CompareOpT functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename KeyIteratorT,
+            typename ValueIteratorT,
+            typename OffsetT,
+            typename CompareOpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  StableSortPairs(void *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+                  size_t &temp_storage_bytes,      ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+                  KeyIteratorT d_keys,             ///< [in,out] Pointer to the input sequence of unsorted input keys
+                  ValueIteratorT d_items,          ///< [in,out] Pointer to the input sequence of unsorted input values
+                  OffsetT num_items,               ///< [in] Number of items to sort
+                  CompareOpT compare_op,           ///< [in] Comparison function object which returns true if the first argument is ordered before the second
+                  cudaStream_t stream = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+                  bool debug_synchronous = false)  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+  {
+    return SortPairs<KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys,
+      d_items,
+      num_items,
+      compare_op,
+      stream,
+      debug_synchronous);
+  }
+
+  /**
+   * \brief Sorts items using a merge sorting method.
+   *
+   * \par
+   * - StableSortKeys is stable: it preserves the relative ordering of equivalent
+   *   elements. That is, if x and y are elements such that x precedes y,
+   *   and if the two elements are equivalent (neither x < y nor y < x) then
+   *   a postcondition of stable_sort is that x still precedes y.
+   *
+   * \par Snippet
+   * The code snippet below illustrates the sorting of a device vector of \p int keys.
+   * \par
+   * \code
+   * #include <cub/cub.cuh>   // or equivalently <cub/device/device_merge_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for sorting data
+   * int  num_items;       // e.g., 7
+   * int  *d_keys;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * ...
+   *
+   * // Initialize comparator
+   * CustomOpT custom_op;
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceMergeSort::StableSortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, custom_op);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceMergeSort::StableSortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, custom_op);
+   *
+   * // d_keys      <-- [0, 3, 5, 6, 7, 8, 9]
+   *
+   * \endcode
+   *
+   * \tparam CompareOpT functor type having member <tt>bool operator()(KeyT lhs, KeyT rhs)</tt>
+   */
+  template <typename KeyIteratorT, typename OffsetT, typename CompareOpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  StableSortKeys(void *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+                 size_t &temp_storage_bytes,      ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+                 KeyIteratorT d_keys,             ///< [in,out] Pointer to the input sequence of unsorted input keys
+                 OffsetT num_items,               ///< [in] Number of items to sort
+                 CompareOpT compare_op,           ///< [in] Comparison function object which returns true if the first argument is ordered before the second
+                 cudaStream_t stream = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+                 bool debug_synchronous = false)  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+  {
+    return SortKeys<KeyIteratorT, OffsetT, CompareOpT>(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       d_keys,
+                                                       num_items,
+                                                       compare_op,
+                                                       stream,
+                                                       debug_synchronous);
+  }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh
new file mode 100644
index 0000000000..a2fbb02c19
--- /dev/null
+++ b/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -0,0 +1,706 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../../util_math.cuh"
+#include "../../util_device.cuh"
+#include "../../agent/agent_merge_sort.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+#include <thrust/detail/integer_math.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <bool UseVShmem,
+          typename ActivePolicyT,
+          typename KeyIteratorT,
+          typename ValueIteratorT,
+          typename OffsetT,
+          typename CompareOpT,
+          typename KeyT,
+          typename ValueT>
+void __global__ __launch_bounds__(ActivePolicyT::BLOCK_THREADS)
+DeviceMergeSortBlockSortKernel(bool ping,
+                               KeyIteratorT keys_inout,
+                               ValueIteratorT items_inout,
+                               OffsetT keys_count,
+                               KeyT *keys_out,
+                               ValueT *items_out,
+                               CompareOpT compare_op,
+                               char *vshmem)
+{
+  extern __shared__ char shmem[];
+
+  using AgentBlockSortT = AgentBlockSort<ActivePolicyT,
+                                         KeyIteratorT,
+                                         ValueIteratorT,
+                                         OffsetT,
+                                         CompareOpT,
+                                         KeyT,
+                                         ValueT>;
+
+  const OffsetT vshmem_offset = blockIdx.x *
+                                AgentBlockSortT::SHARED_MEMORY_SIZE;
+
+  typename AgentBlockSortT::TempStorage &storage =
+    *reinterpret_cast<typename AgentBlockSortT::TempStorage *>(
+      UseVShmem ? vshmem + vshmem_offset : shmem);
+
+  AgentBlockSortT agent(ping,
+                        storage,
+                        thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_inout),
+                        thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_inout),
+                        keys_count,
+                        keys_inout,
+                        items_inout,
+                        keys_out,
+                        items_out,
+                        compare_op);
+
+  agent.Process();
+}
+
+template <typename KeyIteratorT,
+          typename OffsetT,
+          typename CompareOpT,
+          typename KeyT>
+__global__ void DeviceMergeSortPartitionKernel(bool ping,
+                                               KeyIteratorT keys_ping,
+                                               KeyT *keys_pong,
+                                               OffsetT keys_count,
+                                               OffsetT num_partitions,
+                                               OffsetT *merge_partitions,
+                                               CompareOpT compare_op,
+                                               OffsetT coop,
+                                               int items_per_tile)
+{
+  OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (partition_idx < num_partitions)
+  {
+    AgentPartition<KeyIteratorT, OffsetT, CompareOpT, KeyT> agent(
+      ping,
+      keys_ping,
+      keys_pong,
+      keys_count,
+      partition_idx,
+      merge_partitions,
+      compare_op,
+      coop,
+      items_per_tile);
+
+    agent.Process();
+  }
+}
+
+template <
+  bool UseVShmem,
+  typename ActivePolicyT,
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename KeyT,
+  typename ValueT>
+void __global__ __launch_bounds__(ActivePolicyT::BLOCK_THREADS)
+DeviceMergeSortMergeKernel(bool ping,
+                           KeyIteratorT keys_ping,
+                           ValueIteratorT items_ping,
+                           OffsetT keys_count,
+                           KeyT *keys_pong,
+                           ValueT *items_pong,
+                           CompareOpT compare_op,
+                           OffsetT *merge_partitions,
+                           OffsetT coop,
+                           char *vshmem
+  )
+{
+  extern __shared__ char shmem[];
+
+  using AgentMergeT = AgentMerge<ActivePolicyT,
+                                 KeyIteratorT,
+                                 ValueIteratorT,
+                                 OffsetT,
+                                 CompareOpT,
+                                 KeyT,
+                                 ValueT>;
+
+  const OffsetT vshmem_offset = blockIdx.x * AgentMergeT::SHARED_MEMORY_SIZE;
+
+  typename AgentMergeT::TempStorage &storage =
+    *reinterpret_cast<typename AgentMergeT::TempStorage *>(
+      UseVShmem ? vshmem + vshmem_offset : shmem);
+
+  AgentMergeT agent(
+    ping,
+    storage,
+    thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
+    thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
+    thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
+    thrust::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
+    keys_count,
+    keys_pong,
+    items_pong,
+    keys_ping,
+    items_ping,
+    compare_op,
+    merge_partitions,
+    coop);
+
+  agent.Process();
+}
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <typename T>
+constexpr __device__ __host__ int
+get_items_per_thread(int nominal_4b_items_per_thread)
+{
+  return cub::min<int>(
+    nominal_4b_items_per_thread,
+    cub::max<int>(1, (nominal_4b_items_per_thread * 4 / sizeof(T))));
+}
+
+template <typename KeyIteratorT>
+struct DeviceMergeSortPolicy
+{
+  using KeyT = typename std::iterator_traits<KeyIteratorT>::value_type;
+
+  //------------------------------------------------------------------------------
+  // Architecture-specific tuning policies
+  //------------------------------------------------------------------------------
+
+  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
+  {
+    using MergeSortPolicy =
+      AgentMergeSortPolicy<128,
+                           get_items_per_thread<KeyT>(7),
+                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                           cub::LOAD_DEFAULT,
+                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  };
+
+  struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+  {
+    using MergeSortPolicy =
+      AgentMergeSortPolicy<256,
+                           get_items_per_thread<KeyT>(11),
+                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                           cub::LOAD_LDG,
+                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  };
+
+  struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
+  {
+    using MergeSortPolicy =
+      AgentMergeSortPolicy<512,
+                           get_items_per_thread<KeyT>(15),
+                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                           cub::LOAD_LDG,
+                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  };
+
+  struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
+  {
+    using MergeSortPolicy =
+      AgentMergeSortPolicy<256,
+                           get_items_per_thread<KeyT>(17),
+                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                           cub::LOAD_DEFAULT,
+                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  };
+
+
+  /// MaxPolicy
+  using MaxPolicy = Policy600;
+};
+
+template <
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename MergePolicyT,
+  typename CompareOpT,
+  typename KeyT,
+  typename ValueT>
+struct BlockSortLauncher
+{
+  OffsetT num_tiles;
+  int block_sort_shmem_size;
+  bool ping;
+
+  KeyIteratorT d_keys;
+  ValueIteratorT d_items;
+  OffsetT num_items;
+  CompareOpT compare_op;
+  cudaStream_t stream;
+
+  KeyT *keys_buffer;
+  ValueT *items_buffer;
+  char* vshmem_ptr;
+
+  CUB_RUNTIME_FUNCTION __forceinline__
+  BlockSortLauncher(OffsetT num_tiles,
+                    int block_sort_shmem_size,
+                    bool ping,
+                    KeyIteratorT d_keys,
+                    ValueIteratorT d_items,
+                    OffsetT num_items,
+                    CompareOpT compare_op,
+                    cudaStream_t stream,
+                    KeyT *keys_buffer,
+                    ValueT *items_buffer,
+                    char *vshmem_ptr)
+      : num_tiles(num_tiles)
+      , block_sort_shmem_size(block_sort_shmem_size)
+      , ping(ping)
+      , d_keys(d_keys)
+      , d_items(d_items)
+      , num_items(num_items)
+      , compare_op(compare_op)
+      , stream(stream)
+      , keys_buffer(keys_buffer)
+      , items_buffer(items_buffer)
+      , vshmem_ptr(vshmem_ptr)
+  {}
+
+  CUB_RUNTIME_FUNCTION __forceinline__
+  void launch() const
+  {
+    if (vshmem_ptr)
+    {
+      launch_impl<true>();
+    }
+    else
+    {
+      launch_impl<false>();
+    }
+  }
+
+  template <bool UseVShmem>
+  CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl() const
+  {
+    thrust::cuda_cub::launcher::triple_chevron(num_tiles,
+                                               MergePolicyT::BLOCK_THREADS,
+                                               block_sort_shmem_size,
+                                               stream)
+      .doit(DeviceMergeSortBlockSortKernel<UseVShmem,
+                                           MergePolicyT,
+                                           KeyIteratorT,
+                                           ValueIteratorT,
+                                           OffsetT,
+                                           CompareOpT,
+                                           KeyT,
+                                           ValueT>,
+            ping,
+            d_keys,
+            d_items,
+            num_items,
+            keys_buffer,
+            items_buffer,
+            compare_op,
+            vshmem_ptr);
+  }
+};
+
+template <
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename MergePolicyT,
+  typename CompareOpT,
+  typename KeyT,
+  typename ValueT>
+struct MergeLauncher
+{
+  OffsetT num_tiles;
+  int merge_shmem_size;
+
+  KeyIteratorT d_keys;
+  ValueIteratorT d_items;
+  OffsetT num_items;
+  CompareOpT compare_op;
+  OffsetT *merge_partitions;
+  cudaStream_t stream;
+
+  KeyT *keys_buffer;
+  ValueT *items_buffer;
+  char *vshmem_ptr;
+
+  CUB_RUNTIME_FUNCTION __forceinline__ MergeLauncher(OffsetT num_tiles,
+                                                     int merge_shmem_size,
+                                                     KeyIteratorT d_keys,
+                                                     ValueIteratorT d_items,
+                                                     OffsetT num_items,
+                                                     CompareOpT compare_op,
+                                                     OffsetT *merge_partitions,
+                                                     cudaStream_t stream,
+                                                     KeyT *keys_buffer,
+                                                     ValueT *items_buffer,
+                                                     char *vshmem_ptr)
+      : num_tiles(num_tiles)
+      , merge_shmem_size(merge_shmem_size)
+      , d_keys(d_keys)
+      , d_items(d_items)
+      , num_items(num_items)
+      , compare_op(compare_op)
+      , merge_partitions(merge_partitions)
+      , stream(stream)
+      , keys_buffer(keys_buffer)
+      , items_buffer(items_buffer)
+      , vshmem_ptr(vshmem_ptr)
+  {}
+
+  CUB_RUNTIME_FUNCTION __forceinline__ void launch(bool ping,
+                                                   OffsetT coop) const
+  {
+    if (vshmem_ptr)
+    {
+      launch_impl<true>(ping, coop);
+    }
+    else
+    {
+      launch_impl<false>(ping, coop);
+    }
+  }
+
+  template <bool UseVShmem>
+  CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl(bool ping,
+                                                        OffsetT coop) const
+  {
+    thrust::cuda_cub::launcher::triple_chevron(num_tiles,
+                                               MergePolicyT::BLOCK_THREADS,
+                                               merge_shmem_size,
+                                               stream)
+      .doit(DeviceMergeSortMergeKernel<UseVShmem,
+                                       MergePolicyT,
+                                       KeyIteratorT,
+                                       ValueIteratorT,
+                                       OffsetT,
+                                       CompareOpT,
+                                       KeyT,
+                                       ValueT>,
+            ping,
+            d_keys,
+            d_items,
+            num_items,
+            keys_buffer,
+            items_buffer,
+            compare_op,
+            merge_partitions,
+            coop,
+            vshmem_ptr);
+  }
+};
+
+template <typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename SelectedPolicy =
+  DeviceMergeSortPolicy<KeyIteratorT>>
+struct DispatchMergeSort : SelectedPolicy
+{
+  using KeyT   = typename std::iterator_traits<KeyIteratorT>::value_type;
+  using ValueT = typename std::iterator_traits<ValueIteratorT>::value_type;
+
+  // Whether or not there are values to be trucked along with keys
+  static constexpr bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+  //------------------------------------------------------------------------------
+  // Problem state
+  //------------------------------------------------------------------------------
+
+  void *d_temp_storage;         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+  size_t &temp_storage_bytes;   ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+  KeyIteratorT d_keys;          ///< [in,out] Pointer to the input sequence of unsorted input keys
+  ValueIteratorT d_items;       ///< [in,out] Pointer to the input sequence of unsorted input values
+  OffsetT num_items;            ///< [in] Number of items to sort
+  CompareOpT compare_op;        ///< [in] Comparison function object which returns true if the first argument is ordered before the second
+  cudaStream_t stream;          ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+  bool debug_synchronous;       ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+  int ptx_version;
+
+  //------------------------------------------------------------------------------
+  // Constructor
+  //------------------------------------------------------------------------------
+
+  CUB_RUNTIME_FUNCTION __forceinline__ size_t
+  vshmem_size(size_t max_shmem, size_t shmem_per_block, size_t num_blocks)
+  {
+    if (shmem_per_block > max_shmem)
+    {
+      return shmem_per_block * num_blocks;
+    }
+    else
+    {
+      return 0;
+    }
+  }
+
+  /// Constructor
+  CUB_RUNTIME_FUNCTION __forceinline__
+  DispatchMergeSort(void *d_temp_storage,
+                    size_t &temp_storage_bytes,
+                    KeyIteratorT d_keys,
+                    ValueIteratorT d_items,
+                    OffsetT num_items,
+                    CompareOpT compare_op,
+                    cudaStream_t stream,
+                    bool debug_synchronous,
+                    int ptx_version)
+      : d_temp_storage(d_temp_storage)
+      , temp_storage_bytes(temp_storage_bytes)
+      , d_keys(d_keys)
+      , d_items(d_items)
+      , num_items(num_items)
+      , compare_op(compare_op)
+      , stream(stream)
+      , debug_synchronous(debug_synchronous)
+      , ptx_version(ptx_version)
+  {}
+
+  /// Invocation
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
+  {
+    using MergePolicyT = typename ActivePolicyT::MergeSortPolicy;
+
+    using BlockSortAgentT = AgentBlockSort<MergePolicyT,
+                                           KeyIteratorT,
+                                           ValueIteratorT,
+                                           OffsetT,
+                                           CompareOpT,
+                                           KeyT,
+                                           ValueT>;
+
+    using MergeAgentT = AgentMerge<MergePolicyT,
+                                   KeyIteratorT,
+                                   ValueIteratorT,
+                                   OffsetT,
+                                   CompareOpT,
+                                   KeyT,
+                                   ValueT>;
+
+    cudaError error = cudaSuccess;
+
+    if (num_items == 0)
+      return error;
+
+    do
+    {
+      // Get device ordinal
+      int device_ordinal = 0;
+      if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+      // Get shared memory size
+      int max_shmem = 0;
+      if (CubDebug(error = cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_ordinal))) break;
+
+      int tile_size = MergePolicyT::ITEMS_PER_TILE;
+      OffsetT num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+      size_t block_sort_shmem_size = BlockSortAgentT::SHARED_MEMORY_SIZE;
+      size_t merge_shmem_size = MergeAgentT::SHARED_MEMORY_SIZE;
+
+      size_t temp_storage1 = (1 + num_tiles) * sizeof(OffsetT);
+      size_t temp_storage2 = num_items * sizeof(KeyT);
+      size_t temp_storage3 = num_items * sizeof(ValueT) * !KEYS_ONLY;
+      size_t temp_storage4 =
+        vshmem_size(max_shmem,
+                    cub::max<size_t>(block_sort_shmem_size, merge_shmem_size),
+                    num_tiles);
+
+      void*  allocations[4]      = {nullptr, nullptr, nullptr, nullptr};
+      size_t allocation_sizes[4] = {temp_storage1, temp_storage2, temp_storage3, temp_storage4};
+
+      if (CubDebug(error = AliasTemporaries(d_temp_storage,
+                                            temp_storage_bytes,
+                                            allocations,
+                                            allocation_sizes)))
+      {
+        break;
+      }
+
+      if (d_temp_storage == nullptr)
+      {
+        // Return if the caller is simply requesting the size of the storage allocation
+        break;
+      }
+
+      int num_passes = static_cast<int>(thrust::detail::log2_ri(num_tiles));
+      bool ping = !(1 & num_passes);
+
+      auto merge_partitions = (OffsetT *)allocations[0];
+      auto keys_buffer         = (KeyT *)allocations[1];
+      auto items_buffer      = (ValueT *)allocations[2];
+
+      char* vshmem_ptr = temp_storage4 > 0 ? (char*)allocations[3] : nullptr;
+
+      // Invoke DeviceReduceKernel
+      BlockSortLauncher<KeyIteratorT,
+                        ValueIteratorT,
+                        OffsetT,
+                        MergePolicyT,
+                        CompareOpT,
+                        KeyT,
+                        ValueT>
+        block_sort_launcher(num_tiles,
+                            vshmem_ptr ? 0 : block_sort_shmem_size,
+                            ping,
+                            d_keys,
+                            d_items,
+                            num_items,
+                            compare_op,
+                            stream,
+                            keys_buffer,
+                            items_buffer,
+                            vshmem_ptr);
+
+      block_sort_launcher.launch();
+
+      if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+      // Check for failure to launch
+      if (CubDebug(error = cudaPeekAtLastError())) break;
+
+      size_t num_partitions = num_tiles + 1;
+      const int threads_per_partition_block = 256;
+      const size_t partition_grid_size =
+        cub::DivideAndRoundUp(num_partitions, threads_per_partition_block);
+
+      MergeLauncher<KeyIteratorT,
+                    ValueIteratorT,
+                    OffsetT,
+                    MergePolicyT,
+                    CompareOpT,
+                    KeyT,
+                    ValueT>
+        merge_launcher(num_tiles,
+                       vshmem_ptr ? 0 : merge_shmem_size,
+                       d_keys,
+                       d_items,
+                       num_items,
+                       compare_op,
+                       merge_partitions,
+                       stream,
+                       keys_buffer,
+                       items_buffer,
+                       vshmem_ptr);
+
+      for (int pass = 0; pass < num_passes; ++pass, ping = !ping)
+      {
+        OffsetT coop = OffsetT(2) << pass;
+
+        // Partition
+        thrust::cuda_cub::launcher::triple_chevron(partition_grid_size,
+                                                   threads_per_partition_block,
+                                                   0,
+                                                   stream)
+          .doit(DeviceMergeSortPartitionKernel<KeyIteratorT,
+                                               OffsetT,
+                                               CompareOpT,
+                                               KeyT>,
+                ping,
+                d_keys,
+                keys_buffer,
+                num_items,
+                num_partitions,
+                merge_partitions,
+                compare_op,
+                coop,
+                tile_size);
+
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        // Check for failure to launch
+        if (CubDebug(error = cudaPeekAtLastError())) break;
+
+        // Merge
+        merge_launcher.launch(ping, coop);
+
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        // Check for failure to launch
+        if (CubDebug(error = cudaPeekAtLastError())) break;
+      }
+    }
+    while (0);
+
+    return error;
+  }
+
+  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
+  Dispatch(void *d_temp_storage,
+           size_t &temp_storage_bytes,
+           KeyIteratorT d_keys,
+           ValueIteratorT d_items,
+           OffsetT num_items,
+           CompareOpT compare_op,
+           cudaStream_t stream,
+           bool debug_synchronous)
+  {
+    using MaxPolicyT = typename DispatchMergeSort::MaxPolicy;
+
+    cudaError error = cudaSuccess;
+    do
+    {
+      // Get PTX version
+      int ptx_version = 0;
+      if (CubDebug(error = PtxVersion(ptx_version)))
+        break;
+
+      // Create dispatch functor
+      DispatchMergeSort dispatch(d_temp_storage,
+                                 temp_storage_bytes,
+                                 d_keys,
+                                 d_items,
+                                 num_items,
+                                 compare_op,
+                                 stream,
+                                 debug_synchronous,
+                                 ptx_version);
+
+      // Dispatch to chained policy
+      if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
+        break;
+    } while (0);
+
+    return error;
+  }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/cub/util_macro.cuh b/cub/util_macro.cuh
index ff86365422..eb903056db 100644
--- a/cub/util_macro.cuh
+++ b/cub/util_macro.cuh
@@ -56,6 +56,22 @@ namespace cub {
     #endif
 #endif
 
+#define CUB_PREVENT_MACRO_SUBSTITUTION
+
+template <typename T>
+constexpr __host__ __device__ T min CUB_PREVENT_MACRO_SUBSTITUTION(const T &lhs,
+                                                                   const T &rhs)
+{
+  return rhs < lhs ? rhs : lhs;
+}
+
+template <typename T>
+constexpr __host__ __device__ T max CUB_PREVENT_MACRO_SUBSTITUTION(const T &lhs,
+                                                                   const T &rhs)
+{
+  return rhs < lhs ? lhs : rhs;
+}
+
 #ifndef CUB_MAX
     /// Select maximum(a, b)
     #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
diff --git a/test/test_block_merge_sort.cu b/test/test_block_merge_sort.cu
new file mode 100644
index 0000000000..e0f3ad1c3c
--- /dev/null
+++ b/test/test_block_merge_sort.cu
@@ -0,0 +1,270 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockMergeSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <typeinfo>
+#include <memory>
+
+#include <cub/util_allocator.cuh>
+#include <cub/block/block_merge_sort.cuh>
+
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <thrust/device_vector.h>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+
+#include "test_util.h"
+
+using namespace cub;
+
+struct CustomLess
+{
+  template <typename DataType>
+  __device__ bool operator()(DataType &lhs, DataType &rhs)
+  {
+    return lhs < rhs;
+  }
+};
+
+template <
+  typename DataType,
+  unsigned int ThreadsInBlock,
+  unsigned int ItemsPerThread>
+__global__ void BlockMergeSortTestKernel(DataType *data)
+{
+  typedef cub::BlockMergeSort<DataType, ThreadsInBlock, ItemsPerThread>
+    BlockMergeSort;
+
+  __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
+
+  DataType thread_data[ItemsPerThread];
+
+  data += threadIdx.x * ItemsPerThread;
+  for (unsigned int item = 0; item < ItemsPerThread; item++)
+  {
+    thread_data[item] = data[item];
+  }
+  __syncthreads();
+
+  BlockMergeSort(temp_storage_shuffle).Sort(
+    thread_data,
+    CustomLess());
+
+  for (unsigned int item = 0; item < ItemsPerThread; item++)
+  {
+    data[item] = thread_data[item];
+  }
+}
+
+template <
+  typename DataType,
+  unsigned int ThreadsInBlock,
+  unsigned int ItemsPerThread>
+__global__ void BlockMergeSortTestKernel(DataType *keys, DataType *values)
+{
+  typedef cub::BlockMergeSort<DataType, ThreadsInBlock, ItemsPerThread, DataType>
+    BlockMergeSort;
+
+  __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
+
+  DataType thread_keys[ItemsPerThread];
+  DataType thread_values[ItemsPerThread];
+
+  keys += threadIdx.x * ItemsPerThread;
+  values += threadIdx.x * ItemsPerThread;
+  
+  for (unsigned int item = 0; item < ItemsPerThread; item++)
+  {
+    thread_keys[item] = keys[item];
+    thread_values[item] = values[item];
+  }
+  __syncthreads();
+
+  BlockMergeSort(temp_storage_shuffle).Sort(
+    thread_keys,
+    thread_values,
+    CustomLess());
+
+  for (unsigned int item = 0; item < ItemsPerThread; item++)
+  {
+    keys[item] = thread_keys[item];
+    values[item] = thread_values[item];
+  }
+}
+
+template<
+  typename DataType,
+  unsigned int ItemsPerThread,
+  unsigned int ThreadsInBlock>
+void BlockMergeSortTest(DataType *data)
+{
+  BlockMergeSortTestKernel<DataType, ThreadsInBlock, ItemsPerThread>
+    <<<1, ThreadsInBlock>>>(data);
+
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
+}
+
+template<
+  typename DataType,
+  unsigned int ItemsPerThread,
+  unsigned int ThreadsInBlock>
+void BlockMergeSortTest(DataType *keys, DataType *values)
+{
+  BlockMergeSortTestKernel<DataType, ThreadsInBlock, ItemsPerThread>
+    <<<1, ThreadsInBlock>>>(keys, values);
+
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
+}
+
+template <typename DataType>
+int CheckResult(int num_items,
+                thrust::device_vector<DataType> &d_data,
+                thrust::host_vector<DataType> &h_data)
+{
+  thrust::copy_n(d_data.begin(), num_items, h_data.begin());
+
+  for (int i = 0; i < num_items; i++)
+  {
+    if (h_data[i] != i)
+    {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+template <
+  typename DataType,
+  unsigned int ItemsPerThread,
+  unsigned int ThreadsInBlock>
+void Test(unsigned int num_items,
+          thrust::default_random_engine &rng,
+          thrust::device_vector<DataType> &d_data,
+          thrust::host_vector<DataType> &h_data)
+{
+  thrust::sequence(d_data.begin(), d_data.end());
+  thrust::shuffle(d_data.begin(), d_data.end(), rng);
+
+  BlockMergeSortTest<DataType, ItemsPerThread, ThreadsInBlock>(
+      thrust::raw_pointer_cast(d_data.data()));
+
+  AssertEquals(0, CheckResult(num_items, d_data, h_data));
+}
+
+template <
+  typename DataType,
+  unsigned int ItemsPerThread,
+  unsigned int ThreadsInBlock>
+void Test(unsigned int num_items,
+          thrust::default_random_engine &rng,
+          thrust::device_vector<DataType> &d_keys,
+          thrust::device_vector<DataType> &d_values,
+          thrust::host_vector<DataType> &h_data)
+{
+  thrust::sequence(d_keys.begin(), d_keys.end());
+  thrust::shuffle(d_keys.begin(), d_keys.end(), rng);
+  thrust::copy_n(d_keys.begin(), num_items, d_values.begin());
+
+  BlockMergeSortTest<DataType, ItemsPerThread, ThreadsInBlock>(
+      thrust::raw_pointer_cast(d_keys.data()),
+      thrust::raw_pointer_cast(d_values.data()));
+
+  AssertEquals(0, CheckResult(num_items, d_values, h_data));
+}
+
+template <
+  typename DataType,
+  unsigned int ItemsPerThread,
+  unsigned int ThreadsInBlock>
+void Test(thrust::default_random_engine &rng)
+{
+  const unsigned int num_items = ItemsPerThread * ThreadsInBlock;
+
+  thrust::device_vector<DataType> d_keys(num_items);
+  thrust::device_vector<DataType> d_values(num_items);
+  thrust::device_vector<DataType> d_data(num_items);
+  thrust::host_vector<DataType> h_data(num_items);
+
+  Test<DataType, ItemsPerThread, ThreadsInBlock>(num_items,
+                                                 rng, 
+                                                 d_keys, 
+                                                 h_data);
+
+  Test<DataType, ItemsPerThread, ThreadsInBlock>(num_items,
+                                                 rng, 
+                                                 d_keys, 
+                                                 d_values, 
+                                                 h_data);
+}
+
+template <unsigned int ItemsPerThread, unsigned int ThreadsPerBlock>
+void Test(thrust::default_random_engine &rng)
+{
+  Test<int32_t, ItemsPerThread, ThreadsPerBlock>(rng);
+  Test<int64_t, ItemsPerThread, ThreadsPerBlock>(rng);
+}
+
+template <unsigned int ItemsPerThread>
+void Test(thrust::default_random_engine &rng)
+{
+  Test<ItemsPerThread, 32>(rng);
+  Test<ItemsPerThread, 256>(rng);
+}
+
+int main(int argc, char** argv)
+{
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  thrust::default_random_engine rng;
+
+  Test<1>(rng);
+  Test<2>(rng);
+  Test<10>(rng);
+  Test<15>(rng);
+
+  Test<int32_t, 1, 512>(rng);
+  Test<int64_t, 2, 512>(rng);
+
+  return 0;
+}
diff --git a/test/test_device_merge_sort.cu b/test/test_device_merge_sort.cu
new file mode 100644
index 0000000000..9ec9b31ed3
--- /dev/null
+++ b/test/test_device_merge_sort.cu
@@ -0,0 +1,351 @@
+#define THRUST_IGNORE_CUB_VERSION_CHECK
+
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceMergeSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <typeinfo>
+#include <memory>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_merge_sort.cuh>
+
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+
+#include "test_util.h"
+
+using namespace cub;
+
+struct HugeDataType
+{
+  enum
+  {
+    ELEMENTS_PER_OBJECT = 128
+  };
+
+  __device__ __host__ HugeDataType()
+  {
+    for (int i = 0; i < ELEMENTS_PER_OBJECT; i++)
+    {
+      data[i] = 0;
+    }
+  }
+
+  __device__ __host__ HugeDataType(const HugeDataType&rhs)
+  {
+    for (int i = 0; i < ELEMENTS_PER_OBJECT; i++)
+    {
+      data[i] = rhs.data[i];
+    }
+  }
+
+  explicit __device__ __host__ HugeDataType(int val)
+  {
+    for (int i = 0; i < ELEMENTS_PER_OBJECT; i++)
+    {
+      data[i] = val;
+    }
+  }
+
+  int data[ELEMENTS_PER_OBJECT];
+};
+
+__device__ __host__ bool operator<(const HugeDataType &lhs,
+                                   const HugeDataType &rhs)
+{
+  for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++)
+  {
+    if (lhs.data[i] < rhs.data[i])
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+template <typename DataType>
+__device__ __host__ bool operator!=(const HugeDataType &lhs,
+                                    const DataType &rhs)
+{
+  for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++)
+  {
+    if (lhs.data[i] != rhs)
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+struct CustomLess
+{
+  template <typename DataType>
+  __device__ bool operator()(DataType &lhs, DataType &rhs)
+  {
+    return lhs < rhs;
+  }
+};
+
+template <typename DataType>
+int CheckResult(thrust::device_vector<DataType> &d_data)
+{
+  const bool is_sorted = thrust::is_sorted(d_data.begin(), d_data.end(), CustomLess());
+  return !is_sorted;
+}
+
+template <typename KeyType, typename ValueType>
+struct ValueToKey
+{
+  __device__ __host__ KeyType operator()(const ValueType &val)
+  {
+    return val;
+  }
+};
+
+template <typename ValueType>
+struct ValueToKey<HugeDataType, ValueType>
+{
+  __device__ __host__ HugeDataType operator()(const ValueType &val)
+  {
+    return HugeDataType(val);
+  }
+};
+
+template <typename KeyType, typename DataType>
+void Test(int64_t num_items,
+          thrust::default_random_engine &rng,
+          thrust::device_vector<KeyType> &d_keys,
+          thrust::device_vector<DataType> &d_values)
+{
+  thrust::sequence(d_values.begin(), d_values.end());
+  thrust::shuffle(d_values.begin(), d_values.end(), rng);
+
+  thrust::transform(d_values.begin(),
+                    d_values.end(),
+                    d_keys.begin(),
+                    ValueToKey<KeyType, DataType>());
+
+  size_t temp_size = 0;
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+      nullptr,
+      temp_size,
+      thrust::raw_pointer_cast(d_keys.data()),
+      thrust::raw_pointer_cast(d_values.data()),
+      num_items,
+      CustomLess(),
+      0,
+      true));
+
+  thrust::device_vector<char> tmp(temp_size);
+
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess(),
+    0,
+    true));
+
+  AssertEquals(0, CheckResult(d_values));
+
+  // At the moment stable sort is an alias to sort, so it's safe to use
+  // temp_size storage allocated before
+  CubDebugExit(cub::DeviceMergeSort::StableSortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess(),
+    0,
+    true));
+
+  AssertEquals(0, CheckResult(d_values));
+}
+
+template <typename KeyType, typename DataType>
+void TestKeys(int64_t num_items,
+              thrust::default_random_engine &rng,
+              thrust::device_vector<KeyType> &d_keys,
+              thrust::device_vector<DataType> &d_values)
+{
+  thrust::sequence(d_values.begin(), d_values.end());
+  thrust::shuffle(d_values.begin(), d_values.end(), rng);
+
+  thrust::transform(d_values.begin(),
+                    d_values.end(),
+                    d_keys.begin(),
+                    ValueToKey<KeyType, DataType>());
+
+  size_t temp_size = 0;
+  cub::DeviceMergeSort::SortKeys(
+    nullptr,
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess());
+
+  thrust::device_vector<char> tmp(temp_size);
+
+  CubDebugExit(cub::DeviceMergeSort::SortKeys(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess(),
+    0,
+    true));
+
+  AssertEquals(0, CheckResult(d_keys));
+
+  // At the moment stable sort is an alias to sort, so it's safe to use
+  // temp_size storage allocated before
+  CubDebugExit(cub::DeviceMergeSort::StableSortKeys(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess(),
+    0,
+    true));
+
+  AssertEquals(0, CheckResult(d_keys));
+}
+
+template <bool data_dont_exceed_key_size>
+struct TestHelper
+{
+  template <typename KeyType, typename DataType>
+  static void AllocateAndTest(thrust::default_random_engine &rng, unsigned int num_items)
+  {
+    thrust::device_vector<KeyType> d_keys(num_items);
+    thrust::device_vector<DataType> d_values(num_items);
+    thrust::device_vector<DataType> d_data(num_items);
+
+    Test<KeyType, DataType>(num_items, rng, d_keys, d_values);
+    TestKeys<KeyType, DataType>(num_items, rng, d_keys, d_values);
+  }
+};
+
+template <>
+struct TestHelper<false>
+{
+  template <typename, typename>
+  static void AllocateAndTest(thrust::default_random_engine &, unsigned int)
+  {}
+};
+
+template <typename DataType>
+void Test(thrust::default_random_engine &rng, unsigned int num_items)
+{
+  TestHelper<sizeof(DataType) <= sizeof(uint8_t) >::template AllocateAndTest<uint8_t,  DataType>(rng, num_items);
+  TestHelper<sizeof(DataType) <= sizeof(uint32_t)>::template AllocateAndTest<uint32_t, DataType>(rng, num_items);
+  TestHelper<sizeof(DataType) <= sizeof(uint64_t)>::template AllocateAndTest<uint64_t, DataType>(rng, num_items);
+}
+
+template <typename KeyType, typename DataType>
+void AllocateAndTestIterators(unsigned int num_items)
+{
+  thrust::device_vector<KeyType> d_keys(num_items);
+  thrust::device_vector<DataType> d_values(num_items);
+  thrust::device_vector<DataType> d_data(num_items);
+
+  thrust::host_vector<DataType> h_values(num_items);
+
+  thrust::sequence(d_keys.begin(), d_keys.end());
+  thrust::sequence(d_values.begin(), d_values.end());
+
+  thrust::reverse(d_values.begin(), d_values.end());
+
+  typedef typename thrust::device_vector<KeyType>::iterator KeyIterator;
+
+  size_t temp_size = 0;
+  cub::DeviceMergeSort::SortPairs(
+    nullptr,
+    temp_size,
+    thrust::reverse_iterator<KeyIterator>(d_keys.end()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess());
+
+  thrust::device_vector<char> tmp(temp_size);
+
+  cub::DeviceMergeSort::SortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::reverse_iterator<KeyIterator>(d_keys.end()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess());
+
+  AssertEquals(0, CheckResult(d_values));
+}
+
+template <typename DataType>
+void Test(thrust::default_random_engine &rng)
+{
+  for (int pow2 = 9; pow2 < 22; pow2 += 2)
+  {
+    const int num_items = std::pow(2, pow2);
+    AllocateAndTestIterators<DataType, DataType>(num_items);
+    TestHelper<true>::AllocateAndTest<HugeDataType, DataType>(rng, num_items);
+    Test<DataType>(rng, num_items);
+  }
+}
+
+int main(int argc, char** argv)
+{
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  thrust::default_random_engine rng;
+
+  Test<int32_t>(rng);
+  Test<int64_t>(rng);
+
+  return 0;
+}