diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h
index c4025dd42d4a6..231170212a37b 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h
@@ -42,7 +42,7 @@ namespace cms::alpakatools {
       }
     }
 
-    inline constexpr T cont& back() const {
+    inline constexpr T const &back() const {
       if (m_size > 0) {
         return m_data[m_size - 1];
       } else
diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
index bcf1da16a5533..0f94ad200efd9 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
@@ -9,6 +9,7 @@
 #include <alpaka/alpaka.hpp>
 
 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 
 namespace cms::alpakatools {
 
@@ -26,20 +27,27 @@ namespace cms::alpakatools {
     alpaka::syncBlockThreads(acc);
 
     // find first negative
-    for(auto idx: elements_with_stride(acc, size - 1)) {
-      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
+    for (auto idx : independent_group_elements(acc, size - 1)) {
+      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) {
         firstNeg = idx + 1;
+      }
     }
 
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; }
+    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
+      ind2[idx - firstNeg] = ind[idx];
+    }
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; }
+    for (auto idx : independent_group_elements(acc, firstNeg)) {
+      ind2[idx + size - firstNeg] = ind[idx];
+    }
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }
+    for (auto idx : independent_group_elements(acc, size)) {
+      ind[idx] = ind2[idx];
+    }
   }
 
   template <typename TAcc, typename T>
@@ -52,51 +60,56 @@ namespace cms::alpakatools {
     alpaka::syncBlockThreads(acc);
 
     // find first negative
-    for(auto idx: elements_with_stride(acc, size - 1)) {
+    for (auto idx : independent_group_elements(acc, size - 1)) {
       if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
         firstNeg = idx + 1;
     }
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; }
+    for (auto idx : independent_group_elements(acc, firstNeg, size)) {
+      ind2[size - idx - 1] = ind[idx];
+    }
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; }
+    for (auto idx : independent_group_elements(acc, firstNeg)) {
+      ind2[idx + size - firstNeg] = ind[idx];
+    }
     alpaka::syncBlockThreads(acc);
 
-    for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }
+    for (auto idx : independent_group_elements(acc, size)) {
+      ind[idx] = ind2[idx];
+    }
   }
 
-  
   // Radix sort implements a bytewise lexicographic order on the input data.
   // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first
   // and respecting the existing order when binning the values, we achieve the lexicographic ordering.
   // The number of bytes actually considered is a parameter template parameter.
   // The post processing reorder
-  // function fixes the order when bitwise ordering is not the order for the underlying type (only based on 
+  // function fixes the order when bitwise ordering is not the order for the underlying type (only based on
   // most significant bit for signed types, integer or floating point).
   // The floating point numbers are reinterpret_cast into integers in the calling wrapper
   // This algorithm requires to run in a single block
   template <typename TAcc,
-            typename T,  // shall be integer, signed or not does not matter here
-            int NS,      // number of significant bytes to use in sorting. 
-            typename RF> // The post processing reorder function.
+            typename T,   // shall be integer, signed or not does not matter here
+            int NS,       // number of significant bytes to use in sorting.
+            typename RF>  // The post processing reorder function.
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl(
       const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
     if constexpr (!requires_single_thread_per_block_v<TAcc>) {
       const auto warpSize = alpaka::warp::getSize(acc);
       const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-      const uint32_t blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
+      [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
       // we expect a power of 2 here
       assert(warpSize && (0 == (warpSize & (warpSize - 1))));
-      const std::size_t warpMask  = warpSize - 1;
+      const std::size_t warpMask = warpSize - 1;
 
       // Define the bin size (d=8 => 1 byte bin).
       constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits;
       // Make sure the slices are data aligned
-      static_assert (0 == dataBits % binBits);
+      static_assert(0 == dataBits % binBits);
       // Make sure the NS parameter makes sense
-      static_assert (NS > 0 && NS <= sizeof(T));
+      static_assert(NS > 0 && NS <= sizeof(T));
       constexpr int binsNumber = 1 << binBits;
       constexpr int binsMask = binsNumber - 1;
       // Prefix scan iterations. NS is counted in full bytes and not slices.
@@ -106,7 +119,7 @@ namespace cms::alpakatools {
       // TODO: rename
       auto& c = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
       // Temporary storage for prefix scan. Only really needed for first-of-warp keeping
-      // Then used for thread to bin mapping TODO: change type to byte and remap to 
+      // Then used for thread to bin mapping TODO: change type to byte and remap to
       auto& ct = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
       // Bin to thread index mapping (used to store the highest thread index within a bin number
       // batch of threads.
@@ -114,7 +127,7 @@ namespace cms::alpakatools {
       // lowest possible value (change to bytes?)
       auto& cu = alpaka::declareSharedVar<int32_t[binsNumber], __COUNTER__>(acc);
       // TODO we could also have an explicit caching of the current index for each thread.
-      
+
       // TODO: do those have to be shared?
       auto& ibs = alpaka::declareSharedVar<int, __COUNTER__>(acc);
       auto& currentSortingPass = alpaka::declareSharedVar<int, __COUNTER__>(acc);
@@ -129,36 +142,40 @@ namespace cms::alpakatools {
       auto k = ind2;
 
       // Initializer index order to trivial increment.
-      for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; }
+      for (auto idx : independent_group_elements(acc, size)) {
+        j[idx] = idx;
+      }
       alpaka::syncBlockThreads(acc);
 
       // Iterate on the slices of the data.
       while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, (currentSortingPass < totalSortingPassses))) {
-        for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; }
+        for (auto idx : independent_group_elements(acc, binsNumber)) {
+          c[idx] = 0;
+        }
         alpaka::syncBlockThreads(acc);
         const auto sortingPassShift = binBits * currentSortingPass;
 
         // fill bins (count elements in each bin)
-        for (auto idx: independent_group_elements(acc, size)) {
+        for (auto idx : independent_group_elements(acc, size)) {
           auto bin = (a[j[idx]] >> sortingPassShift) & binsMask;
           alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{});
-        } 
+        }
         alpaka::syncBlockThreads(acc);
-        
-        if (!threadIdxLocal && !alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
-          printf("Pass=%d ", currentSortingPass - 1);
+
+        if (!threadIdxLocal && 1 == alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
+          //          printf("Pass=%d, Block=%d, ", currentSortingPass - 1, alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]);
           size_t total = 0;
-          for(int i=0; i<(int)binsNumber; i++) {
-            printf("count[%d]=%d ", i, c[i] );
+          for (int i = 0; i < (int)binsNumber; i++) {
+            //            printf("count[%d]=%d ", i, c[i] );
             total += c[i];
           }
-          printf("total=%zu\n", total);
+          //          printf("total=%zu\n", total);
           assert(total == size);
         }
         // prefix scan "optimized"???...
         // TODO: we might be able to reuse the warpPrefixScan function
         // Warp level prefix scan
-        for (auto idx: independent_group_elements(acc, binsNumber)) {
+        for (auto idx : independent_group_elements(acc, binsNumber)) {
           auto x = c[idx];
           auto laneId = idx & warpMask;
 
@@ -170,16 +187,16 @@ namespace cms::alpakatools {
           ct[idx] = x;
         }
         alpaka::syncBlockThreads(acc);
-        
+
         // Block level completion of prefix scan (add last sum of each preceding warp)
-        for (auto idx: independent_group_elements(acc, binsNumber)) {
+        for (auto idx : independent_group_elements(acc, binsNumber)) {
           auto ss = (idx / warpSize) * warpSize - 1;
           c[idx] = ct[idx];
           for (int i = ss; i > 0; i -= warpSize)
             c[idx] += ct[i];
         }
         // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin
-        
+
         /*
         //prefix scan for the nulls  (for documentation)
         if (threadIdxLocal==0)
@@ -191,18 +208,11 @@ namespace cms::alpakatools {
         // This will reorder the indices by the currently considered slice, otherwise preserving the previous order.
         ibs = size - 1;
         alpaka::syncBlockThreads(acc);
-        if (!threadIdxLocal && !alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
-          printf("Pass=%d (post prefix scan) ", currentSortingPass - 1);
-          for(int i=0; i<(int)binsNumber; i++) {
-            printf("offset[%d]=%d ", i, c[i] );
-          }
-          printf("\n");
-        }
 
         // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations
         while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, ibs >= 0)) {
           // Init
-          for (auto idx: independent_group_elements(acc, binsNumber)) {
+          for (auto idx : independent_group_elements(acc, binsNumber)) {
             cu[idx] = -1;
             ct[idx] = -1;
           }
@@ -210,7 +220,7 @@ namespace cms::alpakatools {
 
           // Find the highest index for all the threads dealing with a given bin (in cu[])
           // Also record the bin for each thread (in ct[])
-          for (auto idx: independent_group_elements(acc, binsNumber)) {
+          for (auto idx : independent_group_elements(acc, binsNumber)) {
             int i = ibs - idx;
             int32_t bin = -1;
             if (i >= 0) {
@@ -220,24 +230,16 @@ namespace cms::alpakatools {
             }
           }
           alpaka::syncBlockThreads(acc);
-        if (!threadIdxLocal && !alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]) {
-          printf("Pass=%d (max index) ", currentSortingPass - 1);
-          for(int i=0; i<(int)binsNumber; i++) {
-            printf("max_i[%d]=%d ", i, cu[i] );
-          }
-          printf("\n");
-        }
 
-          
           // FIXME: we can slash a memory access.
-          for (auto idx: independent_group_elements(acc, binsNumber)) {
+          for (auto idx : independent_group_elements(acc, binsNumber)) {
             int i = ibs - idx;
             // Are we still in inside the data?
             if (i >= 0) {
               int32_t bin = ct[idx];
               // Are we the thread with the highest index (from previous pass)?
               if (cu[bin] == i) {
-                // With the highest index, we are actually the lowest thread number. We will 
+                // With the highest index, we are actually the lowest thread number. We will
                 // work "on behalf of" the higher thread numbers (including ourselves)
                 // No way around scanning and testing for bin in ct[otherThread] number to find the other threads
                 for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) {
@@ -292,21 +294,17 @@ namespace cms::alpakatools {
         if (threadIdxLocal == 0)
           ++currentSortingPass;
         alpaka::syncBlockThreads(acc);
-        if (!threadIdxLocal && size == 257) {
-          printf("Pass=%d ", currentSortingPass - 1);
-          for(int i=0; i<(int)size; i++) {
-            printf("k[%d]=%d ", i, k[i] );
-          }
-          printf("\n");
-        }
       }
 
       if ((dataBits != 8) && (0 == (NS & 1)))
-        ALPAKA_ASSERT_OFFLOAD(j == ind);  // dataBits/binBits is even so ind is correct (the result is in the right location)
+        ALPAKA_ASSERT_OFFLOAD(j ==
+                              ind);  // dataBits/binBits is even so ind is correct (the result is in the right location)
 
       // TODO this copy is (doubly?) redundant with the reorder
       if (j != ind)  // odd number of sorting passes, we need to move the result to the right array (ind[])
-        for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; };
+        for (auto idx : independent_group_elements(acc, size)) {
+          ind[idx] = ind2[idx];
+        };
 
       alpaka::syncBlockThreads(acc);
 
@@ -321,41 +319,36 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename T,
             int NS = sizeof(T),  // number of significant bytes to use in sorting
-            typename std::enable_if<std::is_unsigned<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
+            typename std::enable_if<std::is_unsigned<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* =
+                nullptr>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
       const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
-    if (!alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]) {
-      printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0], size);
-    }
     radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, dummyReorder<TAcc, T>);
   }
 
   template <typename TAcc,
             typename T,
             int NS = sizeof(T),  // number of significant bytes to use in sorting
-            typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
+            typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value &&
+                                        !requires_single_thread_per_block_v<TAcc>,
+                                    T>::type* = nullptr>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
       const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
-    if (!alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]) {
-      printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0], size);
-    }
     radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, reorderSigned<TAcc, T>);
   }
 
   template <typename TAcc,
             typename T,
             int NS = sizeof(T),  // number of significant bytes to use in sorting
-            typename std::enable_if<std::is_floating_point<T>::value && !requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
+            typename std::enable_if<std::is_floating_point<T>::value && !requires_single_thread_per_block_v<TAcc>,
+                                    T>::type* = nullptr>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
       const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
     static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size");
     using I = int;
-    if (!alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]) {
-      printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0], size);
-    }
     radixSortImpl<TAcc, I, NS>(acc, (I const*)(a), ind, ind2, size, reorderFloat<TAcc, I>);
   }
-  
+
   template <typename TAcc,
             typename T,
             int NS = sizeof(T),  // number of significant bytes to use in sorting
@@ -376,7 +369,7 @@ namespace cms::alpakatools {
     }
     printf("\n");
     */
-    std::stable_sort(ind, ind+size, [a](uint16_t i0, uint16_t i1) { return a[i0] < a[i1]; });
+    std::stable_sort(ind, ind + size, [a](uint16_t i0, uint16_t i1) { return a[i0] < a[i1]; });
     /*
     for (uint32_t i=0; i<10 && i<size; i++) {
       printf ("ind[%d]=%d ", i, ind[i]);
@@ -384,17 +377,14 @@ namespace cms::alpakatools {
     printf("\n");
     */
   }
-  
-                                                                                                                                                                                   template <typename TAcc, typename T, int NS = sizeof(T)>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(const TAcc& acc,
-                                                 T const* v,
-                                                 uint16_t* index,
-                                                 uint32_t const* offsets,
-                                                 uint16_t* workspace) {
-    // TODO: check 
+
+  template <typename TAcc, typename T, int NS = sizeof(T)>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(
+      const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
+    // TODO: check
     // Sort multiple blocks of data in v[] separated by in chunks located at offsets[]
     // extern __shared__ uint16_t ws[];
-    uint16_t * ws = alpaka::getDynSharedMem<uint16_t>(acc);
+    uint16_t* ws = alpaka::getDynSharedMem<uint16_t>(acc);
 
     const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
     auto a = v + offsets[blockIdx];
@@ -402,23 +392,24 @@ namespace cms::alpakatools {
     auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx];
     auto size = offsets[blockIdx + 1] - offsets[blockIdx];
     assert(offsets[blockIdx + 1] >= offsets[blockIdx]);
-    if (0 == alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0]) {
-      printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", 
-              blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]);
-    }
     if (size > 0)
       radixSort<TAcc, T, NS>(acc, a, ind, ind2, size);
   }
- 
+
   template <typename T, int NS = sizeof(T)>
   struct radixSortMultiWrapper {
-/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical
+    /* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical (keeping CUDA/HIP code for reference for the moment)
 #if defined(__CUDACC__) || defined(__HIPCC__)
     //__global__ void __launch_bounds__(256, 4)
 #endif
 */
     template <typename TAcc>
-    ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const {
+    ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                  T const* v,
+                                  uint16_t* index,
+                                  uint32_t const* offsets,
+                                  uint16_t* workspace,
+                                  size_t sharedMemBytes = 0) const {
       radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
     }
   };
@@ -426,10 +417,39 @@ namespace cms::alpakatools {
   template <typename T, int NS = sizeof(T)>
   struct radixSortMultiWrapper2 {
     template <typename TAcc>
-    ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const {
+    ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                  T const* v,
+                                  uint16_t* index,
+                                  uint32_t const* offsets,
+                                  uint16_t* workspace,
+                                  size_t sharedMemBytes = 0) const {
       radixSortMulti<TAcc, T, NS>(acc, v, index, offsets, workspace);
     }
   };
 }  // namespace cms::alpakatools
 
+namespace alpaka::trait {
+  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
+  // block shared dynamic memory for the radixSortMultiWrapper kernel
+  template <typename TAcc, typename T, int NS>
+  struct BlockSharedMemDynSizeBytes<cms::alpakatools::radixSortMultiWrapper<T, NS>, TAcc> {
+    // the size in bytes of the shared memory allocated for a block
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
+        cms::alpakatools::radixSortMultiWrapper<T, NS> const& /* kernel */,
+        alpaka_common::Vec1D /* threads */,
+        alpaka_common::Vec1D /* elements */,
+        T const* /* v */,
+        uint16_t* /* index */,
+        uint32_t const* /* offsets */,
+        uint16_t* workspace,
+        size_t sharedMemBytes) {
+      if (workspace != nullptr)
+        return 0;
+      /* The shared memory workspace is 'blockspace * 2' in CUDA *but that's a value coincidence... TODO: check */
+      //printf ("in BlockSharedMemDynSizeBytes<cms::alpakatools::radixSortMultiWrapper<T, NS>, TAcc>: shared mem size = %d\n", (int)sharedMemBytes);
+      return sharedMemBytes;
+    }
+  };
+}  // namespace alpaka::trait
+
 #endif  // HeterogeneousCore_AlpakaInterface_interface_radixSort_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 4c0aa9fe5f2b9..e02f4e92f813e 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -701,6 +701,12 @@ namespace cms::alpakatools {
           stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
           extent_{extent} {}
 
+    ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx first, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_ + first},
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{extent} {}
+
     class const_iterator;
     using iterator = const_iterator;
 
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index 463bf5dae4461..0198e36f9166f 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -58,7 +58,6 @@
 
 <bin name="alpakaTestSimpleVector" file="alpaka/testSimpleVector.dev.cc">
   <use name="alpaka"/>
-  <use name="catch2"/>
   <use name="HeterogeneousCore/AlpakaInterface"/>
   <flags ALPAKA_BACKENDS="1"/>
 </bin>
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc
index f4073997f5c97..1687feb8c1bab 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc
@@ -42,7 +42,12 @@ struct finalize {
 TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) {
   SECTION("AtomicPairCounter") {
     auto const &devices = cms::alpakatools::devices<Platform>();
-    REQUIRE(!devices.empty());
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      REQUIRE(not devices.empty());
+    }
+
     // run the test on each device
     for (auto const &device : devices) {
       std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n';
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc
index 5e08c05725888..4ce11cc7facdd 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc
@@ -93,7 +93,7 @@ struct mykernel {
 #ifndef NDEBUG
       auto b0 = Hist::bin(v[j]);
 #endif
-      int tot = 0;
+      [[maybe_unused]] int tot = 0;
       auto ftest = [&](unsigned int k) {
         ALPAKA_ASSERT_OFFLOAD(k < N);
         ++tot;
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc
index c781f83ff1225..21f3477a3dd92 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc
@@ -64,7 +64,8 @@ ALPAKA_FN_HOST_ACC T truncate(T const& t) {
 namespace {
   struct testKernel {
     template <typename TAcc>
-    ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
+    ALPAKA_FN_ACC void operator()(
+        const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const {
       //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x;  // This is going to be the track index
       //size_t gridSize = blockDim.x * gridDim.x;
       bool threadZero = !alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
@@ -76,15 +77,15 @@ namespace {
       assert(0 == blocksIdx);
       assert(elements <= 2048);
 
-      auto &order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
-      auto &sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
-      auto &z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
-      auto &iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
-//      __shared__ uint16_t order[2048];
-//      __shared__ uint16_t sws[2048];
-//      __shared__ float z[2048];
-//      __shared__ int iz[2048];
-      for (auto itrack: elements_with_stride(acc, elements)) {
+      auto& order = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
+      auto& sws = alpaka::declareSharedVar<uint16_t[2048], __COUNTER__>(acc);
+      auto& z = alpaka::declareSharedVar<float[2048], __COUNTER__>(acc);
+      auto& iz = alpaka::declareSharedVar<int[2048], __COUNTER__>(acc);
+      //      __shared__ uint16_t order[2048];
+      //      __shared__ uint16_t sws[2048];
+      //      __shared__ float z[2048];
+      //      __shared__ int iz[2048];
+      for (auto itrack : elements_with_stride(acc, elements)) {
         z[itrack] = gpu_input[itrack];
         iz[itrack] = 10000 * gpu_input[itrack];
         // order[itrack] = itrack;
@@ -94,7 +95,7 @@ namespace {
       alpaka::syncBlockThreads(acc);
 
       //verify
-      for (auto itrack: elements_with_stride(acc, elements - 1)) {
+      for (auto itrack : elements_with_stride(acc, elements - 1)) {
         auto ntrack = order[itrack];
         auto mtrack = order[itrack + 1];
         assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
@@ -122,7 +123,7 @@ namespace {
       radixSort<TAcc, int, 4>(acc, iz, order, sws, elements);
       alpaka::syncBlockThreads(acc);
 
-      for (auto itrack: elements_with_stride(acc, elements - 1)) {
+      for (auto itrack : elements_with_stride(acc, elements - 1)) {
         auto ntrack = order[itrack];
         auto mtrack = order[itrack + 1];
         assert(iz[ntrack] <= iz[mtrack]);
@@ -146,11 +147,12 @@ namespace {
     }
   };
 
-  void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
+  void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
     auto blockSize = 512;  // somewhat arbitrary
     auto gridSize = 1;     // round up to cover the sample size
     const auto workdiv = make_workdiv<Acc1D>(gridSize, blockSize);
-    alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
+    alpaka::enqueue(queue,
+                    alpaka::createTaskKernel<Acc1D>(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint));
     alpaka::wait(queue);
   }
 }  // namespace
@@ -165,50 +167,54 @@ int main() {
               << ", the test will be skipped.\n";
     return 0;
   }
-  
+
   // run the test on each device
   for (auto const& device : devices) {
     Queue queue(device);
-//    FLOAT* gpu_input;
-//    int* gpu_product;
+    //    FLOAT* gpu_input;
+    //    int* gpu_product;
 
     int nmax = 4 * 260;
     auto gpu_input_h = cms::alpakatools::make_host_buffer<FLOAT[]>(queue, nmax);
     auto i = gpu_input_h.data();
-    for (auto v: {
-        30.0,         30.0,         -4.4,         -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621,
-        2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453,  2.2852313519,  2.2877883911, 2.2946476936, 2.2960267067,
-        2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652,  2.783257246,   2.8440306187, 2.9017834663, 2.9252648354,
-        2.9254128933, 2.927520752,  2.9422419071, 2.9453969002,  2.9457902908,  2.9465973377, 2.9492356777, 2.9573802948,
-        2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851,  2.9622797966,  2.9625515938, 2.9641008377, 2.9646151066,
-        2.9676523209, 2.9708273411, 2.974111557,  2.9742531776,  2.9772830009,  2.9877333641, 2.9960610867, 3.013969183,
-        3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934,  3.0470511913,  3.0560519695, 3.0592908859, 3.0599737167,
-        3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904,  3.0643932819,  3.0645000935, 3.0666446686, 3.068046093,
-        3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026,  3.0733406544,  3.0738227367, 3.0738801956, 3.0738828182,
-        3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146,  3.0773906708,  3.0778541565, 3.0780284405, 3.0780889988,
-        3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217,  3.0795567036,  3.0797944069, 3.0806643963, 3.0809247494,
-        3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398,  3.0838682652,  3.084009409,  3.0848178864, 3.0853257179,
-        3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759,  3.0891149044,  3.0893011093, 3.0895674229, 3.0901503563,
-        3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657,  3.096424818,   3.0995628834, 3.1001036167, 3.1173279285,
-        3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508,  3.1676850319,  3.1886672974, 3.3769197464, 3.3821125031,
-        3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872,  3.5225863457,  3.5271244049, 3.5298995972, 3.5417425632,
-        3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239,  3.5526945591,  3.5531234741, 3.5538012981, 3.5544877052,
-        3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172,  3.5560717583,  3.5560848713, 3.5584278107, 3.558681488,
-        3.5587313175, 3.5592217445, 3.559384346,  3.5604712963,  3.5634038448,  3.563803196,  3.564593792,  3.5660364628,
-        3.5683133602, 3.5696356297, 3.569729805,  3.5740811825,  3.5757565498,  3.5760207176, 3.5760478973, 3.5836098194,
-        3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575,  3.6601481438,  3.7187042236, 3.9741659164, 4.4111995697,
-        4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405,  4.6868157387,  4.6868577003, 4.6879930496, 4.6888813972,
-        4.6910686493, 4.6925001144, 4.6957530975, 4.698094368,   4.6997032166,  4.7017259598, 4.7020640373, 4.7024269104,
-        4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456,  4.7044372559,  4.7050771713, 4.7055773735, 4.7060651779,
-        4.7062759399, 4.7065420151, 4.70657444,   4.7066287994,  4.7066788673,  4.7067341805, 4.7072944641, 4.7074551582,
-        4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458,  4.7080879211,  4.70819664,   4.7086658478, 4.708937645,
-        4.7092385292, 4.709479332,  4.7095656395, 4.7100076675,  4.7102108002,  4.7104525566, 4.7105507851, 4.71118927,
-        4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671,  4.7117190361,  4.7117333412, 4.7117910385, 4.7119007111,
-        4.7120013237, 4.712003231,  4.712044239,  4.7122926712,  4.7135767937,  4.7143669128, 4.7145690918, 4.7148418427,
-        4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611,  4.717815876,   4.718059063,  4.7188801765, 4.7190728188,
-        4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678,  4.7243990898,  4.7273659706, 4.7294125557, 4.7296204567,
-        4.7325615883, 4.7356877327, 4.740146637,  4.742254734,   4.7433848381,  4.7454957962, 4.7462964058, 4.7692604065,
-        4.7723139628, 4.774812736,  4.8577151299, 4.890037536}) {
+    for (auto v : {30.0,         30.0,         -4.4,         -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007,
+                   2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555,  2.2821085453,  2.2852313519, 2.2877883911,
+                   2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038,  2.6755006313,  2.7229132652, 2.783257246,
+                   2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933,  2.927520752,   2.9422419071, 2.9453969002,
+                   2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948,  2.9575133324,  2.9575304985, 2.9586606026,
+                   2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377,  2.9646151066,  2.9676523209, 2.9708273411,
+                   2.974111557,  2.9742531776, 2.9772830009, 2.9877333641,  2.9960610867,  3.013969183,  3.0187871456,
+                   3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913,  3.0560519695,  3.0592908859, 3.0599737167,
+                   3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904,  3.0643932819,  3.0645000935, 3.0666446686,
+                   3.068046093,  3.0697011948, 3.0717656612, 3.0718104839,  3.0718348026,  3.0733406544, 3.0738227367,
+                   3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264,  3.0758397579,  3.0767207146, 3.0773906708,
+                   3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721,  3.0789675713,  3.0792205334, 3.0793278217,
+                   3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494,  3.0815284252,  3.0817306042, 3.0819730759,
+                   3.0820026398, 3.0838682652, 3.084009409,  3.0848178864,  3.0853257179,  3.0855510235, 3.0856611729,
+                   3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093,  3.0895674229,  3.0901503563, 3.0903317928,
+                   3.0912668705, 3.0920717716, 3.0954346657, 3.096424818,   3.0995628834,  3.1001036167, 3.1173279285,
+                   3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508,  3.1676850319,  3.1886672974, 3.3769197464,
+                   3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616,  3.5076274872,  3.5225863457, 3.5271244049,
+                   3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587,  3.5473103523,  3.5480232239, 3.5526945591,
+                   3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996,  3.5549693108,  3.5550665855, 3.5558729172,
+                   3.5560717583, 3.5560848713, 3.5584278107, 3.558681488,   3.5587313175,  3.5592217445, 3.559384346,
+                   3.5604712963, 3.5634038448, 3.563803196,  3.564593792,   3.5660364628,  3.5683133602, 3.5696356297,
+                   3.569729805,  3.5740811825, 3.5757565498, 3.5760207176,  3.5760478973,  3.5836098194, 3.5839796066,
+                   3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438,  3.7187042236,  3.9741659164, 4.4111995697,
+                   4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405,  4.6868157387,  4.6868577003, 4.6879930496,
+                   4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975,  4.698094368,   4.6997032166, 4.7017259598,
+                   4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123,  4.7042069435,  4.7044086456, 4.7044372559,
+                   4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399,  4.7065420151,  4.70657444,   4.7066287994,
+                   4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582,  4.7075614929,  4.7075891495, 4.7076044083,
+                   4.7077374458, 4.7080879211, 4.70819664,   4.7086658478,  4.708937645,   4.7092385292, 4.709479332,
+                   4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566,  4.7105507851,  4.71118927,   4.7113513947,
+                   4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361,  4.7117333412,  4.7117910385, 4.7119007111,
+                   4.7120013237, 4.712003231,  4.712044239,  4.7122926712,  4.7135767937,  4.7143669128, 4.7145690918,
+                   4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308,  4.7177276611,  4.717815876,  4.718059063,
+                   4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472,  4.7239775658,  4.7243933678, 4.7243990898,
+                   4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883,  4.7356877327,  4.740146637,  4.742254734,
+                   4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065,  4.7723139628,  4.774812736,  4.8577151299,
+                   4.890037536}) {
       *(i++) = v;
     }
     auto input = gpu_input_h.data();
@@ -219,7 +225,7 @@ int main() {
     }
     auto gpu_input_d = cms::alpakatools::make_device_buffer<FLOAT[]>(queue, nmax);
     //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax));
-//    cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax));
+    //    cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax));
     auto gpu_product_d = cms::alpakatools::make_device_buffer<int[]>(queue, nmax);
     // copy the input data to the GPU
     alpaka::memcpy(queue, gpu_input_d, gpu_input_h);
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc
index df41c649b91cb..9caf2af20eb9c 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc
@@ -81,7 +81,7 @@ void truncate(T& t) {
 }
 
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
-void go(Queue & queue, bool useShared) {
+void go(Queue& queue, bool useShared) {
   std::mt19937 eng;
   //std::mt19937 eng2;
   auto rgen = RS<T>::ud();
@@ -145,46 +145,48 @@ void go(Queue & queue, bool useShared) {
 
     alpaka::memcpy(queue, v_d, v_h);
     alpaka::memcpy(queue, off_d, offsets_h);
-//    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
-//    cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
       std::cout << "launch for " << offsets_h[blocks] << std::endl;
 
-    auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
+    auto ntXBl = 1 == i % 4 ? 256 : 256;
 
     auto start = std::chrono::high_resolution_clock::now();
-    // TODO: manage runtime sized shared memory
-    [[maybe_unused]] constexpr int MaxSize = 256 * 32;
+    // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory).
+    constexpr int MaxSize = 256 * 32;
     auto workdiv = make_workdiv<Acc1D>(blocks, ntXBl);
     if (useShared)
       // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier
-      // 
-      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-              radixSortMultiWrapper<U, NS>{}, v_d.data(), ind_d.data(), off_d.data(), nullptr));
-//      cms::cuda::launch(
-//          radixSortMultiWrapper<U, NS>, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr);
+      //
+      alpaka::enqueue(queue,
+                      alpaka::createTaskKernel<Acc1D>(workdiv,
+                                                      radixSortMultiWrapper<U, NS>{},
+                                                      v_d.data(),
+                                                      ind_d.data(),
+                                                      off_d.data(),
+                                                      nullptr,
+                                                      MaxSize * sizeof(uint16_t)));
     else
-//      cms::cuda::launch(
-//          radixSortMultiWrapper2<U, NS>, {blocks, ntXBl},             v_d.get(), ind_d.get(), off_d.get(), ws_d.get());
-      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-              radixSortMultiWrapper2<U, NS>{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data()));
+      alpaka::enqueue(
+          queue,
+          alpaka::createTaskKernel<Acc1D>(
+              workdiv, radixSortMultiWrapper2<U, NS>{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data()));
 
-    if (i == 0)
-      std::cout << "done for " << offsets_h[blocks] << std::endl;
+    if (i < 2)
+      std::cout << "launch done for " << offsets_h[blocks] << std::endl;
 
     alpaka::memcpy(queue, ind_h, ind_d);
     alpaka::wait(queue);
-    //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost));
 
     delta += std::chrono::high_resolution_clock::now() - start;
 
-    if (i == 0)
-      std::cout << "done for " << offsets_h[blocks] << std::endl;
+    if (i < 2)
+      std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl;
 
     if (32 == i) {
       std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl;
-      std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl;
+      std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]])
+                << std::endl;
       std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' '
                 << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl;
     }
@@ -194,17 +196,16 @@ void go(Queue & queue, bool useShared) {
         inds.insert(ind_h[offsets_h[ib]]);
       for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) {
         if (inds.count(ind_h[j]) != 0) {
-          printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n",
-                  i, ib, j, ind_h[j]);
+          printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]);
           std::vector<int> counts;
           counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0);
           for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) {
             counts[ind_h[j2]]++;
-          } 
+          }
           for (size_t j2 = 0; j2 < counts.size(); j2++) {
-            if (counts[j2]!=1)
+            if (counts[j2] != 1)
               printf("counts[%ld]=%d ", j2, counts[j2]);
-          } 
+          }
           printf("\n");
           printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j]));
         }
@@ -216,9 +217,12 @@ void go(Queue & queue, bool useShared) {
         truncate<NS>(k1);
         truncate<NS>(k2);
         if (k1 < k2) {
-          std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib]  << ", " << offsets_h[ib + 1] - 1 
-                  << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1
-                  << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl;
+          std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", "
+                    << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j]
+                    << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1
+                    << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl;
+          //sleep(2);
+          assert(false);
         }
       }
       if (!inds.empty()) {
@@ -228,7 +232,7 @@ void go(Queue & queue, bool useShared) {
       if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib]))
         std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib])
                   << std::endl;
-      // 
+      //
       assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib]));
     }
   }  // 50 times
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc
index f4f4719192c12..ce85ad42cb0f4 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc
@@ -10,14 +10,14 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE;
 
 // Kernel running a loop over threads/elements
 // One function with multiple flavors
-enum class RangeType {
-  Default, ExtentLimited, ExtentLimitedWithShift
-};
-  
-enum class LoopScope {
-  Block, Grid
-};
 
+// The type of elements_with_stride
+enum class RangeType { Default, ExtentLimited, ExtentLimitedWithShift };
+
+// The concurrency scope between threads
+enum class LoopScope { Block, Grid };
+
+// Utility for one time initializations
 template <LoopScope loopScope, typename TAcc>
 bool constexpr firstInLoopRange(TAcc const& acc) {
   if constexpr (loopScope == LoopScope::Block)
@@ -30,7 +30,7 @@ bool constexpr firstInLoopRange(TAcc const& acc) {
 template <RangeType rangeType, LoopScope loopScope, typename TAcc>
 size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) {
   if constexpr (rangeType == RangeType::ExtentLimitedWithShift)
-    return shift < size ? size - shift: 0;
+    return shift < size ? size - shift : 0;
   else if constexpr (rangeType == RangeType::ExtentLimited)
     return size;
   else /* rangeType == RangeType::Default */
@@ -41,9 +41,9 @@ size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) {
 }
 
 template <RangeType rangeType, LoopScope loopScope>
-size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) {
+size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shift) {
   if constexpr (rangeType == RangeType::ExtentLimitedWithShift)
-    return shift < size ? size - shift: 0;
+    return shift < size ? size - shift : 0;
   else if constexpr (rangeType == RangeType::ExtentLimited)
     return size;
   else /* rangeType == RangeType::Default */
@@ -53,12 +53,13 @@ size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shi
       return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u];
 }
 
-template<RangeType rangeType, LoopScope loopScope>
+template <RangeType rangeType, LoopScope loopScope>
 struct testWordDivisionDefaultRange {
   template <typename TAcc>
-  ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const {
-    size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar<size_t, __COUNTER__>(acc));
-    // Init the counter for block range. Grid range does so my mean of memset. 
+  ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t shift, size_t* globalCounter) const {
+    size_t& counter =
+        (loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar<size_t, __COUNTER__>(acc));
+    // Init the counter for block range. Grid range does so my mean of memset.
     if constexpr (loopScope == LoopScope::Block) {
       if (firstInLoopRange<loopScope>(acc))
         counter = 0;
@@ -66,13 +67,13 @@ struct testWordDivisionDefaultRange {
     }
     // The loop we are testing
     if constexpr (rangeType == RangeType::Default)
-      for ([[maybe_unused]] auto idx: elements_with_stride(acc))
+      for ([[maybe_unused]] auto idx : elements_with_stride(acc))
         alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{});
     else if constexpr (rangeType == RangeType::ExtentLimited)
-      for ([[maybe_unused]] auto idx: elements_with_stride(acc, size))
+      for ([[maybe_unused]] auto idx : elements_with_stride(acc, size))
         alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{});
     else if constexpr (rangeType == RangeType::ExtentLimitedWithShift)
-      for ([[maybe_unused]]auto  idx: elements_with_stride(acc, size, shift))
+      for ([[maybe_unused]] auto idx : elements_with_stride(acc, shift, size))
         alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{});
     alpaka::syncBlockThreads(acc);
     // Check the result. Grid range will check by memcpy-ing the result.
@@ -103,44 +104,87 @@ int main() {
     ssize_t BlockSize = 512;
     size_t GridSize = 4;
     for (size_t blocks = 1; blocks < GridSize * 3; blocks++)
-      for (auto sizeFuzz: std::initializer_list<ssize_t>{ - BlockSize / 2, -1, 0, 1, BlockSize/2 })
-        for (auto shift: std::initializer_list<ssize_t>{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) {
-         // Grid level iteration: we need to initialize/check at the grid level
+      for (auto sizeFuzz :
+           std::initializer_list<ssize_t>{-10 * BlockSize / 13, -BlockSize / 2, -1, 0, 1, BlockSize / 2})
+        for (auto shift : std::initializer_list<ssize_t>{0,
+                                                         1,
+                                                         BlockSize / 2,
+                                                         BlockSize - 1,
+                                                         BlockSize,
+                                                         BlockSize + 1,
+                                                         BlockSize + BlockSize / 2,
+                                                         2 * BlockSize - 1,
+                                                         2 * BlockSize,
+                                                         2 * BlockSize + 1}) {
+          // Grid level iteration: we need to initialize/check at the grid level
           // Default range
           alpaka::memset(queue, counter_d, 0);
           auto workdiv = make_workdiv<Acc1D>(BlockSize, GridSize);
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::Default, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data()));
+          alpaka::enqueue(
+              queue,
+              alpaka::createTaskKernel<Acc1D>(workdiv,
+                                              testWordDivisionDefaultRange<RangeType::Default, LoopScope::Grid>{},
+                                              blocks * BlockSize + sizeFuzz,
+                                              shift,
+                                              counter_d.data()));
           alpaka::memcpy(queue, counter_h, counter_d);
           alpaka::wait(queue);
-          auto expected = expectedCount<RangeType::Default, LoopScope::Grid>(workdiv, blocks* BlockSize + sizeFuzz, shift);
+          auto expected =
+              expectedCount<RangeType::Default, LoopScope::Grid>(workdiv, blocks * BlockSize + sizeFuzz, shift);
           assert(*counter_h.data() == expected);
 
           // ExtentLimited range
           alpaka::memset(queue, counter_d, 0);
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::ExtentLimited, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data()));
+          alpaka::enqueue(
+              queue,
+              alpaka::createTaskKernel<Acc1D>(workdiv,
+                                              testWordDivisionDefaultRange<RangeType::ExtentLimited, LoopScope::Grid>{},
+                                              blocks * BlockSize + sizeFuzz,
+                                              shift,
+                                              counter_d.data()));
           alpaka::memcpy(queue, counter_h, counter_d);
           alpaka::wait(queue);
-          expected = expectedCount<RangeType::ExtentLimited, LoopScope::Grid>(workdiv, blocks* BlockSize + sizeFuzz, shift);
+          expected =
+              expectedCount<RangeType::ExtentLimited, LoopScope::Grid>(workdiv, blocks * BlockSize + sizeFuzz, shift);
           assert(*counter_h.data() == expected);
 
           // ExtentLimitedWithShift range
           alpaka::memset(queue, counter_d, 0);
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::ExtentLimitedWithShift, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data()));
+          alpaka::enqueue(queue,
+                          alpaka::createTaskKernel<Acc1D>(
+                              workdiv,
+                              testWordDivisionDefaultRange<RangeType::ExtentLimitedWithShift, LoopScope::Grid>{},
+                              blocks * BlockSize + sizeFuzz,
+                              shift,
+                              counter_d.data()));
           alpaka::memcpy(queue, counter_h, counter_d);
           alpaka::wait(queue);
-          expected =  expectedCount<RangeType::ExtentLimitedWithShift, LoopScope::Grid>(workdiv, blocks* BlockSize + sizeFuzz, shift);
+          expected = expectedCount<RangeType::ExtentLimitedWithShift, LoopScope::Grid>(
+              workdiv, blocks * BlockSize + sizeFuzz, shift);
           assert(*counter_h.data() == expected);
 
           // Block level auto tests
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::Default, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data()));
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::ExtentLimited, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data()));
-          alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workdiv, 
-             testWordDivisionDefaultRange<RangeType::ExtentLimitedWithShift, LoopScope::Grid>{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); 
+          alpaka::enqueue(
+              queue,
+              alpaka::createTaskKernel<Acc1D>(workdiv,
+                                              testWordDivisionDefaultRange<RangeType::Default, LoopScope::Grid>{},
+                                              blocks * BlockSize + sizeFuzz,
+                                              shift,
+                                              counter_d.data()));
+          alpaka::enqueue(
+              queue,
+              alpaka::createTaskKernel<Acc1D>(workdiv,
+                                              testWordDivisionDefaultRange<RangeType::ExtentLimited, LoopScope::Grid>{},
+                                              blocks * BlockSize + sizeFuzz,
+                                              shift,
+                                              counter_d.data()));
+          alpaka::enqueue(queue,
+                          alpaka::createTaskKernel<Acc1D>(
+                              workdiv,
+                              testWordDivisionDefaultRange<RangeType::ExtentLimitedWithShift, LoopScope::Grid>{},
+                              blocks * BlockSize + sizeFuzz,
+                              shift,
+                              counter_d.data()));
         }
     alpaka::wait(queue);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
index 416e647c349a4..ee8048431ab6d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl(
     if (threadIdx.x == 0)
       ++p;
     __syncthreads();
-      }
+  }
 
   if ((w != 8) && (0 == (NS & 1)))
     assert(j == ind);  // w/d is even so ind is correct
@@ -261,7 +261,7 @@ namespace cms {
 
     template <typename T, int NS = sizeof(T)>
     __global__ void __launch_bounds__(256, 4)
-      radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
+        radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
       radixSortMulti<T, NS>(v, index, offsets, workspace);
     }
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu
index 6b3ee2e4afa9c..8b6ffd70fd4e6 100644
--- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu
@@ -74,7 +74,7 @@ namespace {
     __syncthreads();
     radixSort<float, 2>(z, order, sws, elements);
     __syncthreads();
-    
+
     //verify
     for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) {
       auto ntrack = order[itrack];
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index 209ce97347e25..a15bed0ae8a1f 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -151,10 +151,15 @@ void go(bool useShared) {
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
     delta -= (std::chrono::high_resolution_clock::now() - start);
+    // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory).
     constexpr int MaxSize = 256 * 32;
     if (useShared)
-      cms::cuda::launch(
-          radixSortMultiWrapper<U, NS>, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr);
+      cms::cuda::launch(radixSortMultiWrapper<U, NS>,
+                        {blocks, ntXBl, MaxSize * 2 /* sizeof(uint16_t)  */},
+                        v_d.get(),
+                        ind_d.get(),
+                        off_d.get(),
+                        nullptr);
     else
       cms::cuda::launch(
           radixSortMultiWrapper2<U, NS>, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());