diff --git a/cub/device/device_histogram.cuh b/cub/device/device_histogram.cuh
index 862a570028..d97dfd2e45 100644
--- a/cub/device/device_histogram.cuh
+++ b/cub/device/device_histogram.cuh
@@ -77,6 +77,9 @@ struct DeviceHistogram
  * - The ranges `[d_samples, d_samples + num_samples)` and
  *   `[d_histogram, d_histogram + num_levels - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -247,6 +250,9 @@
  *   `row_end = row_begin + num_row_samples`. The ranges
  *   `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)`
  *   shall not overlap in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -432,6 +438,9 @@
  *   `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and
  *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -633,6 +642,9 @@
  *   `[sample_begin, sample_end)` and
  *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
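Aside (not part of the diff): the precondition documented above can be checked with the same cuda::std traits the implementation uses. A minimal illustrative sketch; histogram_types_ok is a hypothetical helper, not a CUB API:

#include <cuda/std/type_traits>

// Hypothetical compile-time mirror of the documented precondition: the
// common type of LevelT and SampleT must exist, be convertible to `int`,
// and be trivially copyable.
template <typename LevelT, typename SampleT>
struct histogram_types_ok
{
  using common_t = typename cuda::std::common_type<LevelT, SampleT>::type;

  static constexpr bool value =
    cuda::std::is_convertible<common_t, int>::value &&
    cuda::std::is_trivially_copyable<common_t>::value;
};

// Mixed float samples with int levels (NVIDIA/cub#479) satisfy the
// precondition, since common_type<int, float> is float.
static_assert(histogram_types_ok<int, float>::value, "ok: common type float");
static_assert(histogram_types_ok<int, int>::value, "ok: common type int");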
diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh
index a22bfa2ca9..00e04efd83 100644
--- a/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/device/dispatch/dispatch_histogram.cuh
@@ -36,15 +36,19 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include <cuda/std/type_traits>
 #include 
+#include <nv/target>
+
 #include 
 #include 
@@ -176,25 +180,6 @@ template <
     typename OffsetT>                       ///< Signed integer type for global offsets
 struct DispatchHistogram
 {
-private:
-    template <typename T>
-    CUB_RUNTIME_FUNCTION
-    static T ComputeScale(T lower_level, T upper_level, int bins)
-    {
-        return static_cast<T>(upper_level - lower_level) / bins;
-    }
-
-#if defined(__CUDA_FP16_TYPES_EXIST__)
-    // There are no host versions of arithmetic operations on `__half`, so
-    // all arithmetic operations on host shall be done on `float`
-    CUB_RUNTIME_FUNCTION
-    static __half ComputeScale(__half lower_level, __half upper_level, int bins)
-    {
-        return __float2half(
-            (__half2float(upper_level) - __half2float(lower_level)) / bins);
-    }
-#endif
-
 public:
     //---------------------------------------------------------------------
     // Types and constants
     //---------------------------------------------------------------------
@@ -254,117 +239,187 @@ public:
         }
     };
 
-
     // Scales samples to evenly-spaced bins
     struct ScaleTransform
     {
-        int    num_bins;    // Number of levels in array
-        LevelT max;         // Max sample level (exclusive)
-        LevelT min;         // Min sample level (inclusive)
-        LevelT scale;       // Bin scaling factor
-
-        // Initializer
-        template <typename _LevelT>
-        __host__ __device__ __forceinline__ void Init(
-            int     num_output_levels,  // Number of levels in array
-            _LevelT max_,               // Max sample level (exclusive)
-            _LevelT min_,               // Min sample level (inclusive)
-            _LevelT scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = scale_;
-        }
-
-        // Initializer (float specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int   num_output_levels,  // Number of levels in array
-            float max_,               // Max sample level (exclusive)
-            float min_,               // Min sample level (inclusive)
-            float scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = float(1.0) / scale_;
-        }
-
-        // Initializer (double specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,  // Number of levels in array
-            double max_,               // Max sample level (exclusive)
-            double min_,               // Min sample level (inclusive)
-            double scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = double(1.0) / scale_;
-        }
-
-        template <typename T>
-        static __device__ __forceinline__ void
-        BinSelectImpl(T sample, T min, T max, T scale, int &bin, bool valid)
+    private:
+      using CommonT = typename cuda::std::common_type<LevelT, SampleT>::type;
+      static_assert(cuda::std::is_convertible<CommonT, int>::value,
+                    "The common type of `LevelT` and `SampleT` must be "
+                    "convertible to `int`.");
+      static_assert(cuda::std::is_trivially_copyable<CommonT>::value,
+                    "The common type of `LevelT` and `SampleT` must be "
+                    "trivially copyable.");
+
+      union ScaleT
+      {
+        // Used when CommonT is not floating-point to avoid intermediate
+        // rounding errors (see NVIDIA/cub#489).
+        struct FractionT
         {
-            if (valid && (sample >= min) && (sample < max))
-            {
-                bin = static_cast<int>((sample - min) / scale);
-            }
-        }
+          CommonT bins;
+          CommonT range;
+        } fraction;
+
+        // Used when CommonT is floating-point as an optimization.
+        CommonT reciprocal;
+      };
+
+      CommonT m_max;   // Max sample level (exclusive)
+      CommonT m_min;   // Min sample level (inclusive)
+      ScaleT m_scale;  // Bin scaling
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level,
+                          cuda::std::true_type /* is_fp */)
+      {
+        ScaleT result;
+        result.reciprocal =
+          static_cast<T>(static_cast<T>(num_levels - 1) /
+                         static_cast<T>(max_level - min_level));
+        return result;
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level,
+                          cuda::std::false_type /* is_fp */)
+      {
+        ScaleT result;
+        result.fraction.bins  = static_cast<T>(num_levels - 1);
+        result.fraction.range = static_cast<T>(max_level - min_level);
+        return result;
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level)
+      {
+        return this->ComputeScale(num_levels,
+                                  max_level,
+                                  min_level,
+                                  cuda::std::is_floating_point<T>{});
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          __half max_level,
+                          __half min_level)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return this->ComputeScale(num_levels,
+                                                max_level,
+                                                min_level,
+                                                cuda::std::true_type{});),
+                     (return this->ComputeScale(num_levels,
+                                                __half2float(max_level),
+                                                __half2float(min_level),
+                                                cuda::std::true_type{});));
+      }
+#endif
 
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample,
-                                                           int &bin,
-                                                           bool valid)
-        {
-            BinSelectImpl(static_cast<LevelT>(sample),
-                          min,
-                          max,
-                          scale,
-                          bin,
-                          valid);
-        }
+      // All types but __half:
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int SampleIsValid(T sample, T max_level, T min_level)
+      {
+        return sample >= min_level && sample < max_level;
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      int SampleIsValid(__half sample, __half max_level, __half min_level)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return sample >= min_level && sample < max_level;),
+                     (return this->SampleIsValid(__half2float(sample),
+                                                 __half2float(max_level),
+                                                 __half2float(min_level));));
+      }
+#endif
 
-#if defined(__CUDA_FP16_TYPES_EXIST__)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __device__ __forceinline__ void BinSelect(__half sample, int &bin, bool valid)
-        {
-            NV_IF_TARGET(NV_PROVIDES_SM_53,
-                         (BinSelectImpl<__half>(sample,
-                                                min, max, scale,
-                                                bin, valid);),
-                         (BinSelectImpl<float>(__half2float(sample),
-                                               __half2float(min),
-                                               __half2float(max),
-                                               __half2float(scale),
-                                               bin, valid);));
-        }
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample,
+                     T min_level,
+                     ScaleT scale,
+                     cuda::std::true_type /* is_fp */)
+      {
+        return static_cast<int>((sample - min_level) * scale.reciprocal);
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample,
+                     T min_level,
+                     ScaleT scale,
+                     cuda::std::false_type /* is_fp */)
+      {
+        return static_cast<int>(((sample - min_level) * scale.fraction.bins) /
+                                scale.fraction.range);
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample, T min_level, ScaleT scale)
+      {
+        return this->ComputeBin(sample,
+                                min_level,
+                                scale,
+                                cuda::std::is_floating_point<T>{});
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      int ComputeBin(__half sample, __half min_level, ScaleT scale)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return this->ComputeBin(sample,
+                                              min_level,
+                                              scale,
+                                              cuda::std::true_type{});),
+                     (return static_cast<int>((__half2float(sample) -
+                                               __half2float(min_level)) *
+                                              __half2float(scale.reciprocal));));
+      }
+#endif
 
-        // Method for converting samples to bin-ids (float specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
+    public:
+
+      // Initializer
+      __host__ __device__ __forceinline__ void Init(int num_levels,
+                                                    LevelT max_level,
+                                                    LevelT min_level)
+      {
+        m_max = static_cast<CommonT>(max_level);
+        m_min = static_cast<CommonT>(min_level);
+        m_scale = this->ComputeScale(num_levels, m_max, m_min);
+      }
+
+      // Method for converting samples to bin-ids
+      template <CacheLoadModifier LOAD_MODIFIER>
+      __host__ __device__ __forceinline__ void BinSelect(SampleT sample,
+                                                         int &bin,
+                                                         bool valid)
+      {
+        const CommonT common_sample = static_cast<CommonT>(sample);
+
+        if (valid && this->SampleIsValid(common_sample, m_max, m_min))
         {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
+          bin = this->ComputeBin(common_sample, m_min, m_scale);
         }
+      }
 
-        // Method for converting samples to bin-ids (double specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
     };
 
-
     // Pass-through bin transform operator
     struct PassThruTransform
     {
@@ -377,8 +432,6 @@
         }
     };
 
-
-
     //---------------------------------------------------------------------
     // Tuning policies
     //---------------------------------------------------------------------
@@ -1016,10 +1069,9 @@
 
         for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
         {
-            int bins = num_output_levels[channel] - 1;
-            LevelT scale = ComputeScale(lower_level[channel], upper_level[channel], bins);
-
-            privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+            privatized_decode_op[channel].Init(num_output_levels[channel],
+                                               upper_level[channel],
+                                               lower_level[channel]);
 
             if (num_output_levels[channel] > max_levels)
                 max_levels = num_output_levels[channel];
@@ -1157,9 +1209,9 @@
         {
             num_privatized_levels[channel] = 257;
 
-            int bins = num_output_levels[channel] - 1;
-            LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
-            output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+            output_decode_op[channel].Init(num_output_levels[channel],
+                                           upper_level[channel],
+                                           lower_level[channel]);
 
             if (num_output_levels[channel] > max_levels)
                 max_levels = num_output_levels[channel];
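Aside (not part of the diff): the FractionT branch above exists because the removed code precomputed an integral scale once and divided by it, which truncates. With the values from NVIDIA/cub#489 (levels [0, 12), 7 bins), the old bin width truncated from about 1.71 down to 1. A host-only sketch of the two formulas; old_bin and new_bin are hypothetical stand-ins for the removed and added computations:

#include <cstdio>

// Removed formula: precompute an integral bin width, then divide.
// For levels [0, 12) and 7 bins, (12 - 0) / 7 truncates to 1.
int old_bin(int sample, int lower, int upper, int bins)
{
  const int scale = (upper - lower) / bins;
  return (sample - lower) / scale;
}

// Added formula (the fraction path): multiply by the bin count first and
// divide by the full range once, so nothing is rounded prematurely.
int new_bin(int sample, int lower, int upper, int bins)
{
  return ((sample - lower) * bins) / (upper - lower);
}

int main()
{
  for (int sample : {0, 2, 3, 6, 7})
  {
    std::printf("sample %d -> old bin %d, new bin %d\n",
                sample,
                old_bin(sample, 0, 12, 7),
                new_bin(sample, 0, 12, 7));
  }
  // sample 6 -> old bin 6 (wrong), new bin 3 (6 / (12 / 7.0) is 3.5).
  return 0;
}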
diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu
index 3831e5ed51..d93dd70d32 100644
--- a/test/test_device_histogram.cu
+++ b/test/test_device_histogram.cu
@@ -639,7 +639,10 @@ void TestEven(
     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
 
     printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+    printf("%s cub::DeviceHistogram::Even (%s) "
+           "%d pixels (%d height, %d width, %d-byte row stride), "
+           "%d %d-byte %s samples (entropy reduction %d), "
+           "%s levels, %s counters, %d/%d channels, max sample ",
         (BACKEND == CDP) ? "CDP CUB" : "CUB",
         (std::is_pointer<SampleIteratorT>::value) ? "pointer" : "iterator",
         (int) (num_row_pixels * num_rows),
         (int) num_rows,
         (int) num_row_pixels,
         (int) row_stride_bytes,
@@ -650,6 +653,7 @@ void TestEven(
         (int) sizeof(SampleT),
         typeid(SampleT).name(),
         entropy_reduction,
+        typeid(LevelT).name(),
         typeid(CounterT).name(),
         NUM_ACTIVE_CHANNELS,
         NUM_CHANNELS);
@@ -726,8 +730,20 @@ void TestEven(
         num_row_pixels, num_rows, row_stride_bytes);
 
     // Check canary zones
+    if (g_verbose)
+    {
+        printf("Checking leading temp_storage canary zone (token = %d)\n"
+               "------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
     int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
     AssertEquals(0, error);
+    if (g_verbose)
+    {
+        printf("Checking trailing temp_storage canary zone (token = %d)\n"
+               "-------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
     error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
     AssertEquals(0, error);
@@ -740,6 +756,12 @@ void TestEven(
     // Check for correctness (and display results, if specified)
     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
     {
+        if (g_verbose)
+        {
+            printf("Checking histogram result (channel = %d)\n"
+                   "----------------------------------------\n",
+                   channel);
+        }
         int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
         printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
         error |= channel_error;
@@ -913,19 +935,23 @@ void TestRange(
     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
 
     printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
"CDP CUB" : "CUB", - (int) (num_row_pixels * num_rows), - (int) num_rows, - (int) num_row_pixels, - (int) row_stride_bytes, - (int) total_samples, - (int) sizeof(SampleT), - typeid(SampleT).name(), - entropy_reduction, - typeid(CounterT).name(), - NUM_ACTIVE_CHANNELS, - NUM_CHANNELS); + printf("%s cub::DeviceHistogram::Range %d pixels " + "(%d height, %d width, %d-byte row stride), " + "%d %d-byte %s samples (entropy reduction %d), " + "%s levels, %s counters, %d/%d channels, max sample ", + (BACKEND == CDP) ? "CDP CUB" : "CUB", + (int)(num_row_pixels * num_rows), + (int)num_rows, + (int)num_row_pixels, + (int)row_stride_bytes, + (int)total_samples, + (int)sizeof(SampleT), + typeid(SampleT).name(), + entropy_reduction, + typeid(LevelT).name(), + typeid(CounterT).name(), + NUM_ACTIVE_CHANNELS, + NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { @@ -1263,9 +1289,13 @@ void Test( LevelT max_level, int max_num_levels) { + // entropy_reduction = -1 -> all samples == 0 Test( num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels); + Test( + num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels); + Test( num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels); } @@ -1319,6 +1349,10 @@ void Test( Test( OffsetT(0), OffsetT(0), max_level, max_num_levels); + // Small inputs + Test( + OffsetT(15), OffsetT(1), max_level, max_num_levels); + // 1080 image Test( OffsetT(1920), OffsetT(1080), max_level, max_num_levels); @@ -1442,6 +1476,72 @@ void TestLevelsAliasing() CubDebugExit(g_allocator.DeviceFree(d_levels)); } +// Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect +// bin detection: +void TestIntegerBinCalcs() +{ + constexpr int num_levels = 8; + constexpr int num_bins = num_levels - 1; + + int h_histogram[num_bins]{}; + const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0}; + const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999}; + const int lower_level = 0; + const int upper_level = 12; + + constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); + + int *d_histogram{}; + int *d_samples{}; + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); + + CubDebugExit( + cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_samples)); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); + + CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_samples)); + + CubDebugExit(cudaMemcpy(h_histogram, + d_histogram, + sizeof(h_histogram), + cudaMemcpyDeviceToHost)); + + for (int bin = 0; bin < num_bins; ++bin) + { + AssertEquals(h_histogram_ref[bin], h_histogram[bin]); + } + + CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); + CubDebugExit(g_allocator.DeviceFree(d_histogram)); + CubDebugExit(g_allocator.DeviceFree(d_samples)); +} + //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- @@ -1477,8 +1577,9 @@ 
@@ -1477,8 +1577,9 @@ int main(int argc, char** argv)
     using false_t = Int2Type<false>;
 
     TestLevelsAliasing();
+    TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489
 
-#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA
+#if TEST_HALF_T
     TestChannels(256, 256 + 1, true_t{}, true_t{});
 #endif
 
@@ -1489,9 +1590,11 @@ int main(int argc, char** argv)
     TestChannels (1.0, 256 + 1, true_t{}, false_t{});
 #endif
 
+    // float samples, int levels, regression test for NVIDIA/cub#479.
+    TestChannels (12, 7, true_t{}, true_t{});
+
     // Test down-conversion of size_t offsets to int
     TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{});
 
     return 0;
 }
-
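Aside (not part of the diff): with this fix in place, the mixed-type case from NVIDIA/cub#479, float samples binned against int levels, works through the public API. A minimal usage sketch with illustrative values; CUDA error checking is omitted for brevity:

#include <cub/device/device_histogram.cuh>

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>

int main()
{
  // float samples with int levels: the NVIDIA/cub#479 combination.
  const float h_samples[]{0.5f, 3.1f, 6.9f, 11.0f};
  const int num_samples = 4;
  const int num_levels  = 7;  // 6 evenly-spaced bins over [0, 12)
  const int lower_level = 0;
  const int upper_level = 12;

  float *d_samples{};
  int *d_histogram{};
  cudaMalloc(&d_samples, sizeof(h_samples));
  cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));
  cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

  // Standard two-phase CUB pattern: query temp storage size, then run.
  void *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                      d_samples, d_histogram, num_levels,
                                      lower_level, upper_level, num_samples);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                      d_samples, d_histogram, num_levels,
                                      lower_level, upper_level, num_samples);

  int h_histogram[6]{};
  cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram),
             cudaMemcpyDeviceToHost);
  for (int bin = 0; bin < 6; ++bin)
  {
    std::printf("bin %d: %d\n", bin, h_histogram[bin]);
  }

  cudaFree(d_temp_storage);
  cudaFree(d_histogram);
  cudaFree(d_samples);
  return 0;
}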