
Fix DeviceHistogram::Even for mixed float/int levels and sample types.
Fixes #479 and #489.
alliepiper committed Aug 5, 2022
1 parent 80f5878 commit ff1b254
Showing 3 changed files with 235 additions and 135 deletions.
12 changes: 12 additions & 0 deletions cub/device/device_histogram.cuh
@@ -77,6 +77,9 @@ struct DeviceHistogram
* - The ranges `[d_samples, d_samples + num_samples)` and
* `[d_histogram, d_histogram + num_levels - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -247,6 +250,9 @@ struct DeviceHistogram
* `row_end = row_begin + num_row_samples`. The ranges
* `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)`
* shall not overlap in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -432,6 +438,9 @@ struct DeviceHistogram
* `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and
* `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -633,6 +642,9 @@ struct DeviceHistogram
* `[sample_begin, sample_end)` and
* `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
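The requirement added in each of the overloads touched above is what this commit enforces for mixed level/sample types (see #479 and #489). As a rough illustration, the sketch below bins `int` samples against `float` levels, for which the common type is `float`, which is convertible to `int` and trivially copyable, so the new requirement is satisfied. The sizes and level values are hypothetical and the snippet is not part of this diff; it only shows the standard two-phase CUB call pattern with mixed types.

    // Sketch: integer samples, floating-point levels (hypothetical values).
    #include <cub/device/device_histogram.cuh>
    #include <cuda_runtime.h>

    void histogram_mixed_types(const int *d_samples, int num_samples)
    {
      int   num_levels  = 7;      // 7 boundaries -> 6 bins
      float lower_level = 0.0f;   // LevelT = float, SampleT = int
      float upper_level = 12.0f;
      int  *d_histogram = nullptr;
      cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));

      // First call queries temporary storage size; second call runs the histogram.
      void  *d_temp_storage     = nullptr;
      size_t temp_storage_bytes = 0;
      cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                          d_samples, d_histogram, num_levels,
                                          lower_level, upper_level, num_samples);
      cudaMalloc(&d_temp_storage, temp_storage_bytes);
      cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                          d_samples, d_histogram, num_levels,
                                          lower_level, upper_level, num_samples);
    }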
223 changes: 104 additions & 119 deletions cub/device/dispatch/dispatch_histogram.cuh
@@ -36,15 +36,19 @@

#include <cub/agent/agent_histogram.cuh>
#include <cub/config.cuh>
#include <cub/detail/cpp_compatibility.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/type_traits>

#include <nv/target>

#include <cstdio>
@@ -176,25 +180,6 @@ template <
typename OffsetT> ///< Signed integer type for global offsets
struct DispatchHistogram
{
private:
template <class T>
CUB_RUNTIME_FUNCTION
static T ComputeScale(T lower_level, T upper_level, int bins)
{
return static_cast<T>(upper_level - lower_level) / bins;
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
// There are no host versions of arithmetic operations on `__half`, so
// all arithmetic operations on host shall be done on `float`
CUB_RUNTIME_FUNCTION
static __half ComputeScale(__half lower_level, __half upper_level, int bins)
{
return __float2half(
(__half2float(upper_level) - __half2float(lower_level)) / bins);
}
#endif

public:
//---------------------------------------------------------------------
// Types and constants
@@ -258,113 +243,116 @@ public:
// Scales samples to evenly-spaced bins
struct ScaleTransform
{
int num_bins; // Number of levels in array
LevelT max; // Max sample level (exclusive)
LevelT min; // Min sample level (inclusive)
LevelT scale; // Bin scaling factor
private:
using CommonT = cuda::std::common_type_t<LevelT, SampleT>;
static_assert(cuda::std::is_convertible<CommonT, int>::value,
"The common type of `LevelT` and `SampleT` must be "
"convertible to `int`.");
static_assert(cuda::std::is_trivially_copyable<CommonT>::value,
"The common type of `LevelT` and `SampleT` must be "
"trivially copyable.");

static constexpr bool COMMON_T_IS_BUILTIN_FP =
cuda::std::is_floating_point<CommonT>::value;

    // The `is_floating_point` trait is only true for builtin floating-point
    // types; it evaluates to false for `__half`.
static constexpr bool COMMON_T_IS_HALF_T =
#if defined(__CUDA_FP16_TYPES_EXIST__)
cuda::std::is_same<CommonT, __half>::value;
#else
false;
#endif

// Initializer
template <typename _LevelT>
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
_LevelT max_, // Max sample level (exclusive)
_LevelT min_, // Min sample level (inclusive)
_LevelT scale_) // Bin scaling factor
union ScaleT
{
// Used when CommonT is not floating-point to avoid intermediate
// rounding errors (see NVIDIA/cub#489).
struct FractionT
{
CommonT bins;
CommonT range;
} fraction;

// Used when CommonT is floating-point as an optimization.
CommonT reciprocal;
};

CommonT m_max; // Max sample level (exclusive)
CommonT m_min; // Min sample level (inclusive)
ScaleT m_scale; // Bin scaling

public:
// Initializer
__host__ __device__ __forceinline__ void Init(int num_levels,
LevelT max_,
LevelT min_)
{
m_max = static_cast<CommonT>(max_);
m_min = static_cast<CommonT>(min_);

// Compute scale. Optimize for floating point types by pre-computing
// the reciprocal.
CUB_IF_CONSTEXPR(COMMON_T_IS_BUILTIN_FP)
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = scale_;
m_scale.reciprocal = static_cast<CommonT>(
(num_levels - 1) / (m_max - m_min));
}

// Initializer (float specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
float max_, // Max sample level (exclusive)
float min_, // Min sample level (inclusive)
float scale_) // Bin scaling factor
#if defined(__CUDA_FP16_TYPES_EXIST__)
CUB_ELSE_IF_CONSTEXPR(COMMON_T_IS_HALF_T)
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = float(1.0) / scale_;
// There are no host versions of arithmetic operations on `__half`, so
// all arithmetic operations on host shall be done on `float`.
// This is only done during initialization, and should not noticeably
// impact performance vs. using fp16 operators.
m_scale.reciprocal = __float2half(
(num_levels - 1) / (__half2float(m_max) - __half2float(m_min)));
}

// Initializer (double specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
double max_, // Max sample level (exclusive)
double min_, // Min sample level (inclusive)
double scale_) // Bin scaling factor
#endif
else
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = double(1.0) / scale_;
m_scale.fraction.bins = static_cast<CommonT>(num_levels - 1);
m_scale.fraction.range = static_cast<CommonT>(m_max - m_min);
}
}

template <typename T>
static __device__ __forceinline__ void
BinSelectImpl(T sample, T min, T max, T scale, int &bin, bool valid)
// Method for converting samples to bin-ids
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(SampleT sample,
int &bin,
bool valid)
{
CommonT common_sample = static_cast<CommonT>(sample);

if (valid && (common_sample >= m_min) && (common_sample < m_max))
{
if (valid && (sample >= min) && (sample < max))
CUB_IF_CONSTEXPR(COMMON_T_IS_BUILTIN_FP)
{
bin = static_cast<int>((sample - min) / scale);
bin =
static_cast<int>((common_sample - m_min) * m_scale.reciprocal);
}
}

// Method for converting samples to bin-ids
template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
__host__ __device__ __forceinline__ void BinSelect(_SampleT sample,
int &bin,
bool valid)
{
BinSelectImpl(static_cast<LevelT>(sample),
min,
max,
scale,
bin,
valid);
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
template <CacheLoadModifier LOAD_MODIFIER>
__device__ __forceinline__ void BinSelect(__half sample, int &bin, bool valid)
{
NV_IF_TARGET(NV_PROVIDES_SM_53,
(BinSelectImpl<__half>(sample,
min, max, scale,
bin, valid);),
(BinSelectImpl<float>(__half2float(sample),
__half2float(min),
__half2float(max),
__half2float(scale),
bin, valid);));
}
CUB_ELSE_IF_CONSTEXPR(COMMON_T_IS_HALF_T)
{
NV_IF_TARGET(
NV_PROVIDES_SM_53,
(bin = static_cast<int>((common_sample - m_min) *
m_scale.reciprocal);),
(bin = static_cast<int>(
(__half2float(common_sample) - __half2float(m_min)) *
__half2float(m_scale.reciprocal));));
}
#endif

// Method for converting samples to bin-ids (float specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
}

// Method for converting samples to bin-ids (double specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
else
{
bin = static_cast<int>(
((common_sample - m_min) * m_scale.fraction.bins) /
m_scale.fraction.range);
}
}
}
};
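To see why the `FractionT` path matters for integral common types, note that the old code pre-computed a per-bin scale as range / bins and then divided each sample by it; with integer arithmetic that division truncates, so samples can be assigned to bins past the end of the histogram (NVIDIA/cub#489). Keeping the bin count and range separate and dividing last avoids the intermediate rounding. The standalone snippet below, with hypothetical values, contrasts the two formulations; it is an illustration only, not CUB code.

    // Hypothetical integer levels: 8 bins over [0, 12).
    #include <cstdio>

    int main()
    {
      const int lower = 0, upper = 12, bins = 8;
      const int sample = 11;  // should land in bin 11 * 8 / 12 = 7

      // Old formulation: scale truncates to (12 - 0) / 8 == 1, so the sample
      // maps to bin 11, past the last valid bin index (7).
      int scale   = (upper - lower) / bins;
      int old_bin = (sample - lower) / scale;

      // New formulation: multiply by the bin count first, divide last.
      int new_bin = (sample - lower) * bins / (upper - lower);

      std::printf("old_bin = %d, new_bin = %d\n", old_bin, new_bin);  // 11 vs 7
      return 0;
    }

For builtin floating-point common types the `reciprocal` member is used instead, trading the per-sample division for a single multiply, which matches the previous behavior for float and double levels.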


// Pass-through bin transform operator
struct PassThruTransform
{
@@ -377,8 +365,6 @@ public:
}
};



//---------------------------------------------------------------------
// Tuning policies
//---------------------------------------------------------------------
@@ -1016,10 +1002,9 @@ public:

for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int bins = num_output_levels[channel] - 1;
LevelT scale = ComputeScale(lower_level[channel], upper_level[channel], bins);

privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
privatized_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];
@@ -1157,9 +1142,9 @@ public:
{
num_privatized_levels[channel] = 257;

int bins = num_output_levels[channel] - 1;
LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
output_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];