
Fix DeviceHistogram::Even for mixed float/int levels and sample types.
Fixes #479 and #489.
alliepiper committed Aug 5, 2022
1 parent 80f5878 commit ff1b254
Showing 3 changed files with 235 additions and 135 deletions.
12 changes: 12 additions & 0 deletions cub/device/device_histogram.cuh
@@ -77,6 +77,9 @@ struct DeviceHistogram
* - The ranges `[d_samples, d_samples + num_samples)` and
* `[d_histogram, d_histogram + num_levels - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -247,6 +250,9 @@ struct DeviceHistogram
* `row_end = row_begin + num_row_samples`. The ranges
* `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)`
* shall not overlap in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -432,6 +438,9 @@ struct DeviceHistogram
* `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and
* `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
@@ -633,6 +642,9 @@ struct DeviceHistogram
* `[sample_begin, sample_end)` and
* `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
* in any way.
* - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
* and SampleT must be valid arithmetic types. The common type must be
* convertible to `int` and trivially copyable.
* - @devicestorage
*
* @par Snippet
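The requirement added in each of the overloads touched above is what this commit enforces for mixed level/sample types (see #479 and #489). As a rough illustration, the sketch below bins `int` samples against `float` levels, for which the common type is `float`, which is convertible to `int` and trivially copyable, so the new requirement is satisfied. The sizes and level values are hypothetical and the snippet is not part of this diff; it only shows the standard two-phase CUB call pattern with mixed types.

    // Sketch: integer samples, floating-point levels (hypothetical values).
    #include <cub/device/device_histogram.cuh>
    #include <cuda_runtime.h>

    void histogram_mixed_types(const int *d_samples, int num_samples)
    {
      int   num_levels  = 7;      // 7 boundaries -> 6 bins
      float lower_level = 0.0f;   // LevelT = float, SampleT = int
      float upper_level = 12.0f;
      int  *d_histogram = nullptr;
      cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));

      // First call queries temporary storage size; second call runs the histogram.
      void  *d_temp_storage     = nullptr;
      size_t temp_storage_bytes = 0;
      cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                          d_samples, d_histogram, num_levels,
                                          lower_level, upper_level, num_samples);
      cudaMalloc(&d_temp_storage, temp_storage_bytes);
      cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                          d_samples, d_histogram, num_levels,
                                          lower_level, upper_level, num_samples);
    }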
223 changes: 104 additions & 119 deletions cub/device/dispatch/dispatch_histogram.cuh
@@ -36,15 +36,19 @@

#include <cub/agent/agent_histogram.cuh>
#include <cub/config.cuh>
#include <cub/detail/cpp_compatibility.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/type_traits>

#include <nv/target>

#include <cstdio>
@@ -176,25 +180,6 @@ template <
typename OffsetT> ///< Signed integer type for global offsets
struct DispatchHistogram
{
private:
template <class T>
CUB_RUNTIME_FUNCTION
static T ComputeScale(T lower_level, T upper_level, int bins)
{
return static_cast<T>(upper_level - lower_level) / bins;
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
// There are no host versions of arithmetic operations on `__half`, so
// all arithmetic operations on host shall be done on `float`
CUB_RUNTIME_FUNCTION
static __half ComputeScale(__half lower_level, __half upper_level, int bins)
{
return __float2half(
(__half2float(upper_level) - __half2float(lower_level)) / bins);
}
#endif

public:
//---------------------------------------------------------------------
// Types and constants
@@ -258,113 +243,116 @@ public:
// Scales samples to evenly-spaced bins
struct ScaleTransform
{
int num_bins; // Number of levels in array
LevelT max; // Max sample level (exclusive)
LevelT min; // Min sample level (inclusive)
LevelT scale; // Bin scaling factor
private:
using CommonT = cuda::std::common_type_t<LevelT, SampleT>;
static_assert(cuda::std::is_convertible<CommonT, int>::value,
"The common type of `LevelT` and `SampleT` must be "
"convertible to `int`.");
static_assert(cuda::std::is_trivially_copyable<CommonT>::value,
"The common type of `LevelT` and `SampleT` must be "
"trivially copyable.");

static constexpr bool COMMON_T_IS_BUILTIN_FP =
cuda::std::is_floating_point<CommonT>::value;

    // The `is_floating_point` trait is only true for builtin floating-point
    // types; it evaluates to false for `__half`.
static constexpr bool COMMON_T_IS_HALF_T =
#if defined(__CUDA_FP16_TYPES_EXIST__)
cuda::std::is_same<CommonT, __half>::value;
#else
false;
#endif

// Initializer
template <typename _LevelT>
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
_LevelT max_, // Max sample level (exclusive)
_LevelT min_, // Min sample level (inclusive)
_LevelT scale_) // Bin scaling factor
union ScaleT
{
// Used when CommonT is not floating-point to avoid intermediate
// rounding errors (see NVIDIA/cub#489).
struct FractionT
{
CommonT bins;
CommonT range;
} fraction;

// Used when CommonT is floating-point as an optimization.
CommonT reciprocal;
};

CommonT m_max; // Max sample level (exclusive)
CommonT m_min; // Min sample level (inclusive)
ScaleT m_scale; // Bin scaling

public:
// Initializer
__host__ __device__ __forceinline__ void Init(int num_levels,
LevelT max_,
LevelT min_)
{
m_max = static_cast<CommonT>(max_);
m_min = static_cast<CommonT>(min_);

// Compute scale. Optimize for floating point types by pre-computing
// the reciprocal.
CUB_IF_CONSTEXPR(COMMON_T_IS_BUILTIN_FP)
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = scale_;
m_scale.reciprocal = static_cast<CommonT>(
(num_levels - 1) / (m_max - m_min));
}

// Initializer (float specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
float max_, // Max sample level (exclusive)
float min_, // Min sample level (inclusive)
float scale_) // Bin scaling factor
#if defined(__CUDA_FP16_TYPES_EXIST__)
CUB_ELSE_IF_CONSTEXPR(COMMON_T_IS_HALF_T)
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = float(1.0) / scale_;
// There are no host versions of arithmetic operations on `__half`, so
// all arithmetic operations on host shall be done on `float`.
// This is only done during initialization, and should not noticeably
// impact performance vs. using fp16 operators.
m_scale.reciprocal = __float2half(
(num_levels - 1) / (__half2float(m_max) - __half2float(m_min)));
}

// Initializer (double specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
double max_, // Max sample level (exclusive)
double min_, // Min sample level (inclusive)
double scale_) // Bin scaling factor
#endif
else
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = double(1.0) / scale_;
m_scale.fraction.bins = static_cast<CommonT>(num_levels - 1);
m_scale.fraction.range = static_cast<CommonT>(m_max - m_min);
}
}

template <typename T>
static __device__ __forceinline__ void
BinSelectImpl(T sample, T min, T max, T scale, int &bin, bool valid)
// Method for converting samples to bin-ids
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(SampleT sample,
int &bin,
bool valid)
{
CommonT common_sample = static_cast<CommonT>(sample);

if (valid && (common_sample >= m_min) && (common_sample < m_max))
{
if (valid && (sample >= min) && (sample < max))
CUB_IF_CONSTEXPR(COMMON_T_IS_BUILTIN_FP)
{
bin = static_cast<int>((sample - min) / scale);
bin =
static_cast<int>((common_sample - m_min) * m_scale.reciprocal);
}
}

// Method for converting samples to bin-ids
template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
__host__ __device__ __forceinline__ void BinSelect(_SampleT sample,
int &bin,
bool valid)
{
BinSelectImpl(static_cast<LevelT>(sample),
min,
max,
scale,
bin,
valid);
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
template <CacheLoadModifier LOAD_MODIFIER>
__device__ __forceinline__ void BinSelect(__half sample, int &bin, bool valid)
{
NV_IF_TARGET(NV_PROVIDES_SM_53,
(BinSelectImpl<__half>(sample,
min, max, scale,
bin, valid);),
(BinSelectImpl<float>(__half2float(sample),
__half2float(min),
__half2float(max),
__half2float(scale),
bin, valid);));
}
CUB_ELSE_IF_CONSTEXPR(COMMON_T_IS_HALF_T)
{
NV_IF_TARGET(
NV_PROVIDES_SM_53,
(bin = static_cast<int>((common_sample - m_min) *
m_scale.reciprocal);),
(bin = static_cast<int>(
(__half2float(common_sample) - __half2float(m_min)) *
__half2float(m_scale.reciprocal));));
}
#endif

// Method for converting samples to bin-ids (float specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
}

// Method for converting samples to bin-ids (double specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
else
{
bin = static_cast<int>(
((common_sample - m_min) * m_scale.fraction.bins) /
m_scale.fraction.range);
}
}
}
};
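To see why the `FractionT` path matters for integral common types, note that the old code pre-computed a per-bin scale as range / bins and then divided each sample by it; with integer arithmetic that division truncates, so samples can be assigned to bins past the end of the histogram (NVIDIA/cub#489). Keeping the bin count and range separate and dividing last avoids the intermediate rounding. The standalone snippet below, with hypothetical values, contrasts the two formulations; it is an illustration only, not CUB code.

    // Hypothetical integer levels: 8 bins over [0, 12).
    #include <cstdio>

    int main()
    {
      const int lower = 0, upper = 12, bins = 8;
      const int sample = 11;  // should land in bin 11 * 8 / 12 = 7

      // Old formulation: scale truncates to (12 - 0) / 8 == 1, so the sample
      // maps to bin 11, past the last valid bin index (7).
      int scale   = (upper - lower) / bins;
      int old_bin = (sample - lower) / scale;

      // New formulation: multiply by the bin count first, divide last.
      int new_bin = (sample - lower) * bins / (upper - lower);

      std::printf("old_bin = %d, new_bin = %d\n", old_bin, new_bin);  // 11 vs 7
      return 0;
    }

For builtin floating-point common types the `reciprocal` member is used instead, trading the per-sample division for a single multiply, which matches the previous behavior for float and double levels.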


// Pass-through bin transform operator
struct PassThruTransform
{
@@ -377,8 +365,6 @@ public:
}
};



//---------------------------------------------------------------------
// Tuning policies
//---------------------------------------------------------------------
@@ -1016,10 +1002,9 @@ public:

for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int bins = num_output_levels[channel] - 1;
LevelT scale = ComputeScale(lower_level[channel], upper_level[channel], bins);

privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
privatized_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];
@@ -1157,9 +1142,9 @@ public:
{
num_privatized_levels[channel] = 257;

int bins = num_output_levels[channel] - 1;
LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
output_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];