diff --git a/cub/device/device_histogram.cuh b/cub/device/device_histogram.cuh
index 862a570028..d97dfd2e45 100644
--- a/cub/device/device_histogram.cuh
+++ b/cub/device/device_histogram.cuh
@@ -77,6 +77,9 @@ struct DeviceHistogram
  * - The ranges `[d_samples, d_samples + num_samples)` and
  *   `[d_histogram, d_histogram + num_levels - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -247,6 +250,9 @@
  *   `row_end = row_begin + num_row_samples`. The ranges
  *   `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)`
  *   shall not overlap in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -432,6 +438,9 @@
  *   `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and
  *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
@@ -633,6 +642,9 @@
  *   `[sample_begin, sample_end)` and
  *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
  *   in any way.
+ * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
+ *   and SampleT must be valid arithmetic types. The common type must be
+ *   convertible to `int` and trivially copyable.
  * - @devicestorage
  *
  * @par Snippet
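Aside (not part of the diff): the precondition documented above can be checked with the same cuda::std traits the implementation uses. A minimal illustrative sketch; histogram_types_ok is a hypothetical helper, not a CUB API:

#include <cuda/std/type_traits>

// Hypothetical compile-time mirror of the documented precondition: the
// common type of LevelT and SampleT must exist, be convertible to `int`,
// and be trivially copyable.
template <typename LevelT, typename SampleT>
struct histogram_types_ok
{
  using common_t = typename cuda::std::common_type<LevelT, SampleT>::type;

  static constexpr bool value =
    cuda::std::is_convertible<common_t, int>::value &&
    cuda::std::is_trivially_copyable<common_t>::value;
};

// Mixed float samples with int levels (NVIDIA/cub#479) satisfy the
// precondition, since common_type<int, float> is float.
static_assert(histogram_types_ok<int, float>::value, "ok: common type float");
static_assert(histogram_types_ok<int, int>::value, "ok: common type int");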
diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh
index a22bfa2ca9..00e04efd83 100644
--- a/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/device/dispatch/dispatch_histogram.cuh
@@ -36,15 +36,19 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include <cuda/std/type_traits>
 #include 
+#include <nv/target>
+
 #include 
 #include 
@@ -176,25 +180,6 @@ template <
     typename OffsetT>                       ///< Signed integer type for global offsets
 struct DispatchHistogram
 {
-private:
-    template <typename T>
-    CUB_RUNTIME_FUNCTION
-    static T ComputeScale(T lower_level, T upper_level, int bins)
-    {
-        return static_cast<T>(upper_level - lower_level) / bins;
-    }
-
-#if defined(__CUDA_FP16_TYPES_EXIST__)
-    // There are no host versions of arithmetic operations on `__half`, so
-    // all arithmetic operations on host shall be done on `float`
-    CUB_RUNTIME_FUNCTION
-    static __half ComputeScale(__half lower_level, __half upper_level, int bins)
-    {
-        return __float2half(
-            (__half2float(upper_level) - __half2float(lower_level)) / bins);
-    }
-#endif
-
 public:
     //---------------------------------------------------------------------
     // Types and constants
     //---------------------------------------------------------------------
@@ -254,117 +239,187 @@ public:
         }
     };
 
-
     // Scales samples to evenly-spaced bins
     struct ScaleTransform
     {
-        int    num_bins;    // Number of levels in array
-        LevelT max;         // Max sample level (exclusive)
-        LevelT min;         // Min sample level (inclusive)
-        LevelT scale;       // Bin scaling factor
-
-        // Initializer
-        template <typename _LevelT>
-        __host__ __device__ __forceinline__ void Init(
-            int     num_output_levels,  // Number of levels in array
-            _LevelT max_,               // Max sample level (exclusive)
-            _LevelT min_,               // Min sample level (inclusive)
-            _LevelT scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = scale_;
-        }
-
-        // Initializer (float specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int   num_output_levels,  // Number of levels in array
-            float max_,               // Max sample level (exclusive)
-            float min_,               // Min sample level (inclusive)
-            float scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = float(1.0) / scale_;
-        }
-
-        // Initializer (double specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,  // Number of levels in array
-            double max_,               // Max sample level (exclusive)
-            double min_,               // Min sample level (inclusive)
-            double scale_)             // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max_;
-            this->min = min_;
-            this->scale = double(1.0) / scale_;
-        }
-
-        template <typename T>
-        static __device__ __forceinline__ void
-        BinSelectImpl(T sample, T min, T max, T scale, int &bin, bool valid)
+    private:
+      using CommonT = typename cuda::std::common_type<LevelT, SampleT>::type;
+      static_assert(cuda::std::is_convertible<CommonT, int>::value,
+                    "The common type of `LevelT` and `SampleT` must be "
+                    "convertible to `int`.");
+      static_assert(cuda::std::is_trivially_copyable<CommonT>::value,
+                    "The common type of `LevelT` and `SampleT` must be "
+                    "trivially copyable.");
+
+      union ScaleT
+      {
+        // Used when CommonT is not floating-point to avoid intermediate
+        // rounding errors (see NVIDIA/cub#489).
+        struct FractionT
         {
-            if (valid && (sample >= min) && (sample < max))
-            {
-                bin = static_cast<int>((sample - min) / scale);
-            }
-        }
+          CommonT bins;
+          CommonT range;
+        } fraction;
+
+        // Used when CommonT is floating-point as an optimization.
+        CommonT reciprocal;
+      };
+
+      CommonT m_max;   // Max sample level (exclusive)
+      CommonT m_min;   // Min sample level (inclusive)
+      ScaleT m_scale;  // Bin scaling
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level,
+                          cuda::std::true_type /* is_fp */)
+      {
+        ScaleT result;
+        result.reciprocal =
+          static_cast<T>(static_cast<T>(num_levels - 1) /
+                         static_cast<T>(max_level - min_level));
+        return result;
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level,
+                          cuda::std::false_type /* is_fp */)
+      {
+        ScaleT result;
+        result.fraction.bins  = static_cast<T>(num_levels - 1);
+        result.fraction.range = static_cast<T>(max_level - min_level);
+        return result;
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          T max_level,
+                          T min_level)
+      {
+        return this->ComputeScale(num_levels,
+                                  max_level,
+                                  min_level,
+                                  cuda::std::is_floating_point<T>{});
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      ScaleT ComputeScale(int num_levels,
+                          __half max_level,
+                          __half min_level)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return this->ComputeScale(num_levels,
+                                                max_level,
+                                                min_level,
+                                                cuda::std::true_type{});),
+                     (return this->ComputeScale(num_levels,
+                                                __half2float(max_level),
+                                                __half2float(min_level),
+                                                cuda::std::true_type{});));
+      }
+#endif
 
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample,
-                                                           int &bin,
-                                                           bool valid)
-        {
-            BinSelectImpl(static_cast<LevelT>(sample),
-                          min,
-                          max,
-                          scale,
-                          bin,
-                          valid);
-        }
+      // All types but __half:
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int SampleIsValid(T sample, T max_level, T min_level)
+      {
+        return sample >= min_level && sample < max_level;
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      int SampleIsValid(__half sample, __half max_level, __half min_level)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return sample >= min_level && sample < max_level;),
+                     (return this->SampleIsValid(__half2float(sample),
+                                                 __half2float(max_level),
+                                                 __half2float(min_level));));
+      }
+#endif
 
-#if defined(__CUDA_FP16_TYPES_EXIST__)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __device__ __forceinline__ void BinSelect(__half sample, int &bin, bool valid)
-        {
-            NV_IF_TARGET(NV_PROVIDES_SM_53,
-                         (BinSelectImpl<__half>(sample,
-                                                min, max, scale,
-                                                bin, valid);),
-                         (BinSelectImpl<float>(__half2float(sample),
-                                               __half2float(min),
-                                               __half2float(max),
-                                               __half2float(scale),
-                                               bin, valid);));
-        }
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample,
+                     T min_level,
+                     ScaleT scale,
+                     cuda::std::true_type /* is_fp */)
+      {
+        return static_cast<int>((sample - min_level) * scale.reciprocal);
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample,
+                     T min_level,
+                     ScaleT scale,
+                     cuda::std::false_type /* is_fp */)
+      {
+        return static_cast<int>(((sample - min_level) * scale.fraction.bins) /
+                                scale.fraction.range);
+      }
+
+      template <typename T>
+      __host__ __device__ __forceinline__
+      int ComputeBin(T sample, T min_level, ScaleT scale)
+      {
+        return this->ComputeBin(sample,
+                                min_level,
+                                scale,
+                                cuda::std::is_floating_point<T>{});
+      }
+
+#ifdef __CUDA_FP16_TYPES_EXIST__
+      __host__ __device__ __forceinline__
+      int ComputeBin(__half sample, __half min_level, ScaleT scale)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_53,
+                     (return this->ComputeBin(sample,
+                                              min_level,
+                                              scale,
+                                              cuda::std::true_type{});),
+                     (return static_cast<int>((__half2float(sample) -
+                                               __half2float(min_level)) *
+                                              __half2float(scale.reciprocal));));
+      }
+#endif
 
-        // Method for converting samples to bin-ids (float specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
+    public:
+
+      // Initializer
+      __host__ __device__ __forceinline__ void Init(int num_levels,
+                                                    LevelT max_level,
+                                                    LevelT min_level)
+      {
+        m_max = static_cast<CommonT>(max_level);
+        m_min = static_cast<CommonT>(min_level);
+        m_scale = this->ComputeScale(num_levels, m_max, m_min);
+      }
+
+      // Method for converting samples to bin-ids
+      template <CacheLoadModifier LOAD_MODIFIER>
+      __host__ __device__ __forceinline__ void BinSelect(SampleT sample,
+                                                         int &bin,
+                                                         bool valid)
+      {
+        const CommonT common_sample = static_cast<CommonT>(sample);
+
+        if (valid && this->SampleIsValid(common_sample, m_max, m_min))
         {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
+          bin = this->ComputeBin(common_sample, m_min, m_scale);
         }
+      }
 
-        // Method for converting samples to bin-ids (double specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
     };
 
-
     // Pass-through bin transform operator
     struct PassThruTransform
     {
@@ -377,8 +432,6 @@
         }
     };
 
-
-
     //---------------------------------------------------------------------
     // Tuning policies
     //---------------------------------------------------------------------
@@ -1016,10 +1069,9 @@
 
         for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
         {
-            int bins = num_output_levels[channel] - 1;
-            LevelT scale = ComputeScale(lower_level[channel], upper_level[channel], bins);
-
-            privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+            privatized_decode_op[channel].Init(num_output_levels[channel],
+                                               upper_level[channel],
+                                               lower_level[channel]);
 
             if (num_output_levels[channel] > max_levels)
                 max_levels = num_output_levels[channel];
@@ -1157,9 +1209,9 @@
         {
             num_privatized_levels[channel] = 257;
 
-            int bins = num_output_levels[channel] - 1;
-            LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
-            output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+            output_decode_op[channel].Init(num_output_levels[channel],
+                                           upper_level[channel],
+                                           lower_level[channel]);
 
             if (num_output_levels[channel] > max_levels)
                 max_levels = num_output_levels[channel];
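Aside (not part of the diff): the FractionT branch above exists because the removed code precomputed an integral scale once and divided by it, which truncates. With the values from NVIDIA/cub#489 (levels [0, 12), 7 bins), the old bin width truncated from about 1.71 down to 1. A host-only sketch of the two formulas; old_bin and new_bin are hypothetical stand-ins for the removed and added computations:

#include <cstdio>

// Removed formula: precompute an integral bin width, then divide.
// For levels [0, 12) and 7 bins, (12 - 0) / 7 truncates to 1.
int old_bin(int sample, int lower, int upper, int bins)
{
  const int scale = (upper - lower) / bins;
  return (sample - lower) / scale;
}

// Added formula (the fraction path): multiply by the bin count first and
// divide by the full range once, so nothing is rounded prematurely.
int new_bin(int sample, int lower, int upper, int bins)
{
  return ((sample - lower) * bins) / (upper - lower);
}

int main()
{
  for (int sample : {0, 2, 3, 6, 7})
  {
    std::printf("sample %d -> old bin %d, new bin %d\n",
                sample,
                old_bin(sample, 0, 12, 7),
                new_bin(sample, 0, 12, 7));
  }
  // sample 6 -> old bin 6 (wrong), new bin 3 (6 / (12 / 7.0) is 3.5).
  return 0;
}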
diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu
index 3831e5ed51..d93dd70d32 100644
--- a/test/test_device_histogram.cu
+++ b/test/test_device_histogram.cu
@@ -639,7 +639,10 @@ void TestEven(
     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
 
     printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+    printf("%s cub::DeviceHistogram::Even (%s) "
+           "%d pixels (%d height, %d width, %d-byte row stride), "
+           "%d %d-byte %s samples (entropy reduction %d), "
+           "%s levels, %s counters, %d/%d channels, max sample ",
         (BACKEND == CDP) ? "CDP CUB" : "CUB",
         (std::is_pointer<SampleIteratorT>::value) ? "pointer" : "iterator",
         (int) (num_row_pixels * num_rows),
         (int) num_rows,
         (int) num_row_pixels,
         (int) row_stride_bytes,
@@ -650,6 +653,7 @@ void TestEven(
         (int) sizeof(SampleT),
         typeid(SampleT).name(),
         entropy_reduction,
+        typeid(LevelT).name(),
         typeid(CounterT).name(),
         NUM_ACTIVE_CHANNELS,
         NUM_CHANNELS);
@@ -726,8 +730,20 @@ void TestEven(
         num_row_pixels, num_rows, row_stride_bytes);
 
     // Check canary zones
+    if (g_verbose)
+    {
+        printf("Checking leading temp_storage canary zone (token = %d)\n"
+               "------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
     int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
     AssertEquals(0, error);
+    if (g_verbose)
+    {
+        printf("Checking trailing temp_storage canary zone (token = %d)\n"
+               "-------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
     error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
     AssertEquals(0, error);
@@ -740,6 +756,12 @@ void TestEven(
     // Check for correctness (and display results, if specified)
     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
     {
+        if (g_verbose)
+        {
+            printf("Checking histogram result (channel = %d)\n"
+                   "----------------------------------------\n",
+                   channel);
+        }
         int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
         printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
         error |= channel_error;
@@ -913,19 +935,23 @@ void TestRange(
     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
 
     printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
"CDP CUB" : "CUB", - (int) (num_row_pixels * num_rows), - (int) num_rows, - (int) num_row_pixels, - (int) row_stride_bytes, - (int) total_samples, - (int) sizeof(SampleT), - typeid(SampleT).name(), - entropy_reduction, - typeid(CounterT).name(), - NUM_ACTIVE_CHANNELS, - NUM_CHANNELS); + printf("%s cub::DeviceHistogram::Range %d pixels " + "(%d height, %d width, %d-byte row stride), " + "%d %d-byte %s samples (entropy reduction %d), " + "%s levels, %s counters, %d/%d channels, max sample ", + (BACKEND == CDP) ? "CDP CUB" : "CUB", + (int)(num_row_pixels * num_rows), + (int)num_rows, + (int)num_row_pixels, + (int)row_stride_bytes, + (int)total_samples, + (int)sizeof(SampleT), + typeid(SampleT).name(), + entropy_reduction, + typeid(LevelT).name(), + typeid(CounterT).name(), + NUM_ACTIVE_CHANNELS, + NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { @@ -1263,9 +1289,13 @@ void Test( LevelT max_level, int max_num_levels) { + // entropy_reduction = -1 -> all samples == 0 Test( num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels); + Test( + num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels); + Test( num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels); } @@ -1319,6 +1349,10 @@ void Test( Test( OffsetT(0), OffsetT(0), max_level, max_num_levels); + // Small inputs + Test( + OffsetT(15), OffsetT(1), max_level, max_num_levels); + // 1080 image Test( OffsetT(1920), OffsetT(1080), max_level, max_num_levels); @@ -1442,6 +1476,72 @@ void TestLevelsAliasing() CubDebugExit(g_allocator.DeviceFree(d_levels)); } +// Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect +// bin detection: +void TestIntegerBinCalcs() +{ + constexpr int num_levels = 8; + constexpr int num_bins = num_levels - 1; + + int h_histogram[num_bins]{}; + const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0}; + const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999}; + const int lower_level = 0; + const int upper_level = 12; + + constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); + + int *d_histogram{}; + int *d_samples{}; + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); + + CubDebugExit( + cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_samples)); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); + + CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_samples)); + + CubDebugExit(cudaMemcpy(h_histogram, + d_histogram, + sizeof(h_histogram), + cudaMemcpyDeviceToHost)); + + for (int bin = 0; bin < num_bins; ++bin) + { + AssertEquals(h_histogram_ref[bin], h_histogram[bin]); + } + + CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); + CubDebugExit(g_allocator.DeviceFree(d_histogram)); + CubDebugExit(g_allocator.DeviceFree(d_samples)); +} + //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- @@ -1477,8 +1577,9 @@ 
@@ -1477,8 +1577,9 @@ int main(int argc, char** argv)
     using false_t = Int2Type<false>;
 
     TestLevelsAliasing();
+    TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489
 
-#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA
+#if TEST_HALF_T
     TestChannels(256, 256 + 1, true_t{}, true_t{});
 #endif
 
@@ -1489,9 +1590,11 @@ int main(int argc, char** argv)
     TestChannels (1.0, 256 + 1, true_t{}, false_t{});
 #endif
 
+    // float samples, int levels, regression test for NVIDIA/cub#479.
+    TestChannels (12, 7, true_t{}, true_t{});
+
     // Test down-conversion of size_t offsets to int
     TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{});
 
     return 0;
 }
-
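Aside (not part of the diff): with this fix in place, the mixed-type case from NVIDIA/cub#479, float samples binned against int levels, works through the public API. A minimal usage sketch with illustrative values; CUDA error checking is omitted for brevity:

#include <cub/device/device_histogram.cuh>

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>

int main()
{
  // float samples with int levels: the NVIDIA/cub#479 combination.
  const float h_samples[]{0.5f, 3.1f, 6.9f, 11.0f};
  const int num_samples = 4;
  const int num_levels  = 7;  // 6 evenly-spaced bins over [0, 12)
  const int lower_level = 0;
  const int upper_level = 12;

  float *d_samples{};
  int *d_histogram{};
  cudaMalloc(&d_samples, sizeof(h_samples));
  cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));
  cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

  // Standard two-phase CUB pattern: query temp storage size, then run.
  void *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                      d_samples, d_histogram, num_levels,
                                      lower_level, upper_level, num_samples);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
                                      d_samples, d_histogram, num_levels,
                                      lower_level, upper_level, num_samples);

  int h_histogram[6]{};
  cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram),
             cudaMemcpyDeviceToHost);
  for (int bin = 0; bin < 6; ++bin)
  {
    std::printf("bin %d: %d\n", bin, h_histogram[bin]);
  }

  cudaFree(d_temp_storage);
  cudaFree(d_histogram);
  cudaFree(d_samples);
  return 0;
}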