Skip to content

Commit

Permalink
Fix DeviceHistogram::Even for mixed float/int levels and sample types.
Browse files Browse the repository at this point in the history
Fixes NVIDIA#479.
  • Loading branch information
alliepiper committed May 19, 2022
1 parent f80aa78 commit b0b8adb
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 83 deletions.
26 changes: 26 additions & 0 deletions cub/detail/cpp_compatibility.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#pragma once

#include <cub/util_cpp_dialect.cuh>

// CUB_IF_CONSTEXPR: portable spelling of C++17 `if constexpr`.
//
// When compiling as C++17 or newer AND the compiler advertises support via the
// SD-6 feature-test macro `__cpp_if_constexpr`, this expands to `if constexpr`
// and the discarded branch is not instantiated.
#if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr
# define CUB_IF_CONSTEXPR if constexpr
#else
// Fallback for pre-C++17 modes: a plain runtime `if`. NOTE: unlike
// `if constexpr`, BOTH branches must compile for every instantiation in this
// mode, so code using this macro must keep each branch well-formed.
# define CUB_IF_CONSTEXPR if
#endif
118 changes: 49 additions & 69 deletions cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,19 @@
#pragma once

#include <cub/agent/agent_histogram.cuh>
#include <cub/config.cuh>
#include <cub/detail/cpp_compatibility.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/config.cuh>
#include <cub/util_type.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/type_traits>

#include <nv/target>

#include <cstdio>
Expand Down Expand Up @@ -237,79 +241,56 @@ struct DispatchHistogram
// Scales samples to evenly-spaced bins
struct ScaleTransform
{
using CommonT = typename cuda::std::common_type<LevelT, SampleT>::type;

static constexpr bool PRECOMPUTE_RECIPROCAL =
cuda::std::is_floating_point<CommonT>::value;

int num_bins; // Number of levels in array
LevelT max; // Max sample level (exclusive)
LevelT min; // Min sample level (inclusive)
LevelT scale; // Bin scaling factor
CommonT max; // Max sample level (exclusive)
CommonT min; // Min sample level (inclusive)
CommonT scale; // Bin scaling factor

// Initializer
template <typename _LevelT>
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
_LevelT max_, // Max sample level (exclusive)
_LevelT min_, // Min sample level (inclusive)
_LevelT scale_) // Bin scaling factor
LevelT max_, // Max sample level (exclusive)
LevelT min_) // Min sample level (inclusive)
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = scale_;
}
this->max = static_cast<CommonT>(max_);
this->min = static_cast<CommonT>(min_);

// Initializer (float specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
float max_, // Max sample level (exclusive)
float min_, // Min sample level (inclusive)
float scale_) // Bin scaling factor
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = float(1.0) / scale_;
}

// Initializer (double specialization)
__host__ __device__ __forceinline__ void Init(
int num_output_levels, // Number of levels in array
double max_, // Max sample level (exclusive)
double min_, // Min sample level (inclusive)
double scale_) // Bin scaling factor
{
this->num_bins = num_output_levels - 1;
this->max = max_;
this->min = min_;
this->scale = double(1.0) / scale_;
// Optimize for floating point by precomputing the reciprocal:
CUB_IF_CONSTEXPR(PRECOMPUTE_RECIPROCAL)
{
this->scale = static_cast<CommonT>(num_output_levels - 1) /
static_cast<CommonT>(max_ - min_);
}
else
{
this->scale = static_cast<CommonT>(max_ - min_) /
static_cast<CommonT>(num_output_levels - 1);
}
}

// Method for converting samples to bin-ids
template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
__host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) / scale);
}

// Method for converting samples to bin-ids (float specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
__host__ __device__ __forceinline__ void BinSelect(SampleT sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
}
CommonT common_sample = static_cast<CommonT>(sample);

// Method for converting samples to bin-ids (double specialization)
template <CacheLoadModifier LOAD_MODIFIER>
__host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
{
LevelT level_sample = (LevelT) sample;

if (valid && (level_sample >= min) && (level_sample < max))
bin = (int) ((level_sample - min) * scale);
if (valid && (common_sample >= min) && (common_sample < max))
{
CUB_IF_CONSTEXPR(PRECOMPUTE_RECIPROCAL)
{
bin = static_cast<int>((common_sample - min) * scale);
}
else
{
bin = static_cast<int>((common_sample - min) / scale);
}
}
}
};

Expand Down Expand Up @@ -864,10 +845,9 @@ struct DispatchHistogram

for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int bins = num_output_levels[channel] - 1;
LevelT scale = static_cast<LevelT>((upper_level[channel] - lower_level[channel]) / bins);

privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
privatized_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];
Expand Down Expand Up @@ -975,9 +955,9 @@ struct DispatchHistogram
{
num_privatized_levels[channel] = 257;

int bins = num_output_levels[channel] - 1;
LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
output_decode_op[channel].Init(num_output_levels[channel],
upper_level[channel],
lower_level[channel]);

if (num_output_levels[channel] > max_levels)
max_levels = num_output_levels[channel];
Expand Down
65 changes: 51 additions & 14 deletions test/test_device_histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,10 @@ void TestEven(
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
printf("%s cub::DeviceHistogram::Even (%s) "
"%d pixels (%d height, %d width, %d-byte row stride), "
"%d %d-byte %s samples (entropy reduction %d), "
"%s levels, %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : "CUB",
(std::is_pointer<SampleIteratorT>::value) ? "pointer" : "iterator",
(int) (num_row_pixels * num_rows),
Expand All @@ -579,6 +582,7 @@ void TestEven(
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(LevelT).name(),
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
Expand Down Expand Up @@ -657,8 +661,20 @@ void TestEven(
0, true);

// Check canary zones
if (g_verbose)
{
printf("Checking leading temp_storage canary zone (token = %d)\n"
"------------------------------------------------------\n",
static_cast<int>(canary_token));
}
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
if (g_verbose)
{
printf("Checking trailing temp_storage canary zone (token = %d)\n"
"-------------------------------------------------------\n",
static_cast<int>(canary_token));
}
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);

Expand All @@ -671,6 +687,12 @@ void TestEven(
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (g_verbose)
{
printf("Checking histogram result (channel = %d)\n"
"----------------------------------------\n",
channel);
}
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
Expand Down Expand Up @@ -825,19 +847,23 @@ void TestRange(
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : "CUB",
(int) (num_row_pixels * num_rows),
(int) num_rows,
(int) num_row_pixels,
(int) row_stride_bytes,
(int) total_samples,
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
printf("%s cub::DeviceHistogram::Range %d pixels "
"(%d height, %d width, %d-byte row stride), "
"%d %d-byte %s samples (entropy reduction %d), "
"%s levels, %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : "CUB",
(int)(num_row_pixels * num_rows),
(int)num_rows,
(int)num_row_pixels,
(int)row_stride_bytes,
(int)total_samples,
(int)sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(LevelT).name(),
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
std::cout << CoutCast(max_level) << "\n";
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
Expand Down Expand Up @@ -1178,9 +1204,13 @@ void Test(
LevelT max_level,
int max_num_levels)
{
// entropy_reduction = -1 -> all samples == 0
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels);

Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels);

Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels);
}
Expand Down Expand Up @@ -1234,6 +1264,10 @@ void Test(
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(0), OffsetT(0), max_level, max_num_levels);

// Small inputs
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(15), OffsetT(1), max_level, max_num_levels);

// 1080 image
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
Expand Down Expand Up @@ -1327,6 +1361,9 @@ int main(int argc, char** argv)
TestChannels <float, int, float, int>(1.0, 256 + 1, true_t{}, false_t{});
#endif

// float samples, int levels, regression test for NVIDIA/cub#479.
TestChannels <float, int, int, int>(12, 7, true_t{}, true_t{});

// Test down-conversion of size_t offsets to int
TestChannels <unsigned char, int, int, long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{});

Expand Down

0 comments on commit b0b8adb

Please sign in to comment.