Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Document that cub's device scan supports inplace operations, and add tests to enforce this feature. #301

Merged
merged 3 commits into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ namespace cub {
* idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
* of global prefix propagation with local computation. As such, our algorithm requires only
* ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
* proceeds at "memcpy" speeds.
* proceeds at "memcpy" speeds. Our algorithm supports in-place operations.
*
* \par
* [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
Expand Down
48 changes: 46 additions & 2 deletions test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,19 @@ void Solve(
}
}

/**
 * Chooses how the output array pointer used by Test() is obtained.
 *
 * Primary template: requests storage for num_items elements of OutputT from
 * g_allocator and stores the resulting address in d_out. The input iterator
 * argument is accepted only so that every variant of run() shares one call
 * signature; it is not used here.
 *
 * \param d_out      Receives the address of the newly allocated array
 * \param num_items  Number of OutputT elements to allocate
 */
template <typename OutputT, typename DeviceInputIteratorT, bool InPlace>
struct AllocateOutput
{
    static void run(OutputT *&d_out, DeviceInputIteratorT /*d_in*/, int num_items)
    {
        // Byte count is computed in size_t, exactly as sizeof(OutputT) * num_items
        const size_t output_bytes = sizeof(OutputT) * num_items;
        void **untyped_out = (void **)&d_out;

        CubDebugExit(g_allocator.DeviceAllocate(untyped_out, output_bytes));
    }
};

/**
 * Partial specialization of AllocateOutput chosen when InPlace is true and the
 * input iterator type is exactly OutputT*.
 *
 * Nothing is allocated: d_out is simply pointed at the input array so that the
 * output is written over the input. The item count is irrelevant here.
 *
 * NOTE(review): after this call d_out refers to storage that was provided by
 * the caller -- confirm that the code using d_out neither clears nor frees it
 * independently of d_in.
 *
 * \param d_out  Set to alias d_in
 * \param d_in   Input array that doubles as the output array
 */
template <typename OutputT>
struct AllocateOutput<OutputT, OutputT *, true>
{
    static void run(OutputT *&d_out, OutputT *d_in, int /*num_items*/)
    {
        d_out = d_in;
    }
};

/**
* Test DeviceScan for a given problem input
Expand All @@ -613,7 +626,8 @@ template <
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename InitialValueT,
bool InPlace=false>
void Test(
DeviceInputIteratorT d_in,
OutputT *h_reference,
Expand All @@ -625,7 +639,7 @@ void Test(

// Allocate device output array
OutputT *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
AllocateOutput<OutputT, DeviceInputIteratorT, InPlace>::run(d_out, d_in, num_items);

// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
Expand Down Expand Up @@ -723,6 +737,35 @@ void Test(
AssertEquals(0, compare);
}

/**
 * Run Test() with its InPlace template flag set to true, forwarding all
 * arguments unchanged.
 *
 * This overload takes part in overload resolution only when the type of the
 * expression *d_in (as reported by decltype) is exactly OutputT.
 *
 * NOTE(review): when DeviceInputIteratorT is a raw pointer T*, decltype(*d_in)
 * is the reference type T&, which never equals a non-reference OutputT. For
 * pointer inputs this overload is therefore not selected and the empty
 * TestInplace overload is used instead. Confirm whether
 * std::is_same<DeviceInputIteratorT, OutputT*> was the intended condition; if
 * it is changed, the negated condition on the empty overload must be changed
 * in lockstep, and Test() must tolerate d_out aliasing d_in.
 */
template <Backend  BACKEND,
          typename DeviceInputIteratorT,
          typename OutputT,
          typename ScanOpT,
          typename InitialValueT>
auto TestInplace(DeviceInputIteratorT d_in,
                 OutputT             *h_reference,
                 int                  num_items,
                 ScanOpT              scan_op,
                 InitialValueT        initial_value)
    -> typename std::enable_if<std::is_same<decltype(*d_in), OutputT>::value>::type
{
    // Sixth template argument of Test(): request the in-place variant
    const bool in_place = true;

    Test<BACKEND, DeviceInputIteratorT, OutputT, ScanOpT, InitialValueT, in_place>(
        d_in,
        h_reference,
        num_items,
        scan_op,
        initial_value);
}

/**
 * Fallback TestInplace overload: does nothing.
 *
 * Selected whenever the type of the expression *d_in (as reported by decltype)
 * is NOT exactly OutputT, i.e. under the exact negation of the condition that
 * enables the overload forwarding to Test(). All arguments are ignored; d_in
 * is named only because the return-type expression refers to it.
 */
template <Backend  BACKEND,
          typename DeviceInputIteratorT,
          typename OutputT,
          typename ScanOpT,
          typename InitialValueT>
auto TestInplace(DeviceInputIteratorT d_in,
                 OutputT *            /*h_reference*/,
                 int                  /*num_items*/,
                 ScanOpT              /*scan_op*/,
                 InitialValueT        /*initial_value*/)
    -> typename std::enable_if<!(std::is_same<decltype(*d_in), OutputT>::value)>::type
{
    // Intentionally empty: no in-place test is run for this type combination
}

/**
* Test DeviceScan on pointer type
Expand Down Expand Up @@ -780,6 +823,7 @@ void TestPointer(

// Run Test
Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
TestInplace<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);

// Cleanup
if (h_in) delete[] h_in;
Expand Down