Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #301 from zasdfgbnm/scan-inplace
Browse files Browse the repository at this point in the history
Document that cub's device scan supports inplace operations, and add tests to enforce this feature.
  • Loading branch information
alliepiper authored May 20, 2021
2 parents e50fee7 + 39a38e8 commit ad5299d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 3 deletions.
2 changes: 1 addition & 1 deletion cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ namespace cub {
* idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
* of global prefix propagation with local computation. As such, our algorithm requires only
* ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
* proceeds at "memcpy" speeds.
* proceeds at "memcpy" speeds. Our algorithm supports in-place operations, i.e. the output may alias the input.
*
* \par
* [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
Expand Down
48 changes: 46 additions & 2 deletions test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,19 @@ void Solve(
}
}

/**
 * Provides the device output buffer for a scan test (generic case).
 *
 * Requests sizeof(OutputT) * num_items bytes from g_allocator and stores the
 * resulting pointer in d_out.  The input iterator argument is accepted only
 * so that every AllocateOutput variant shares one call signature; it is not
 * used here.
 */
template <typename OutputT, typename DeviceInputIteratorT, bool InPlace>
struct AllocateOutput
{
    static void run(OutputT *&d_out, DeviceInputIteratorT /* d_in */, int num_items)
    {
        const size_t out_bytes = sizeof(OutputT) * num_items;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, out_bytes));
    }
};

/**
 * In-place variant: selected when InPlace is true and the input iterator is
 * exactly a raw OutputT pointer.
 *
 * No memory is allocated; d_out is simply pointed at the input buffer so the
 * scan writes its results over its own input.  The item count is irrelevant
 * here, so that parameter is left unnamed to avoid unused-parameter warnings.
 */
template <typename OutputT>
struct AllocateOutput<OutputT, OutputT *, true>
{
    static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */)
    {
        d_out = d_in;
    }
};

/**
* Test DeviceScan for a given problem input
Expand All @@ -613,7 +626,8 @@ template <
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename InitialValueT,
bool InPlace=false>
void Test(
DeviceInputIteratorT d_in,
OutputT *h_reference,
Expand All @@ -625,7 +639,7 @@ void Test(

// Allocate device output array
OutputT *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
AllocateOutput<OutputT, DeviceInputIteratorT, InPlace>::run(d_out, d_in, num_items);

// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
Expand Down Expand Up @@ -723,6 +737,35 @@ void Test(
AssertEquals(0, compare);
}

/**
 * Test DeviceScan in place (output buffer aliases the input buffer).
 *
 * This overload participates only when the input iterator is exactly
 * OutputT*, which is the one case in which AllocateOutput<..., true> makes
 * d_out alias d_in.  It forwards to Test with InPlace = true.
 *
 * The condition deliberately compares the iterator type against OutputT*
 * rather than comparing decltype(*d_in) against OutputT: dereferencing a
 * pointer is an lvalue expression, so decltype(*d_in) is a reference type
 * and can never be the same type as OutputT.  A check written that way is
 * always false and silently turns the in-place test into a no-op.
 *
 * NOTE(review): Test's buffer clearing and cleanup code are not visible from
 * here; confirm that Test neither resets nor frees d_out when InPlace is
 * true, since in that case d_out is the caller-owned d_in.
 *
 * \param d_in           Device input pointer; overwritten with the scan result
 * \param h_reference    Host reference results to compare against
 * \param num_items      Number of items to scan
 * \param scan_op        Binary scan operator
 * \param initial_value  Initial value forwarded to Test
 */
template <
    Backend BACKEND,
    typename DeviceInputIteratorT,
    typename OutputT,
    typename ScanOpT,
    typename InitialValueT>
auto TestInplace(
    DeviceInputIteratorT d_in,
    OutputT *h_reference,
    int num_items,
    ScanOpT scan_op,
    InitialValueT initial_value) -> typename std::enable_if<std::is_same<DeviceInputIteratorT, OutputT *>::value>::type
{
    Test<BACKEND, DeviceInputIteratorT, OutputT, ScanOpT, InitialValueT, true>(d_in, h_reference, num_items, scan_op, initial_value);
}

/**
 * Fallback overload: the input iterator is not a raw OutputT pointer, so the
 * scan cannot write over its own input.  Intentionally does nothing.
 *
 * Its enable_if condition is the exact negation of the overload above, so
 * exactly one of the two is viable for any argument types.
 */
template <
    Backend BACKEND,
    typename DeviceInputIteratorT,
    typename OutputT,
    typename ScanOpT,
    typename InitialValueT>
auto TestInplace(
    DeviceInputIteratorT,
    OutputT *,
    int,
    ScanOpT,
    InitialValueT) -> typename std::enable_if<!std::is_same<DeviceInputIteratorT, OutputT *>::value>::type
{}

/**
* Test DeviceScan on pointer type
Expand Down Expand Up @@ -780,6 +823,7 @@ void TestPointer(

// Run Test
Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
TestInplace<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);

// Cleanup
if (h_in) delete[] h_in;
Expand Down

0 comments on commit ad5299d

Please sign in to comment.