diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 2dceba653..93ddde1a0 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c4e520446..04771d3d8 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -15,6 +15,7 @@ */ #include +#include namespace cuco { @@ -914,8 +915,8 @@ class static_multimap::device_view_ if (first_exists or second_exists) { if constexpr (is_outer) { found_match = true; } - auto num_first_matches = __popc(first_exists); - auto num_second_matches = __popc(second_exists); + auto const num_first_matches = __popc(first_exists); + auto const num_second_matches = __popc(second_exists); uint32_t output_idx; if (0 == cg_lane_id) { @@ -924,14 +925,15 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << cg_lane_id) - 1)); - Key key = k; + auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); + Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[0].second)); } if (second_equals) { - auto lane_offset = __popc(second_exists & ((1 << cg_lane_id) - 1)); - Key key = k; + auto const lane_offset = + detail::count_least_significant_bits(second_exists, cg_lane_id); + Key key = k; output_buffer[output_idx + num_first_matches + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[1].second)); } @@ -940,7 +942,7 @@ class static_multimap::device_view_ running = false; if constexpr (is_outer) { if ((not found_match) && (cg_lane_id == 0)) { - auto output_idx = atomicAdd(flushing_cg_counter, 1); + auto const output_idx = atomicAdd(flushing_cg_counter, 1); Key key = k; output_buffer[output_idx] = cuco::make_pair( std::move(key), std::move(this->get_empty_value_sentinel())); @@ -1015,17 +1017,18 @@ class static_multimap::device_view_ auto const slot_is_empty = detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k)); + auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k)); + auto const exists = g.ballot(equals); + uint32_t output_idx = *cg_counter; - auto const exists = g.ballot(equals); if (exists) { if constexpr (is_outer) { found_match = true; } - auto num_matches = __popc(exists); + auto const num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset = __popc(exists & ((1 << lane_id) - 1)); - Key key = k; + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); + Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(slot_contents.second)); } @@ -1055,6 +1058,216 @@ class static_multimap::device_view_ } // while running } + /** + * @brief Retrieves all the matches of a given pair using vector loads. + * + * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores + * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = + * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where + * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, + * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and + * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. + * + * @tparam is_outer Boolean flag indicating whether outer join is peformed + * @tparam uses_vector_load Boolean flag indicating whether vector loads are used + * @tparam ProbingCG Type of Cooperative Group used to retrieve + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Key` type. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. + * @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve + * @param pair The pair to search for + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ std::enable_if_t pair_retrieve( + ProbingCG const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept + { + auto const lane_id = probing_cg.thread_rank(); + auto current_slot = initial_slot(probing_cg, pair.first); + [[maybe_unused]] auto found_match = false; + + auto num_matches = 0; + + while (true) { + value_type arr[2]; + load_pair_array(&arr[0], current_slot); + + auto const first_slot_is_empty = + detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); + auto const second_slot_is_empty = + detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); + auto const first_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); + auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); + auto const first_exists = probing_cg.ballot(first_equals); + auto const second_exists = probing_cg.ballot(second_equals); + + if (first_exists or second_exists) { + if constexpr (is_outer) { found_match = true; } + + auto const num_first_matches = __popc(first_exists); + + if (first_equals) { + auto lane_offset = detail::count_least_significant_bits(first_exists, lane_id); + auto const output_idx = num_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = arr[0].first; + *(contained_val_begin + output_idx) = arr[0].second; + } + if (second_equals) { + auto const lane_offset = detail::count_least_significant_bits(second_exists, lane_id); + auto const output_idx = num_matches + num_first_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = arr[1].first; + *(contained_val_begin + output_idx) = arr[1].second; + } + num_matches += (num_first_matches + __popc(second_exists)); + } + if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { + if constexpr (is_outer) { + if ((not found_match) and lane_id == 0) { + *(probe_key_begin) = pair.first; + *(probe_val_begin) = pair.second; + *(contained_key_begin) = this->get_empty_key_sentinel(); + *(contained_val_begin) = this->get_empty_value_sentinel(); + } + } + return; // exit if any slot in the current window is empty + } + + current_slot = next_slot(current_slot); + } // while + } + + /** + * @brief Retrieves all the matches of a given pair using scalar loads. + * + * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores + * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = + * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where + * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, + * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and + * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. + * + * @tparam is_outer Boolean flag indicating whether outer join is peformed + * @tparam uses_vector_load Boolean flag indicating whether vector loads are used + * @tparam ProbingCG Type of Cooperative Group used to retrieve + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Key` type. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. + * @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve + * @param pair The pair to search for + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ std::enable_if_t pair_retrieve( + ProbingCG const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept + { + auto const lane_id = probing_cg.thread_rank(); + auto current_slot = initial_slot(probing_cg, pair.first); + [[maybe_unused]] auto found_match = false; + + auto num_matches = 0; + + while (true) { + // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation + // is unsafe! + static_assert(sizeof(Key) == sizeof(cuda::atomic)); + static_assert(sizeof(Value) == sizeof(cuda::atomic)); + value_type slot_contents = *reinterpret_cast(current_slot); + + auto const slot_is_empty = + detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); + auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const exists = probing_cg.ballot(equals); + + if (exists) { + if constexpr (is_outer) { found_match = true; } + + if (equals) { + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); + auto const output_idx = num_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = slot_contents.first; + *(contained_val_begin + output_idx) = slot_contents.second; + } + num_matches += __popc(exists); + } + if (probing_cg.any(slot_is_empty)) { + if constexpr (is_outer) { + if ((not found_match) and lane_id == 0) { + *(probe_key_begin) = pair.first; + *(probe_val_begin) = pair.second; + *(contained_key_begin) = this->get_empty_key_sentinel(); + *(contained_val_begin) = this->get_empty_value_sentinel(); + } + } + return; // exit if any slot in the current window is empty + } + + current_slot = next_slot(current_slot); + } // while + } + /** * @brief Retrieves all the matches of a given pair contained in multimap using vector * loads with per-flushing-CG shared memory buffer. @@ -1131,8 +1344,8 @@ class static_multimap::device_view_ if (first_exists or second_exists) { if constexpr (is_outer) { found_match = true; } - auto num_first_matches = __popc(first_exists); - auto num_second_matches = __popc(second_exists); + auto const num_first_matches = __popc(first_exists); + auto const num_second_matches = __popc(second_exists); uint32_t output_idx; if (0 == cg_lane_id) { @@ -1141,12 +1354,13 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << cg_lane_id) - 1)); + auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); probe_output_buffer[output_idx + lane_offset] = pair; contained_output_buffer[output_idx + lane_offset] = arr[0]; } if (second_equals) { - auto lane_offset = __popc(second_exists & ((1 << cg_lane_id) - 1)); + auto const lane_offset = + detail::count_least_significant_bits(second_exists, cg_lane_id); probe_output_buffer[output_idx + num_first_matches + lane_offset] = pair; contained_output_buffer[output_idx + num_first_matches + lane_offset] = arr[1]; } @@ -1155,7 +1369,7 @@ class static_multimap::device_view_ running = false; if constexpr (is_outer) { if ((not found_match) && (cg_lane_id == 0)) { - auto output_idx = atomicAdd(flushing_cg_counter, 1); + auto const output_idx = atomicAdd(flushing_cg_counter, 1); probe_output_buffer[output_idx] = pair; contained_output_buffer[output_idx] = cuco::make_pair(std::move(this->get_empty_key_sentinel()), @@ -1246,16 +1460,17 @@ class static_multimap::device_view_ auto const slot_is_empty = detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const exists = g.ballot(equals); + uint32_t output_idx = *cg_counter; - auto const exists = g.ballot(equals); if (exists) { if constexpr (is_outer) { found_match = true; } - auto num_matches = __popc(exists); + auto const num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset = __popc(exists & ((1 << lane_id) - 1)); + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); probe_output_buffer[output_idx + lane_offset] = pair; contained_output_buffer[output_idx + lane_offset] = slot_contents; } diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 0e8123f53..f1f325e8c 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -18,7 +18,7 @@ #include #include -#include +#include namespace { /** @@ -686,6 +686,36 @@ static_multimap::device_view::retri } } +template +template +__device__ __forceinline__ void +static_multimap::device_view::pair_retrieve( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept +{ + constexpr bool is_outer = false; + impl_.pair_retrieve(probing_cg, + pair, + probe_key_begin, + probe_val_begin, + contained_key_begin, + contained_val_begin, + pair_equal); +} + template ::device_view::pair_ } } +template +template +__device__ __forceinline__ void +static_multimap::device_view::pair_retrieve_outer( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept +{ + constexpr bool is_outer = true; + impl_.pair_retrieve(probing_cg, + pair, + probe_key_begin, + probe_val_begin, + contained_key_begin, + contained_val_begin, + pair_equal); +} + template + __device__ __forceinline__ void pair_retrieve( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept; + /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG * shared memory buffer. @@ -1005,6 +1048,51 @@ class static_multimap { OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept; + /** + * @brief Retrieves all the matches of a given pair + * + * For pair `p` with `n = pair_count_outer(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, + * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an + * unspecified value of `j` where `0 <= j < n`. If `p` does not have any matches, stores + * `probe_key_begin[0] = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = + * empty_key_sentinel`, and `contained_val_begin[0] = empty_value_sentinel`. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. + * + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Key` type. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. + * @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve + * @param pair The pair to search for + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ void pair_retrieve_outer( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + PairEqual pair_equal) noexcept; + /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG * shared memory buffer. diff --git a/tests/static_multimap/static_multimap_test.cu b/tests/static_multimap/static_multimap_test.cu index 0140d3223..6a48c7b18 100644 --- a/tests/static_multimap/static_multimap_test.cu +++ b/tests/static_multimap/static_multimap_test.cu @@ -699,3 +699,172 @@ TEMPLATE_TEST_CASE_SIG("Tests of pair functions", test_pair_functions(map, d_pairs.begin(), num_pairs); } } + +template +__global__ void custom_pair_retrieve_outer(InputIt first, + InputIt last, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, + ScanIt scan_begin, + viewT view, + PairEqual pair_equal) +{ + auto g = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * blockIdx.x + threadIdx.x; + auto pair_idx = tid / cg_size; + + while (first + pair_idx < last) { + auto const offset = *(scan_begin + pair_idx); + auto const pair = *(first + pair_idx); + view.pair_retrieve_outer(g, + pair, + probe_key_begin + offset, + probe_val_begin + offset, + contained_key_begin + offset, + contained_val_begin + offset, + pair_equal); + pair_idx += (gridDim.x * block_size) / cg_size; + } +} + +template +void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + thrust::device_vector> d_pairs(num_pairs); + + // pair multiplicity = 2 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i / 2, i}; + }); + + auto pair_begin = d_pairs.begin(); + + map.insert(pair_begin, pair_begin + num_pairs); + + // query pair matching rate = 50% + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + pair_begin, + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + // create an array of prefix sum + thrust::device_vector d_scan(num_pairs); + auto count_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { return i < (num_pairs / 2) ? 2 : 1; }); + thrust::exclusive_scan(thrust::device, count_begin, count_begin + num_pairs, d_scan.begin(), 0); + + auto constexpr gold_size = 300; + auto constexpr block_size = 128; + auto constexpr cg_size = map.cg_size(); + + auto const grid_size = (cg_size * num_pairs + block_size - 1) / block_size; + + auto view = map.get_device_view(); + + auto num = map.pair_count_outer(pair_begin, pair_begin + num_pairs, pair_equal{}); + REQUIRE(num == gold_size); + + thrust::device_vector probe_keys(gold_size); + thrust::device_vector probe_vals(gold_size); + thrust::device_vector contained_keys(gold_size); + thrust::device_vector contained_vals(gold_size); + + custom_pair_retrieve_outer + <<>>(pair_begin, + pair_begin + num_pairs, + probe_keys.begin(), + probe_vals.begin(), + contained_keys.begin(), + contained_vals.begin(), + d_scan.begin(), + view, + pair_equal{}); + + // sort before compare + thrust::sort(thrust::device, probe_keys.begin(), probe_keys.end()); + thrust::sort(thrust::device, probe_vals.begin(), probe_vals.end()); + thrust::sort(thrust::device, contained_keys.begin(), contained_keys.end()); + thrust::sort(thrust::device, contained_vals.begin(), contained_vals.end()); + + // set gold references + auto gold_probe = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs) { return i / 2; } + return i - (int(num_pairs) / 2); + }); + auto gold_contained_key = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs / 2) { return -1; } + return (i - (int(num_pairs) / 2)) / 2; + }); + auto gold_contained_val = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs / 2) { return -1; } + return i - (int(num_pairs) / 2); + }); + + REQUIRE( + thrust::equal(thrust::device, probe_keys.begin(), probe_keys.begin() + gold_size, gold_probe)); + + REQUIRE( + thrust::equal(thrust::device, probe_vals.begin(), probe_vals.begin() + gold_size, gold_probe)); + + REQUIRE(thrust::equal(thrust::device, + contained_keys.begin(), + contained_keys.begin() + gold_size, + gold_contained_key)); + + REQUIRE(thrust::equal(thrust::device, + contained_vals.begin(), + contained_vals.begin() + gold_size, + gold_contained_val)); +} + +TEMPLATE_TEST_CASE_SIG("Tests of non-shared-memory pair_retrieve", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_pairs{200}; + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{num_pairs * 2, -1, -1}; + test_non_shmem_pair_retrieve(map, num_pairs); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{num_pairs * 2, -1, -1}; + test_non_shmem_pair_retrieve(map, num_pairs); + } +}