diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h index c4025dd42d4a6..231170212a37b 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h +++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h @@ -42,7 +42,7 @@ namespace cms::alpakatools { } } - inline constexpr T cont& back() const { + inline constexpr T const &back() const { if (m_size > 0) { return m_data[m_size - 1]; } else diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h index bcf1da16a5533..0f94ad200efd9 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -9,6 +9,7 @@ #include #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace cms::alpakatools { @@ -26,20 +27,27 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { - if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + for (auto idx : independent_group_elements(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) { firstNeg = idx + 1; + } } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg, size)) { + ind2[idx - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } template @@ -52,51 +60,56 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { + for (auto idx : independent_group_elements(acc, size - 1)) { if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) firstNeg = idx + 1; } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg, size)) { + ind2[size - idx - 1] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } - // Radix sort implements a bytewise lexicographic order on the input data. // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first // and respecting the existing order when binning the values, we achieve the lexicographic ordering. // The number of bytes actually considered is a parameter template parameter. // The post processing reorder - // function fixes the order when bitwise ordering is not the order for the underlying type (only based on + // function fixes the order when bitwise ordering is not the order for the underlying type (only based on // most significant bit for signed types, integer or floating point). // The floating point numbers are reinterpret_cast into integers in the calling wrapper // This algorithm requires to run in a single block template // The post processing reorder function. + typename T, // shall be integer, signed or not does not matter here + int NS, // number of significant bytes to use in sorting. + typename RF> // The post processing reorder function. ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl( const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { if constexpr (!requires_single_thread_per_block_v) { const auto warpSize = alpaka::warp::getSize(acc); const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); // we expect a power of 2 here assert(warpSize && (0 == (warpSize & (warpSize - 1)))); - const std::size_t warpMask = warpSize - 1; + const std::size_t warpMask = warpSize - 1; // Define the bin size (d=8 => 1 byte bin). constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits; // Make sure the slices are data aligned - static_assert (0 == dataBits % binBits); + static_assert(0 == dataBits % binBits); // Make sure the NS parameter makes sense - static_assert (NS > 0 && NS <= sizeof(T)); + static_assert(NS > 0 && NS <= sizeof(T)); constexpr int binsNumber = 1 << binBits; constexpr int binsMask = binsNumber - 1; // Prefix scan iterations. NS is counted in full bytes and not slices. @@ -106,7 +119,7 @@ namespace cms::alpakatools { // TODO: rename auto& c = alpaka::declareSharedVar(acc); // Temporary storage for prefix scan. Only really needed for first-of-warp keeping - // Then used for thread to bin mapping TODO: change type to byte and remap to + // Then used for thread to bin mapping TODO: change type to byte and remap to auto& ct = alpaka::declareSharedVar(acc); // Bin to thread index mapping (used to store the highest thread index within a bin number // batch of threads. @@ -114,7 +127,7 @@ namespace cms::alpakatools { // lowest possible value (change to bytes?) auto& cu = alpaka::declareSharedVar(acc); // TODO we could also have an explicit caching of the current index for each thread. - + // TODO: do those have to be shared? auto& ibs = alpaka::declareSharedVar(acc); auto& currentSortingPass = alpaka::declareSharedVar(acc); @@ -129,36 +142,40 @@ namespace cms::alpakatools { auto k = ind2; // Initializer index order to trivial increment. - for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; } + for (auto idx : independent_group_elements(acc, size)) { + j[idx] = idx; + } alpaka::syncBlockThreads(acc); // Iterate on the slices of the data. while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { - for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; } + for (auto idx : independent_group_elements(acc, binsNumber)) { + c[idx] = 0; + } alpaka::syncBlockThreads(acc); const auto sortingPassShift = binBits * currentSortingPass; // fill bins (count elements in each bin) - for (auto idx: independent_group_elements(acc, size)) { + for (auto idx : independent_group_elements(acc, size)) { auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); - } + } alpaka::syncBlockThreads(acc); - - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d ", currentSortingPass - 1); + + if (!threadIdxLocal && 1 == alpaka::getIdx(acc)[0]) { + // printf("Pass=%d, Block=%d, ", currentSortingPass - 1, alpaka::getIdx(acc)[0]); size_t total = 0; - for(int i=0; i<(int)binsNumber; i++) { - printf("count[%d]=%d ", i, c[i] ); + for (int i = 0; i < (int)binsNumber; i++) { + // printf("count[%d]=%d ", i, c[i] ); total += c[i]; } - printf("total=%zu\n", total); + // printf("total=%zu\n", total); assert(total == size); } // prefix scan "optimized"???... // TODO: we might be able to reuse the warpPrefixScan function // Warp level prefix scan - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto x = c[idx]; auto laneId = idx & warpMask; @@ -170,16 +187,16 @@ namespace cms::alpakatools { ct[idx] = x; } alpaka::syncBlockThreads(acc); - + // Block level completion of prefix scan (add last sum of each preceding warp) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto ss = (idx / warpSize) * warpSize - 1; c[idx] = ct[idx]; for (int i = ss; i > 0; i -= warpSize) c[idx] += ct[i]; } // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin - + /* //prefix scan for the nulls (for documentation) if (threadIdxLocal==0) @@ -191,18 +208,11 @@ namespace cms::alpakatools { // This will reorder the indices by the currently considered slice, otherwise preserving the previous order. ibs = size - 1; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (post prefix scan) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("offset[%d]=%d ", i, c[i] ); - } - printf("\n"); - } // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { // Init - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { cu[idx] = -1; ct[idx] = -1; } @@ -210,7 +220,7 @@ namespace cms::alpakatools { // Find the highest index for all the threads dealing with a given bin (in cu[]) // Also record the bin for each thread (in ct[]) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; int32_t bin = -1; if (i >= 0) { @@ -220,24 +230,16 @@ namespace cms::alpakatools { } } alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (max index) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("max_i[%d]=%d ", i, cu[i] ); - } - printf("\n"); - } - // FIXME: we can slash a memory access. - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; // Are we still in inside the data? if (i >= 0) { int32_t bin = ct[idx]; // Are we the thread with the highest index (from previous pass)? if (cu[bin] == i) { - // With the highest index, we are actually the lowest thread number. We will + // With the highest index, we are actually the lowest thread number. We will // work "on behalf of" the higher thread numbers (including ourselves) // No way around scanning and testing for bin in ct[otherThread] number to find the other threads for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) { @@ -292,21 +294,17 @@ namespace cms::alpakatools { if (threadIdxLocal == 0) ++currentSortingPass; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && size == 257) { - printf("Pass=%d ", currentSortingPass - 1); - for(int i=0; i<(int)size; i++) { - printf("k[%d]=%d ", i, k[i] ); - } - printf("\n"); - } } if ((dataBits != 8) && (0 == (NS & 1))) - ALPAKA_ASSERT_OFFLOAD(j == ind); // dataBits/binBits is even so ind is correct (the result is in the right location) + ALPAKA_ASSERT_OFFLOAD(j == + ind); // dataBits/binBits is even so ind is correct (the result is in the right location) // TODO this copy is (doubly?) redundant with the reorder if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) - for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; }; + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + }; alpaka::syncBlockThreads(acc); @@ -321,41 +319,36 @@ namespace cms::alpakatools { template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, T>::type* = + nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && std::is_signed::value && + !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, reorderSigned); } template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); using I = int; - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); } - + template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(const TAcc& acc, - T const* v, - uint16_t* index, - uint32_t const* offsets, - uint16_t* workspace) { - // TODO: check + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti( + const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + // TODO: check // Sort multiple blocks of data in v[] separated by in chunks located at offsets[] // extern __shared__ uint16_t ws[]; - uint16_t * ws = alpaka::getDynSharedMem(acc); + uint16_t* ws = alpaka::getDynSharedMem(acc); const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); auto a = v + offsets[blockIdx]; @@ -402,23 +392,24 @@ namespace cms::alpakatools { auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx]; auto size = offsets[blockIdx + 1] - offsets[blockIdx]; assert(offsets[blockIdx + 1] >= offsets[blockIdx]); - if (0 == alpaka::getIdx(acc)[0]) { - printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", - blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]); - } if (size > 0) radixSort(acc, a, ind, ind2, size); } - + template struct radixSortMultiWrapper { -/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical + /* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical (keeping CUDA/HIP code for reference for the moment) #if defined(__CUDACC__) || defined(__HIPCC__) //__global__ void __launch_bounds__(256, 4) #endif */ template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; @@ -426,10 +417,39 @@ namespace cms::alpakatools { template struct radixSortMultiWrapper2 { template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; } // namespace cms::alpakatools +namespace alpaka::trait { + // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of + // block shared dynamic memory for the radixSortMultiWrapper kernel + template + struct BlockSharedMemDynSizeBytes, TAcc> { + // the size in bytes of the shared memory allocated for a block + ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes( + cms::alpakatools::radixSortMultiWrapper const& /* kernel */, + alpaka_common::Vec1D /* threads */, + alpaka_common::Vec1D /* elements */, + T const* /* v */, + uint16_t* /* index */, + uint32_t const* /* offsets */, + uint16_t* workspace, + size_t sharedMemBytes) { + if (workspace != nullptr) + return 0; + /* The shared memory workspace is 'blockspace * 2' in CUDA *but that's a value coincidence... TODO: check */ + //printf ("in BlockSharedMemDynSizeBytes, TAcc>: shared mem size = %d\n", (int)sharedMemBytes); + return sharedMemBytes; + } + }; +} // namespace alpaka::trait + #endif // HeterogeneousCore_AlpakaInterface_interface_radixSort_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 4c0aa9fe5f2b9..e02f4e92f813e 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -701,6 +701,12 @@ namespace cms::alpakatools { stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + thread_{alpaka::getIdx(acc)[0u] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + class const_iterator; using iterator = const_iterator; diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 463bf5dae4461..0198e36f9166f 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -58,7 +58,6 @@ - diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc index f4073997f5c97..1687feb8c1bab 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -42,7 +42,12 @@ struct finalize { TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) { SECTION("AtomicPairCounter") { auto const &devices = cms::alpakatools::devices(); - REQUIRE(!devices.empty()); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + REQUIRE(not devices.empty()); + } + // run the test on each device for (auto const &device : devices) { std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n'; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc index 5e08c05725888..4ce11cc7facdd 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -93,7 +93,7 @@ struct mykernel { #ifndef NDEBUG auto b0 = Hist::bin(v[j]); #endif - int tot = 0; + [[maybe_unused]] int tot = 0; auto ftest = [&](unsigned int k) { ALPAKA_ASSERT_OFFLOAD(k < N); ++tot; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc index c781f83ff1225..21f3477a3dd92 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -64,7 +64,8 @@ ALPAKA_FN_HOST_ACC T truncate(T const& t) { namespace { struct testKernel { template - ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { + ALPAKA_FN_ACC void operator()( + const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the track index //size_t gridSize = blockDim.x * gridDim.x; bool threadZero = !alpaka::getIdx(acc)[0u]; @@ -76,15 +77,15 @@ namespace { assert(0 == blocksIdx); assert(elements <= 2048); - auto &order = alpaka::declareSharedVar(acc); - auto &sws = alpaka::declareSharedVar(acc); - auto &z = alpaka::declareSharedVar(acc); - auto &iz = alpaka::declareSharedVar(acc); -// __shared__ uint16_t order[2048]; -// __shared__ uint16_t sws[2048]; -// __shared__ float z[2048]; -// __shared__ int iz[2048]; - for (auto itrack: elements_with_stride(acc, elements)) { + auto& order = alpaka::declareSharedVar(acc); + auto& sws = alpaka::declareSharedVar(acc); + auto& z = alpaka::declareSharedVar(acc); + auto& iz = alpaka::declareSharedVar(acc); + // __shared__ uint16_t order[2048]; + // __shared__ uint16_t sws[2048]; + // __shared__ float z[2048]; + // __shared__ int iz[2048]; + for (auto itrack : elements_with_stride(acc, elements)) { z[itrack] = gpu_input[itrack]; iz[itrack] = 10000 * gpu_input[itrack]; // order[itrack] = itrack; @@ -94,7 +95,7 @@ namespace { alpaka::syncBlockThreads(acc); //verify - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); @@ -122,7 +123,7 @@ namespace { radixSort(acc, iz, order, sws, elements); alpaka::syncBlockThreads(acc); - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(iz[ntrack] <= iz[mtrack]); @@ -146,11 +147,12 @@ namespace { } }; - void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { + void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { auto blockSize = 512; // somewhat arbitrary auto gridSize = 1; // round up to cover the sample size const auto workdiv = make_workdiv(gridSize, blockSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); alpaka::wait(queue); } } // namespace @@ -165,50 +167,54 @@ int main() { << ", the test will be skipped.\n"; return 0; } - + // run the test on each device for (auto const& device : devices) { Queue queue(device); -// FLOAT* gpu_input; -// int* gpu_product; + // FLOAT* gpu_input; + // int* gpu_product; int nmax = 4 * 260; auto gpu_input_h = cms::alpakatools::make_host_buffer(queue, nmax); auto i = gpu_input_h.data(); - for (auto v: { - 30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, - 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, - 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, - 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, - 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, - 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, - 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, - 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, - 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, - 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, - 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, - 3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, - 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, - 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, - 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, - 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, - 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, - 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, - 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, - 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, - 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, - 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, - 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, - 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, - 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, - 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, - 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, - 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, - 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427, - 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, - 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, - 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, - 4.7723139628, 4.774812736, 4.8577151299, 4.890037536}) { + for (auto v : {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, + 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, + 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, + 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, + 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026, + 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411, + 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456, + 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, + 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, + 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, + 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, + 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, + 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759, + 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729, + 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928, + 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, + 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, + 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, + 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, + 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, + 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346, + 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297, + 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066, + 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, + 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, + 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, + 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, + 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, + 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083, + 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332, + 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947, + 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, + 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, + 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, + 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, + 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, + 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299, + 4.890037536}) { *(i++) = v; } auto input = gpu_input_h.data(); @@ -219,7 +225,7 @@ int main() { } auto gpu_input_d = cms::alpakatools::make_device_buffer(queue, nmax); //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax)); -// cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); + // cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); auto gpu_product_d = cms::alpakatools::make_device_buffer(queue, nmax); // copy the input data to the GPU alpaka::memcpy(queue, gpu_input_d, gpu_input_h); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc index df41c649b91cb..9caf2af20eb9c 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -81,7 +81,7 @@ void truncate(T& t) { } template -void go(Queue & queue, bool useShared) { +void go(Queue& queue, bool useShared) { std::mt19937 eng; //std::mt19937 eng2; auto rgen = RS::ud(); @@ -145,46 +145,48 @@ void go(Queue & queue, bool useShared) { alpaka::memcpy(queue, v_d, v_h); alpaka::memcpy(queue, off_d, offsets_h); -// cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); -// cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "launch for " << offsets_h[blocks] << std::endl; - auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; + auto ntXBl = 1 == i % 4 ? 256 : 256; auto start = std::chrono::high_resolution_clock::now(); - // TODO: manage runtime sized shared memory - [[maybe_unused]] constexpr int MaxSize = 256 * 32; + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). + constexpr int MaxSize = 256 * 32; auto workdiv = make_workdiv(blocks, ntXBl); if (useShared) // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier - // - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper{}, v_d.data(), ind_d.data(), off_d.data(), nullptr)); -// cms::cuda::launch( -// radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + // + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper{}, + v_d.data(), + ind_d.data(), + off_d.data(), + nullptr, + MaxSize * sizeof(uint16_t))); else -// cms::cuda::launch( -// radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workdiv, radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "launch done for " << offsets_h[blocks] << std::endl; alpaka::memcpy(queue, ind_h, ind_d); alpaka::wait(queue); - //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += std::chrono::high_resolution_clock::now() - start; - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl; if (32 == i) { std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl; - std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl; + std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) + << std::endl; std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' ' << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl; } @@ -194,17 +196,16 @@ void go(Queue & queue, bool useShared) { inds.insert(ind_h[offsets_h[ib]]); for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { if (inds.count(ind_h[j]) != 0) { - printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", - i, ib, j, ind_h[j]); + printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]); std::vector counts; counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0); for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) { counts[ind_h[j2]]++; - } + } for (size_t j2 = 0; j2 < counts.size(); j2++) { - if (counts[j2]!=1) + if (counts[j2] != 1) printf("counts[%ld]=%d ", j2, counts[j2]); - } + } printf("\n"); printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j])); } @@ -216,9 +217,12 @@ void go(Queue & queue, bool useShared) { truncate(k1); truncate(k2); if (k1 < k2) { - std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 - << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 - << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; + std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " + << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j] + << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1 + << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl; + //sleep(2); + assert(false); } } if (!inds.empty()) { @@ -228,7 +232,7 @@ void go(Queue & queue, bool useShared) { if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib])) std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib]) << std::endl; - // + // assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib])); } } // 50 times diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc index f4f4719192c12..ce85ad42cb0f4 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -10,14 +10,14 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; // Kernel running a loop over threads/elements // One function with multiple flavors -enum class RangeType { - Default, ExtentLimited, ExtentLimitedWithShift -}; - -enum class LoopScope { - Block, Grid -}; +// The type of elements_with_stride +enum class RangeType { Default, ExtentLimited, ExtentLimitedWithShift }; + +// The concurrency scope between threads +enum class LoopScope { Block, Grid }; + +// Utility for one time initializations template bool constexpr firstInLoopRange(TAcc const& acc) { if constexpr (loopScope == LoopScope::Block) @@ -30,7 +30,7 @@ bool constexpr firstInLoopRange(TAcc const& acc) { template size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -41,9 +41,9 @@ size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { } template -size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) { +size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -53,12 +53,13 @@ size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shi return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; } -template +template struct testWordDivisionDefaultRange { template - ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const { - size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); - // Init the counter for block range. Grid range does so my mean of memset. + ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t shift, size_t* globalCounter) const { + size_t& counter = + (loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); + // Init the counter for block range. Grid range does so my mean of memset. if constexpr (loopScope == LoopScope::Block) { if (firstInLoopRange(acc)) counter = 0; @@ -66,13 +67,13 @@ struct testWordDivisionDefaultRange { } // The loop we are testing if constexpr (rangeType == RangeType::Default) - for ([[maybe_unused]] auto idx: elements_with_stride(acc)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimited) - for ([[maybe_unused]] auto idx: elements_with_stride(acc, size)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - for ([[maybe_unused]]auto idx: elements_with_stride(acc, size, shift)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, shift, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); // Check the result. Grid range will check by memcpy-ing the result. @@ -103,44 +104,87 @@ int main() { ssize_t BlockSize = 512; size_t GridSize = 4; for (size_t blocks = 1; blocks < GridSize * 3; blocks++) - for (auto sizeFuzz: std::initializer_list{ - BlockSize / 2, -1, 0, 1, BlockSize/2 }) - for (auto shift: std::initializer_list{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) { - // Grid level iteration: we need to initialize/check at the grid level + for (auto sizeFuzz : + std::initializer_list{-10 * BlockSize / 13, -BlockSize / 2, -1, 0, 1, BlockSize / 2}) + for (auto shift : std::initializer_list{0, + 1, + BlockSize / 2, + BlockSize - 1, + BlockSize, + BlockSize + 1, + BlockSize + BlockSize / 2, + 2 * BlockSize - 1, + 2 * BlockSize, + 2 * BlockSize + 1}) { + // Grid level iteration: we need to initialize/check at the grid level // Default range alpaka::memset(queue, counter_d, 0); auto workdiv = make_workdiv(BlockSize, GridSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - auto expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + auto expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimited range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimitedWithShift range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = expectedCount( + workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // Block level auto tests - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); } alpaka::wait(queue); } diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 416e647c349a4..ee8048431ab6d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x == 0) ++p; __syncthreads(); - } + } if ((w != 8) && (0 == (NS & 1))) assert(j == ind); // w/d is even so ind is correct @@ -261,7 +261,7 @@ namespace cms { template __global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu index 6b3ee2e4afa9c..8b6ffd70fd4e6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu @@ -74,7 +74,7 @@ namespace { __syncthreads(); radixSort(z, order, sws, elements); __syncthreads(); - + //verify for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) { auto ntrack = order[itrack]; diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 209ce97347e25..a15bed0ae8a1f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -151,10 +151,15 @@ void go(bool useShared) { auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; delta -= (std::chrono::high_resolution_clock::now() - start); + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). constexpr int MaxSize = 256 * 32; if (useShared) - cms::cuda::launch( - radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + cms::cuda::launch(radixSortMultiWrapper, + {blocks, ntXBl, MaxSize * 2 /* sizeof(uint16_t) */}, + v_d.get(), + ind_d.get(), + off_d.get(), + nullptr); else cms::cuda::launch( radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());