From 43ce20eee979ffc8b41d38629e91605d7cce3c54 Mon Sep 17 00:00:00 2001 From: Slava Krutelyov Date: Mon, 12 Aug 2024 15:26:08 -0700 Subject: [PATCH] explicitly require 1D single block kernels to use Acc1D and have one block with asserts --- RecoTracker/LSTCore/src/alpaka/Event.dev.cc | 77 ++++++------------- RecoTracker/LSTCore/src/alpaka/MiniDoublet.h | 14 +++- RecoTracker/LSTCore/src/alpaka/Quintuplet.h | 16 +++- RecoTracker/LSTCore/src/alpaka/Segment.h | 16 +++- .../LSTCore/src/alpaka/TrackCandidate.h | 16 +++- RecoTracker/LSTCore/src/alpaka/Triplet.h | 16 +++- 6 files changed, 81 insertions(+), 74 deletions(-) diff --git a/RecoTracker/LSTCore/src/alpaka/Event.dev.cc b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc index 9e46c96a4488c..cc8872438dfe7 100644 --- a/RecoTracker/LSTCore/src/alpaka/Event.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc @@ -255,13 +255,10 @@ void lst::Event::addPixelSegmentToEvent(std::vector const& alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - Vec3D const threadsPerBlockCreateMD{1, 1, 1024}; - Vec3D const blocksPerGridCreateMD{1, 1, 1}; - WorkDiv3D const createMDArrayRangesGPU_workDiv = - createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; - alpaka::exec( + alpaka::exec( queue, createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_.data(), *rangesInGPU); auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); @@ -281,13 +278,10 @@ void lst::Event::addPixelSegmentToEvent(std::vector const& // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously. // If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them - Vec3D const threadsPerBlockCreateSeg{1, 1, 1024}; - Vec3D const blocksPerGridCreateSeg{1, 1, 1}; - WorkDiv3D const createSegmentArrayRanges_workDiv = - createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); + WorkDiv1D const createSegmentArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); lst::createSegmentArrayRanges createSegmentArrayRanges_kernel; - alpaka::exec(queue, + alpaka::exec(queue, createSegmentArrayRanges_workDiv, createSegmentArrayRanges_kernel, *modulesBuffers_.data(), @@ -388,13 +382,10 @@ void lst::Event::createMiniDoublets() { alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - Vec3D const threadsPerBlockCreateMD{1, 1, 1024}; - Vec3D const blocksPerGridCreateMD{1, 1, 1}; - WorkDiv3D const createMDArrayRangesGPU_workDiv = - createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; - alpaka::exec( + alpaka::exec( queue, createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_.data(), *rangesInGPU); auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); @@ -424,13 +415,10 @@ void lst::Event::createMiniDoublets() { *mdsInGPU, *rangesInGPU); - Vec3D const threadsPerBlockAddMD{1, 1, 1024}; - Vec3D const blocksPerGridAddMD{1, 1, 1}; - WorkDiv3D const addMiniDoubletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddMD, threadsPerBlockAddMD, elementsPerThread); + WorkDiv1D const addMiniDoubletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); lst::addMiniDoubletRangesToEventExplicit addMiniDoubletRangesToEventExplicit_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addMiniDoubletRangesToEventExplicit_workDiv, addMiniDoubletRangesToEventExplicit_kernel, *modulesBuffers_.data(), @@ -465,13 +453,10 @@ void lst::Event::createSegmentsWithModuleMap() { *segmentsInGPU, *rangesInGPU); - Vec3D const threadsPerBlockAddSeg{1, 1, 1024}; - Vec3D const blocksPerGridAddSeg{1, 1, 1}; - WorkDiv3D const addSegmentRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddSeg, threadsPerBlockAddSeg, elementsPerThread); + WorkDiv1D const addSegmentRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); lst::addSegmentRangesToEventExplicit addSegmentRangesToEventExplicit_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addSegmentRangesToEventExplicit_workDiv, addSegmentRangesToEventExplicit_kernel, *modulesBuffers_.data(), @@ -485,13 +470,10 @@ void lst::Event::createSegmentsWithModuleMap() { void lst::Event::createTriplets() { if (tripletsInGPU == nullptr) { - Vec3D const threadsPerBlockCreateTrip{1, 1, 1024}; - Vec3D const blocksPerGridCreateTrip{1, 1, 1}; - WorkDiv3D const createTripletArrayRanges_workDiv = - createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); + WorkDiv1D const createTripletArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); lst::createTripletArrayRanges createTripletArrayRanges_kernel; - alpaka::exec(queue, + alpaka::exec(queue, createTripletArrayRanges_workDiv, createTripletArrayRanges_kernel, *modulesBuffers_.data(), @@ -563,13 +545,10 @@ void lst::Event::createTriplets() { index_gpu_buf.data(), nonZeroModules); - Vec3D const threadsPerBlockAddTrip{1, 1, 1024}; - Vec3D const blocksPerGridAddTrip{1, 1, 1}; - WorkDiv3D const addTripletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddTrip, threadsPerBlockAddTrip, elementsPerThread); + WorkDiv1D const addTripletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); lst::addTripletRangesToEventExplicit addTripletRangesToEventExplicit_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addTripletRangesToEventExplicit_workDiv, addTripletRangesToEventExplicit_kernel, *modulesBuffers_.data(), @@ -604,13 +583,10 @@ void lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_ *segmentsInGPU, *pixelQuintupletsInGPU); - Vec3D const threadsPerBlock_addpT3asTrackCandidatesInGPU{1, 1, 512}; - Vec3D const blocksPerGrid_addpT3asTrackCandidatesInGPU{1, 1, 1}; - WorkDiv3D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv( - blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread); + WorkDiv1D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv({1}, {512}, {1}); lst::addpT3asTrackCandidatesInGPU addpT3asTrackCandidatesInGPU_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addpT3asTrackCandidatesInGPU_workDiv, addpT3asTrackCandidatesInGPU_kernel, nLowerModules_, @@ -849,13 +825,10 @@ void lst::Event::createPixelTriplets() { } void lst::Event::createQuintuplets() { - Vec3D const threadsPerBlockCreateQuints{1, 1, 1024}; - Vec3D const blocksPerGridCreateQuints{1, 1, 1}; - WorkDiv3D const createEligibleModulesListForQuintupletsGPU_workDiv = - createWorkDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread); + WorkDiv1D const createEligibleModulesListForQuintupletsGPU_workDiv = createWorkDiv({1}, {1024}, {1}); lst::createEligibleModulesListForQuintupletsGPU createEligibleModulesListForQuintupletsGPU_kernel; - alpaka::exec(queue, + alpaka::exec(queue, createEligibleModulesListForQuintupletsGPU_workDiv, createEligibleModulesListForQuintupletsGPU_kernel, *modulesBuffers_.data(), @@ -910,13 +883,10 @@ void lst::Event::createQuintuplets() { *quintupletsInGPU, *rangesInGPU); - Vec3D const threadsPerBlockAddQuint{1, 1, 1024}; - Vec3D const blocksPerGridAddQuint{1, 1, 1}; - WorkDiv3D const addQuintupletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddQuint, threadsPerBlockAddQuint, elementsPerThread); + WorkDiv1D const addQuintupletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); lst::addQuintupletRangesToEventExplicit addQuintupletRangesToEventExplicit_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addQuintupletRangesToEventExplicit_workDiv, addQuintupletRangesToEventExplicit_kernel, *modulesBuffers_.data(), @@ -1044,13 +1014,10 @@ void lst::Event::createPixelQuintuplets() { removeDupPixelQuintupletsInGPUFromMap_kernel, *pixelQuintupletsInGPU); - Vec3D const threadsPerBlockAddpT5asTrackCan{1, 1, 256}; - Vec3D const blocksPerGridAddpT5asTrackCan{1, 1, 1}; - WorkDiv3D const addpT5asTrackCandidateInGPU_workDiv = - createWorkDiv(blocksPerGridAddpT5asTrackCan, threadsPerBlockAddpT5asTrackCan, elementsPerThread); + WorkDiv1D const addpT5asTrackCandidateInGPU_workDiv = createWorkDiv({1}, {256}, {1}); lst::addpT5asTrackCandidateInGPU addpT5asTrackCandidateInGPU_kernel; - alpaka::exec(queue, + alpaka::exec(queue, addpT5asTrackCandidateInGPU_workDiv, addpT5asTrackCandidateInGPU_kernel, nLowerModules_, diff --git a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h index b4cbd500c7bf8..c00015384b77b 100644 --- a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h +++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h @@ -968,6 +968,10 @@ namespace lst { ALPAKA_FN_ACC void operator()(TAcc const& acc, struct lst::Modules modulesInGPU, struct lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -978,10 +982,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { short module_rings = modulesInGPU.rings[i]; short module_layers = modulesInGPU.layers[i]; short module_subdets = modulesInGPU.subdets[i]; @@ -1062,10 +1066,14 @@ namespace lst { struct lst::MiniDoublets mdsInGPU, struct lst::ObjectRanges rangesInGPU, struct lst::Hits hitsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (mdsInGPU.nMDs[i] == 0 or hitsInGPU.hitRanges[i * 2] == -1) { rangesInGPU.mdRanges[i * 2] = -1; rangesInGPU.mdRanges[i * 2 + 1] = -1; diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h index 49eb3b1902c9a..07b5f50dd57de 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -2669,6 +2669,10 @@ namespace lst { lst::Modules modulesInGPU, lst::Triplets tripletsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -2681,10 +2685,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. int occupancy, category_number, eta_number; - for (int i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (int i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { // Condition for a quintuple to exist for a module // TCs don't exist for layers 5 and 6 barrel, and layers 2,3,4,5 endcap short module_rings = modulesInGPU.rings[i]; @@ -2756,7 +2760,7 @@ namespace lst { // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { *rangesInGPU.nEligibleT5Modules = static_cast(nEligibleT5Modulesx); *rangesInGPU.device_nTotalQuints = static_cast(nTotalQuintupletsx); } @@ -2769,10 +2773,14 @@ namespace lst { lst::Modules modulesInGPU, lst::Quintuplets quintupletsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (quintupletsInGPU.nQuintuplets[i] == 0 or rangesInGPU.quintupletModuleIndices[i] == -1) { rangesInGPU.quintupletRanges[i * 2] = -1; rangesInGPU.quintupletRanges[i * 2 + 1] = -1; diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h b/RecoTracker/LSTCore/src/alpaka/Segment.h index cee59e316064a..cc8470f911a8b 100644 --- a/RecoTracker/LSTCore/src/alpaka/Segment.h +++ b/RecoTracker/LSTCore/src/alpaka/Segment.h @@ -801,6 +801,10 @@ namespace lst { lst::Modules modulesInGPU, lst::ObjectRanges rangesInGPU, lst::MiniDoublets mdsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -811,10 +815,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (modulesInGPU.nConnectedModules[i] == 0) { rangesInGPU.segmentModuleIndices[i] = nTotalSegments; rangesInGPU.segmentModuleOccupancy[i] = 0; @@ -888,7 +892,7 @@ namespace lst { // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { rangesInGPU.segmentModuleIndices[*modulesInGPU.nLowerModules] = nTotalSegments; *rangesInGPU.device_nTotalSegs = nTotalSegments; } @@ -901,10 +905,14 @@ namespace lst { lst::Modules modulesInGPU, lst::Segments segmentsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (segmentsInGPU.nSegments[i] == 0) { rangesInGPU.segmentRanges[i * 2] = -1; rangesInGPU.segmentRanges[i * 2 + 1] = -1; diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h index 03e853cea7d7b..24ef4b94de0f2 100644 --- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -389,13 +389,17 @@ namespace lst { lst::TrackCandidates trackCandidatesInGPU, lst::Segments segmentsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; - for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets; - pixelTripletIndex += gridThreadExtent[2]) { + for (unsigned int pixelTripletIndex = globalThreadIdx[0]; pixelTripletIndex < nPixelTriplets; + pixelTripletIndex += gridThreadExtent[0]) { if ((pixelTripletsInGPU.isDup[pixelTripletIndex])) continue; @@ -534,13 +538,17 @@ namespace lst { lst::TrackCandidates trackCandidatesInGPU, lst::Segments segmentsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets; unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; - for (int pixelQuintupletIndex = globalThreadIdx[2]; pixelQuintupletIndex < nPixelQuintuplets; - pixelQuintupletIndex += gridThreadExtent[2]) { + for (int pixelQuintupletIndex = globalThreadIdx[0]; pixelQuintupletIndex < nPixelQuintuplets; + pixelQuintupletIndex += gridThreadExtent[0]) { if (pixelQuintupletsInGPU.isDup[pixelQuintupletIndex]) continue; diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h b/RecoTracker/LSTCore/src/alpaka/Triplet.h index 3744dfb69e262..9fab052e6531f 100644 --- a/RecoTracker/LSTCore/src/alpaka/Triplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h @@ -928,6 +928,10 @@ namespace lst { lst::Modules modulesInGPU, lst::ObjectRanges rangesInGPU, lst::Segments segmentsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -938,10 +942,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (segmentsInGPU.nSegments[i] == 0) { rangesInGPU.tripletModuleIndices[i] = nTotalTriplets; rangesInGPU.tripletModuleOccupancy[i] = 0; @@ -1015,7 +1019,7 @@ namespace lst { // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { *rangesInGPU.device_nTotalTrips = nTotalTriplets; } } @@ -1027,10 +1031,14 @@ namespace lst { lst::Modules modulesInGPU, lst::Triplets tripletsInGPU, lst::ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (tripletsInGPU.nTriplets[i] == 0) { rangesInGPU.tripletRanges[i * 2] = -1; rangesInGPU.tripletRanges[i * 2 + 1] = -1;