add comments and run full CI -retrigger

alpaka-group · Jan 28, 2025 · 07f9962 · 07f9962
1 parent bf59c1d
commit 07f9962
Showing 1 changed file with 9 additions and 1 deletion.
diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -380,16 +380,24 @@ void testKernels()
                 bufAccInputBPtr,
                 bufAccOutputCPtr, // this is used here a kind of dummy
                 static_cast<alpaka::Idx<AccType>>(arraySize));
-            auto const maxThreadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
 
+            // Get the maximum number of threads per block (maxThreadsPerBlock) from the kernel function attributes
+            auto const maxThreadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
+            // The benchmark uses 1024 threads per block; if the system does not allow that, use the max value
             auto threadsPerBlock
                 = maxThreadsPerBlock < blockThreadExtentMain ? maxThreadsPerBlock : blockThreadExtentMain;
 
+            // The reduce operation in the dot kernel needs an even block size
             if(threadsPerBlock != 1 && threadsPerBlock % 2 != 0)
             {
                 threadsPerBlock -= 1;
             }
 
+            // Although this code also runs the dot product kernel on multi-threaded CPU backends, the kernel is
+            // only used for benchmarking GPU backends, where the work division is fixed: 256, 1024, 1.
+            // Hence the block size should be 1024 for GPU backends. On multi-threaded CPU backends this code
+            // also runs, and in that case the block size could be different.
+            // https://github.com/UoB-HPC/BabelStream/blob/main/src/cuda/CUDAStream.cu
             auto workDiv = alpaka::WorkDivMembers{
                 Vec::all(static_cast<alpaka::Idx<AccType>>(dotGridBlockExtent)),
                 Vec::all(static_cast<alpaka::Idx<AccType>>(threadsPerBlock)),