diff --git a/test/unit/warp/src/ShflDown.cpp b/test/unit/warp/src/ShflDown.cpp new file mode 100644 index 000000000000..a774eed12a7b --- /dev/null +++ b/test/unit/warp/src/ShflDown.cpp @@ -0,0 +1,180 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstrict-overflow" +#endif + +/* Kernel for a warp of one thread: with offset 0, shfl_down must return the caller's own int and float values. For accelerators with a non-zero dimension it also checks that the warp size is 1; otherwise it calls the overload with an explicit width of 1. Outcomes are reported through ALPAKA_CHECK on *success. */ struct ShflDownSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_down(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +/* Kernel for a warp of several threads; it checks that the block holds exactly one warp. It verifies shfl_down with offsets 0 and 1, then with every width 1, 2, 4, ... below the warp size, where a thread is expected to keep its own value if the source index would leave its width-sized segment. Finally the threads in the upper half of the warp return and the lower half shuffles again, checking only results whose source thread is still in the lower half. */ template +struct ShflDownMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, 
alpaka::warp::shfl_down(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, threadIdxInWarp, 0) == threadIdxInWarp); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_down(acc, threadIdxInWarp, 1) + == (threadIdxInWarp + 1 < warpExtent ? threadIdxInWarp + 1 : threadIdxInWarp)); + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + int const off = width * (threadIdxInWarp / width); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast(idx), width) + == ((threadIdxInWarp + idx < (width + off)) ? threadIdxInWarp + idx : threadIdxInWarp)); + float const ans = alpaka::warp::shfl_down( + acc, + 4.0f - float(threadIdxInWarp), + static_cast(idx), + width); + float const expect + = ((threadIdxInWarp + idx < (width + off)) ? (4.0f - float(threadIdxInWarp + idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + auto const shfl = alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast(idx)); + float const ans + = alpaka::warp::shfl_down(acc, 4.0f - float(threadIdxInWarp), static_cast(idx)); + float const expect + = ((threadIdxInWarp + idx < warpExtent / 2) ? 
(4.0f - float(threadIdxInWarp + idx)) : 0); + if(threadIdxInWarp + idx < warpExtent / 2) + { + ALPAKA_CHECK(*success, shfl == threadIdxInWarp + idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + } +}; + +/* Specialisation of alpaka::trait::WarpSize built on std::integral_constant. NOTE(review): the template parameter and argument lists are missing from this copy of the patch; presumably this pins the warp size to the kernel's template parameter - confirm against the real file. */ template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +/* Runs for every accelerator in alpaka::test::TestAccs and every warp size reported for device 0. A 0-dimensional accelerator or a warp size of 1 runs the single-thread kernel; otherwise blocks of one warp (2 blocks per grid dimension) run the multi-thread kernel instantiated for warp size 4, 8, 16, 32 or 64. NOTE(review): any other warp size launches nothing and is not reported as a failure. */ TEMPLATE_LIST_TEST_CASE("shfl_down", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflDownSingleThreadWarpTestKernel{})); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<32>{})); + } + else 
if(warpExtent == 64) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<64>{})); + } +#endif + } + } +} + +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop +#endif diff --git a/test/unit/warp/src/ShflUp.cpp b/test/unit/warp/src/ShflUp.cpp new file mode 100644 index 000000000000..b1b67eb9e683 --- /dev/null +++ b/test/unit/warp/src/ShflUp.cpp @@ -0,0 +1,172 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* Kernel for a warp of one thread: with offset 0, shfl_up must return the caller's own int and float values. For accelerators with a non-zero dimension it also checks that the warp size is 1; otherwise it calls the overload with an explicit width of 1. Outcomes are reported through ALPAKA_CHECK on *success. */ struct ShflUpSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_up(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +/* Kernel for a warp of several threads; it checks that the block holds exactly one warp. It verifies shfl_up with offsets 0 and 1, then with every width 1, 2, 4, ... below the warp size, where a thread is expected to keep its own value if the source index would fall below the start of its width-sized segment. Finally the threads in the upper half of the warp return and the lower half shuffles again, expecting its own value whenever the source index would be negative. */ template +struct ShflUpMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + 
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, threadIdxInWarp, 0) == threadIdxInWarp); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, 1) + == (threadIdxInWarp - 1 >= 0 ? threadIdxInWarp - 1 : threadIdxInWarp)); + + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + int const off = width * (threadIdxInWarp / width); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast(idx), width) + == ((threadIdxInWarp - idx >= off) ? threadIdxInWarp - idx : threadIdxInWarp)); + float const ans = alpaka::warp::shfl_up( + acc, + 4.0f - float(threadIdxInWarp), + static_cast(idx), + width); + float const expect + = ((threadIdxInWarp - idx >= off) ? (4.0f - float(threadIdxInWarp - idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast(idx)) + == ((threadIdxInWarp - idx >= 0) ? (threadIdxInWarp - idx) : threadIdxInWarp)); + float const ans + = alpaka::warp::shfl_up(acc, 4.0f - float(threadIdxInWarp), static_cast(idx)); + float const expect + = ((threadIdxInWarp - idx >= 0) ? 
(4.0f - float(threadIdxInWarp - idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } +}; + +/* Specialisation of alpaka::trait::WarpSize built on std::integral_constant. NOTE(review): the template parameter and argument lists are missing from this copy of the patch; presumably this pins the warp size to the kernel's template parameter - confirm against the real file. */ template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +/* Runs for every accelerator in alpaka::test::TestAccs and every warp size reported for device 0. A 0-dimensional accelerator or a warp size of 1 runs the single-thread kernel; otherwise blocks of one warp (2 blocks per grid dimension) run the multi-thread kernel instantiated for warp size 4, 8, 16, 32 or 64. NOTE(review): any other warp size launches nothing and is not reported as a failure. */ TEMPLATE_LIST_TEST_CASE("shfl_up", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflUpSingleThreadWarpTestKernel{})); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<64>{})); + } +#endif + 
} + } +} diff --git a/test/unit/warp/src/ShflXor.cpp b/test/unit/warp/src/ShflXor.cpp new file mode 100644 index 000000000000..977b7d62939e --- /dev/null +++ b/test/unit/warp/src/ShflXor.cpp @@ -0,0 +1,157 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* Kernel for a warp of one thread: shfl_xor must return the caller's own int and float values. For accelerators with a non-zero dimension it also checks that the warp size is 1 and passes -1 as the third argument; otherwise it calls the overload with an explicit width of 1. Outcomes are reported through ALPAKA_CHECK on *success. */ struct ShflXorSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, -1) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_xor(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +/* Kernel for a warp of several threads; it checks that the block holds exactly one warp. It verifies shfl_xor with third argument 0, 1 and -1 (the last on a value that is the same in every thread), then with every width 1, 2, 4, ... below the warp size, expecting the value held by thread (threadIdxInWarp ^ idx). Finally the threads in the upper half of the warp return and the lower half repeats the check for every idx below half the warp size. */ template +struct ShflXorMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 0) == 
threadIdxInWarp); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 1) == (threadIdxInWarp ^ 1)); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 5, -1) == 5); + + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx, width) == (threadIdxInWarp ^ idx)); + float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx, width); + float const expect = 4.0f - float(threadIdxInWarp ^ idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx) == (threadIdxInWarp ^ idx)); + float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx); + float const expect = 4.0f - float(threadIdxInWarp ^ idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } +}; + +/* Specialisation of alpaka::trait::WarpSize built on std::integral_constant. NOTE(review): the template parameter and argument lists are missing from this copy of the patch; presumably this pins the warp size to the kernel's template parameter - confirm against the real file. */ template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +/* Runs for every accelerator in alpaka::test::TestAccs and every warp size reported for device 0. A 0-dimensional accelerator or a warp size of 1 runs the single-thread kernel; otherwise blocks of one warp (2 blocks per grid dimension) run the multi-thread kernel instantiated for warp size 4, 8, 16, 32 or 64. NOTE(review): any other warp size launches nothing and is not reported as a failure. */ TEMPLATE_LIST_TEST_CASE("shfl_xor", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflXorSingleThreadWarpTestKernel{})); + } + else + { + // Work around gcc 7.5 trying and 
failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<64>{})); + } +#endif + } + } +}