Skip to content

Commit

Permalink
PTX: mbarrier.{test,try}_wait: Fix return value (#3670) (#3672)
Browse files Browse the repository at this point in the history
* mbarrier.{test,try}_wait: Fix return value

(cherry picked from commit f61670e)

Co-authored-by: Allard Hendriksen <[email protected]>
  • Loading branch information
github-actions[bot] and ahendriksen authored Feb 4, 2025
1 parent 7d3a4d7 commit 25901d7
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 156 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ mbarrier.test_wait.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait(
__device__ static inline bool mbarrier_test_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state);
Expand All @@ -62,9 +61,8 @@ mbarrier.test_wait.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait(
__device__ static inline bool mbarrier_test_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state);
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait_parity(
__device__ static inline bool mbarrier_test_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity);
Expand All @@ -62,9 +61,8 @@ mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait_parity(
__device__ static inline bool mbarrier_test_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity);
12 changes: 4 additions & 8 deletions docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,9 @@ mbarrier.try_wait.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait(
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state,
const uint32_t& suspendTimeHint);
Expand All @@ -104,10 +103,9 @@ mbarrier.try_wait.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait(
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state,
const uint32_t& suspendTimeHint);
Expand All @@ -120,10 +118,9 @@ mbarrier.try_wait.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait(
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state);
Expand All @@ -135,9 +132,8 @@ mbarrier.try_wait.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait(
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state);
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,9 @@ mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait_parity(
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity,
const uint32_t& suspendTimeHint);
Expand All @@ -104,10 +103,9 @@ mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait_parity(
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity,
const uint32_t& suspendTimeHint);
Expand All @@ -120,10 +118,9 @@ mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait_parity(
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity);
Expand All @@ -135,9 +132,8 @@ mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_try_wait_parity(
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity);
Original file line number Diff line number Diff line change
Expand Up @@ -89,49 +89,47 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait(
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait(
__device__ static inline bool mbarrier_test_wait(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint64_t& state);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__();
template <dot_scope _Scope>
_CCCL_DEVICE static inline void mbarrier_test_wait(
sem_relaxed_t,
scope_t<_Scope> __scope,
bool __waitComplete,
_CUDA_VSTD::uint64_t* __addr,
const _CUDA_VSTD::uint64_t& __state)
_CCCL_DEVICE static inline bool mbarrier_test_wait(
sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state)
{
// __sem == sem_relaxed (due to parameter type constraint)
static_assert(__scope == scope_cta || __scope == scope_cluster, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
_CUDA_VSTD::uint32_t __waitComplete;
_CCCL_IF_CONSTEXPR (__scope == scope_cta)
{
asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
"setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
"mbarrier.test_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
asm("{\n\t .reg .pred P_OUT; \n\t"
"mbarrier.test_wait.relaxed.cta.shared::cta.b64 P_OUT, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state)
: "=r"(__waitComplete)
: "r"(__as_ptr_smem(__addr)), "l"(__state)
: "memory");
}
else _CCCL_IF_CONSTEXPR (__scope == scope_cluster)
{
asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
"setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
"mbarrier.test_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
asm("{\n\t .reg .pred P_OUT; \n\t"
"mbarrier.test_wait.relaxed.cluster.shared::cta.b64 P_OUT, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state)
: "=r"(__waitComplete)
: "r"(__as_ptr_smem(__addr)), "l"(__state)
: "memory");
}
return static_cast<bool>(__waitComplete);
# else
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__();
return false;
# endif
}
#endif // __cccl_ptx_isa >= 860
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,49 +90,47 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity(
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline void mbarrier_test_wait_parity(
__device__ static inline bool mbarrier_test_wait_parity(
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
bool waitComplete,
uint64_t* addr,
const uint32_t& phaseParity);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__();
template <dot_scope _Scope>
_CCCL_DEVICE static inline void mbarrier_test_wait_parity(
sem_relaxed_t,
scope_t<_Scope> __scope,
bool __waitComplete,
_CUDA_VSTD::uint64_t* __addr,
const _CUDA_VSTD::uint32_t& __phaseParity)
_CCCL_DEVICE static inline bool mbarrier_test_wait_parity(
sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity)
{
// __sem == sem_relaxed (due to parameter type constraint)
static_assert(__scope == scope_cta || __scope == scope_cluster, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
_CUDA_VSTD::uint32_t __waitComplete;
_CCCL_IF_CONSTEXPR (__scope == scope_cta)
{
asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
"setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
"mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
asm("{\n\t .reg .pred P_OUT; \n\t"
"mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 P_OUT, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
: "=r"(__waitComplete)
: "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
: "memory");
}
else _CCCL_IF_CONSTEXPR (__scope == scope_cluster)
{
asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
"setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
"mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
asm("{\n\t .reg .pred P_OUT; \n\t"
"mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 P_OUT, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
: "=r"(__waitComplete)
: "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
: "memory");
}
return static_cast<bool>(__waitComplete);
# else
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__();
return false;
# endif
}
#endif // __cccl_ptx_isa >= 860
Expand Down
Loading

0 comments on commit 25901d7

Please sign in to comment.