PTX: mbarrier.{test,try}_wait: Fix return value (#3670) (#3672)

* mbarrier.{test,try}_wait: Fix return value (cherry picked from commit f61670e) Co-authored-by: Allard Hendriksen <[email protected]>
NVIDIA · Feb 4, 2025 · 25901d7 · 25901d7
1 parent 7d3a4d7
commit 25901d7
Show file tree

Hide file tree

Showing 12 changed files with 122 additions and 156 deletions.
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst
@@ -47,10 +47,9 @@ mbarrier.test_wait.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_test_wait(
+   __device__ static inline bool mbarrier_test_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state);
 
@@ -62,9 +61,8 @@ mbarrier.test_wait.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_test_wait(
+   __device__ static inline bool mbarrier_test_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state);
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst
@@ -47,10 +47,9 @@ mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_test_wait_parity(
+   __device__ static inline bool mbarrier_test_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity);
 
@@ -62,9 +61,8 @@ mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_test_wait_parity(
+   __device__ static inline bool mbarrier_test_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity);
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst
@@ -88,10 +88,9 @@ mbarrier.try_wait.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait(
+   __device__ static inline bool mbarrier_try_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state,
      const uint32_t& suspendTimeHint);
@@ -104,10 +103,9 @@ mbarrier.try_wait.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait(
+   __device__ static inline bool mbarrier_try_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state,
      const uint32_t& suspendTimeHint);
@@ -120,10 +118,9 @@ mbarrier.try_wait.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait(
+   __device__ static inline bool mbarrier_try_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state);
 
@@ -135,9 +132,8 @@ mbarrier.try_wait.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait(
+   __device__ static inline bool mbarrier_try_wait(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint64_t& state);
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst
@@ -88,10 +88,9 @@ mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait_parity(
+   __device__ static inline bool mbarrier_try_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity,
      const uint32_t& suspendTimeHint);
@@ -104,10 +103,9 @@ mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait_parity(
+   __device__ static inline bool mbarrier_try_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity,
      const uint32_t& suspendTimeHint);
@@ -120,10 +118,9 @@ mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait_parity(
+   __device__ static inline bool mbarrier_try_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity);
 
@@ -135,9 +132,8 @@ mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64
    // .sem       = { .relaxed }
    // .scope     = { .cta, .cluster }
    template <cuda::ptx::dot_scope Scope>
-   __device__ static inline void mbarrier_try_wait_parity(
+   __device__ static inline bool mbarrier_try_wait_parity(
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
-     bool waitComplete,
      uint64_t* addr,
      const uint32_t& phaseParity);
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h
@@ -89,49 +89,47 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait(
 // .sem       = { .relaxed }
 // .scope     = { .cta, .cluster }
 template <cuda::ptx::dot_scope Scope>
-__device__ static inline void mbarrier_test_wait(
+__device__ static inline bool mbarrier_test_wait(
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
-  bool waitComplete,
   uint64_t* addr,
   const uint64_t& state);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
-_CCCL_DEVICE static inline void mbarrier_test_wait(
-  sem_relaxed_t,
-  scope_t<_Scope> __scope,
-  bool __waitComplete,
-  _CUDA_VSTD::uint64_t* __addr,
-  const _CUDA_VSTD::uint64_t& __state)
+_CCCL_DEVICE static inline bool mbarrier_test_wait(
+  sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state)
 {
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
+  _CUDA_VSTD::uint32_t __waitComplete;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
   {
-    asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
-        "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
-        "mbarrier.test_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
+    asm("{\n\t .reg .pred P_OUT; \n\t"
+        "mbarrier.test_wait.relaxed.cta.shared::cta.b64 P_OUT, [%1], %2;\n\t"
+        "selp.b32 %0, 1, 0, P_OUT; \n"
         "}"
-        :
-        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state)
+        : "=r"(__waitComplete)
+        : "r"(__as_ptr_smem(__addr)), "l"(__state)
         : "memory");
   }
   else _CCCL_IF_CONSTEXPR (__scope == scope_cluster)
   {
-    asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
-        "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
-        "mbarrier.test_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
+    asm("{\n\t .reg .pred P_OUT; \n\t"
+        "mbarrier.test_wait.relaxed.cluster.shared::cta.b64 P_OUT, [%1], %2;\n\t"
+        "selp.b32 %0, 1, 0, P_OUT; \n"
         "}"
-        :
-        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state)
+        : "=r"(__waitComplete)
+        : "r"(__as_ptr_smem(__addr)), "l"(__state)
         : "memory");
   }
+  return static_cast<bool>(__waitComplete);
 #  else
   // Unsupported architectures will have a linker error with a semi-decent error message
   __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__();
+  return false;
 #  endif
 }
 #endif // __cccl_ptx_isa >= 860

diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h
@@ -90,49 +90,47 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity(
 // .sem       = { .relaxed }
 // .scope     = { .cta, .cluster }
 template <cuda::ptx::dot_scope Scope>
-__device__ static inline void mbarrier_test_wait_parity(
+__device__ static inline bool mbarrier_test_wait_parity(
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
-  bool waitComplete,
   uint64_t* addr,
   const uint32_t& phaseParity);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
-_CCCL_DEVICE static inline void mbarrier_test_wait_parity(
-  sem_relaxed_t,
-  scope_t<_Scope> __scope,
-  bool __waitComplete,
-  _CUDA_VSTD::uint64_t* __addr,
-  const _CUDA_VSTD::uint32_t& __phaseParity)
+_CCCL_DEVICE static inline bool mbarrier_test_wait_parity(
+  sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity)
 {
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
+  _CUDA_VSTD::uint32_t __waitComplete;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
   {
-    asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
-        "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
-        "mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
+    asm("{\n\t .reg .pred P_OUT; \n\t"
+        "mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 P_OUT, [%1], %2;\n\t"
+        "selp.b32 %0, 1, 0, P_OUT; \n"
         "}"
-        :
-        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
+        : "=r"(__waitComplete)
+        : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
         : "memory");
   }
   else _CCCL_IF_CONSTEXPR (__scope == scope_cluster)
   {
-    asm("{\n\t .reg .pred PRED_waitComplete; \n\t"
-        "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t"
-        "mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t"
+    asm("{\n\t .reg .pred P_OUT; \n\t"
+        "mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 P_OUT, [%1], %2;\n\t"
+        "selp.b32 %0, 1, 0, P_OUT; \n"
         "}"
-        :
-        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
+        : "=r"(__waitComplete)
+        : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity)
         : "memory");
   }
+  return static_cast<bool>(__waitComplete);
 #  else
   // Unsupported architectures will have a linker error with a semi-decent error message
   __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__();
+  return false;
 #  endif
 }
 #endif // __cccl_ptx_isa >= 860