[Clang][AArch64] Add diagnostics for builtins that use ZT0. (#79140)

Similar to what we did for ZA, this patch adds diagnostics to flag when using a ZT0 builtin in a function that does not have ZT0 state.
llvm · Jan 23, 2024 · 1f6f199 · 1f6f199
1 parent d3a6a90
commit 1f6f199
Show file tree

Hide file tree

Showing 15 changed files with 122 additions and 76 deletions.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3170,6 +3170,9 @@ def warn_attribute_arm_sm_incompat_builtin : Warning<
 def warn_attribute_arm_za_builtin_no_za_state : Warning<
   "builtin call is not valid when calling from a function without active ZA state">,
   InGroup<DiagGroup<"undefined-arm-za">>;
+def warn_attribute_arm_zt0_builtin_no_zt0_state : Warning<
+  "builtin call is not valid when calling from a function without active ZT0 state">,
+  InGroup<DiagGroup<"undefined-arm-zt0">>;
 def err_sve_vector_in_non_sve_target : Error<
   "SVE vector type %0 cannot be used in a target without sve">;
 def err_attribute_riscv_rvv_bits_unsupported : Error<

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
@@ -636,37 +636,37 @@ let TargetGuard = "sme2,sme-i16i64" in {
 // Spill and fill of ZT0
 //
 let TargetGuard = "sme2" in {
-  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>;
-  def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
 //
 // Zero ZT0
 //
 let TargetGuard = "sme2" in {
-  def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
 //
 // lookup table expand four contiguous registers
 //
 let TargetGuard = "sme2" in {
-  def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
-  def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
+  def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
 }
 
 //
 // lookup table expand one register
 //
 let TargetGuard = "sme2" in {
-  def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>;
-  def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
+  def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>;
+  def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
 }
 
 //
 // lookup table expand two contiguous registers
 //
 let TargetGuard = "sme2" in {
-  def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
-  def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
+  def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
+  def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
 }
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -229,6 +229,9 @@ def IsStreamingOrSVE2p1             : FlagType<0x40000000000>; // Use for intrin
 def IsInZA                          : FlagType<0x80000000000>;
 def IsOutZA                         : FlagType<0x100000000000>;
 def IsInOutZA                       : FlagType<0x200000000000>;
+def IsInZT0                         : FlagType<0x400000000000>;
+def IsOutZT0                        : FlagType<0x800000000000>;
+def IsInOutZT0                      : FlagType<0x1000000000000>;
 
 // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
 class ImmCheckType<int val> {

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
@@ -3013,6 +3013,11 @@ enum ArmSMEState : unsigned {
   ArmOutZA = 0b10,
   ArmInOutZA = 0b11,
   ArmZAMask = 0b11,
+
+  ArmInZT0 = 0b01 << 2,
+  ArmOutZT0 = 0b10 << 2,
+  ArmInOutZT0 = 0b11 << 2,
+  ArmZT0Mask = 0b11 << 2
 };
 
 bool Sema::ParseSVEImmChecks(
@@ -3206,6 +3211,13 @@ static bool hasArmZAState(const FunctionDecl *FD) {
          (FD->hasAttr<ArmNewAttr>() && FD->getAttr<ArmNewAttr>()->isNewZA());
 }
 
+static bool hasArmZT0State(const FunctionDecl *FD) {
+  const auto *T = FD->getType()->getAs<FunctionProtoType>();
+  return (T && FunctionType::getArmZT0State(T->getAArch64SMEAttributes()) !=
+                   FunctionType::ARM_None) ||
+         (FD->hasAttr<ArmNewAttr>() && FD->getAttr<ArmNewAttr>()->isNewZT0());
+}
+
 static ArmSMEState getSMEState(unsigned BuiltinID) {
   switch (BuiltinID) {
   default:
@@ -3233,6 +3245,11 @@ bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       Diag(TheCall->getBeginLoc(),
            diag::warn_attribute_arm_za_builtin_no_za_state)
           << TheCall->getSourceRange();
+
+    if ((getSMEState(BuiltinID) & ArmZT0Mask) && !hasArmZT0State(FD))
+      Diag(TheCall->getBeginLoc(),
+           diag::warn_attribute_arm_zt0_builtin_no_zt0_state)
+          << TheCall->getSourceRange();
   }
 
   // Range check SME intrinsics that take immediate values.

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
@@ -20,7 +20,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("za") {
+void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("zt0") {
   svldr_zt(0, base);
 }
 
@@ -36,6 +36,6 @@ void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("za")
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svstr_zt(void *base) __arm_streaming_compatible __arm_in("za") {
+void test_svstr_zt(void *base) __arm_streaming_compatible __arm_in("zt0") {
   svstr_zt(0, base);
 }
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
@@ -19,7 +19,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u8(0, zn, 15);
 }
 
@@ -34,7 +34,7 @@ svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s8(0, zn, 15);
 }
 
@@ -48,7 +48,7 @@ svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u16(0, zn, 15);
 }
 
@@ -63,7 +63,7 @@ svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s16(0, zn, 15);
 }
 
@@ -77,7 +77,7 @@ svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.luti2.lane.zt.nxv8f16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_f16(0, zn, 15);
 }
 
@@ -91,7 +91,7 @@ svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za"
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.luti2.lane.zt.nxv8bf16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_bf16(0, zn, 15);
 }
 
@@ -105,7 +105,7 @@ svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("z
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u32(0, zn, 15);
 }
 
@@ -119,7 +119,7 @@ svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s32(0, zn, 15);
 }
 
@@ -133,6 +133,6 @@ svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.luti2.lane.zt.nxv4f32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_f32(0, zn, 15);
 }
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
@@ -26,7 +26,7 @@
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u8_x2(0, zn, 7);
 }
 
@@ -49,7 +49,7 @@ svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s8_x2(0, zn, 7);
 }
 
@@ -71,7 +71,7 @@ svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za")
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u16_x2(0, zn, 7);
 }
 
@@ -94,7 +94,7 @@ svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s16_x2(0, zn, 7);
 }
 
@@ -116,7 +116,7 @@ svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za"
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_f16_x2(0, zn, 7);
 }
 
@@ -138,7 +138,7 @@ svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
+svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_bf16_x2(0, zn, 7);
 }
 
@@ -160,7 +160,7 @@ svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in(
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_u32_x2(0, zn, 7);
 }
 
@@ -182,7 +182,7 @@ svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_s32_x2(0, zn, 7);
 }
 
@@ -204,6 +204,6 @@ svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za"
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
+svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti2_lane_zt_f32_x2(0, zn, 7);
 }