diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index 89ebf5758a5b55..a14fd2c4b224d8 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -336,6 +336,7 @@ namespace clang { bool isTupleSet() const { return Flags & IsTupleSet; } bool isReadZA() const { return Flags & IsReadZA; } bool isWriteZA() const { return Flags & IsWriteZA; } + bool setsFPMR() const { return Flags & SetsFPMR; } bool isReductionQV() const { return Flags & IsReductionQV; } uint64_t getBits() const { return Flags; } bool isFlagSet(uint64_t Flag) const { return Flags & Flag; } diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index d492fae4145b92..2dec17aae2af55 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2422,14 +2422,16 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; } -// -// Multi-vector scaling -// -let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { + // Multi-vector scaling def FSCALE_SINGLE_X2 : Inst<"svscale[_single_{d}_x2]", "22x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x2", [IsStreaming],[]>; def FSCALE_SINGLE_X4 : Inst<"svscale[_single_{d}_x4]", "44x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x4", [IsStreaming],[]>; def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>; def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; + + // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector + def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 50911fb63e818e..de10be7bdce0db 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -94,6 +94,7 @@ include "arm_immcheck_incl.td" // l: int64_t // m: uint32_t // n: uint64_t +// >: fpm_t // [: svuint8_t // t: svint32_t @@ -103,6 +104,7 @@ include "arm_immcheck_incl.td" // M: svfloat32_t // N: svfloat64_t // $: svbfloat16_t +// ~: svmfloat8_t // J: Prefetch type (sv_prfop) @@ -235,6 +237,7 @@ def IsInOutZA : FlagType<0x200000000000>; def IsInZT0 : FlagType<0x400000000000>; def IsOutZT0 : FlagType<0x800000000000>; def IsInOutZT0 : FlagType<0x1000000000000>; +def SetsFPMR : FlagType<0x2000000000000>; defvar InvalidMode = ""; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 0916e14f182ddd..27e5453628a5a9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10811,6 +10811,10 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, else if (TypeFlags.isUndef()) return UndefValue::get(Ty); else if (Builtin->LLVMIntrinsic != 0) { + // Emit set FPMR for intrinsics that require it + if (TypeFlags.setsFPMR()) + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), + Ops.pop_back_val()); if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp) InsertExplicitZeroOperand(Builder, Ty, Ops); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c new file mode 100644 index 00000000000000..5ba76671ff5d5b --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -0,0 +1,81 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_cvtl1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16( [[ZN:%.*]]) +// CHECK-NEXT: ret { , } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_cvtl1_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { , } [[TMP0]] +// +svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvtl2_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16( [[ZN:%.*]]) +// CHECK-NEXT: ret { , } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_cvtl2_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { , } [[TMP0]] +// +svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvtl1_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16( [[ZN:%.*]]) +// CHECK-NEXT: ret { , } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z18test_cvtl1_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { , } [[TMP0]] +// +svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvtl2_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16( [[ZN:%.*]]) +// CHECK-NEXT: ret { , } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z18test_cvtl2_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { , } [[TMP0]] +// +svbfloat16x2_t test_cvtl2_bf16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr); +} diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c new file mode 100644 index 00000000000000..09a80c9dff03ea --- /dev/null +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -verify -emit-llvm-only %s + +// REQUIRES: aarch64-registered-target + +#include + + +void test_features_sme2_fp8(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { + // expected-error@+1 {{'svcvtl1_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl1_f16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl2_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl2_f16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl1_bf16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl1_bf16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl2_bf16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl2_bf16_mf8_x2_fpm(zn, fpmr); +} \ No newline at end of file diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index e8883488f32356..e9fa01ea98dced 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -52,7 +52,7 @@ namespace { class SVEType { bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat, MFloat; bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp, - Svcount; + Svcount, Fpm; unsigned Bitwidth, ElementBitwidth, NumVectors; public: @@ -62,7 +62,7 @@ class SVEType { : Float(false), Signed(true), Immediate(false), Void(false), Constant(false), Pointer(false), BFloat(false), MFloat(false), DefaultType(false), IsScalable(true), Predicate(false), - PredicatePattern(false), PrefetchOp(false), Svcount(false), + PredicatePattern(false), PrefetchOp(false), Svcount(false), Fpm(false), Bitwidth(128), ElementBitwidth(~0U), NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(TS); @@ -101,6 +101,7 @@ class SVEType { bool isPrefetchOp() const { return PrefetchOp; } bool isSvcount() const { return Svcount; } bool isConstant() const { return Constant; } + bool isFpm() const { return Fpm; } unsigned getElementSizeInBits() const { return ElementBitwidth; } unsigned getNumVectors() const { return NumVectors; } @@ -497,6 +498,9 @@ std::string SVEType::str() const { if (isPrefetchOp()) return "enum svprfop"; + if (isFpm()) + return "fpm_t"; + std::string S; if (Void) S += "void"; @@ -752,6 +756,9 @@ void SVEType::applyModifier(char Mod) { ElementBitwidth = Bitwidth = 32; NumVectors = 0; break; + case '>': + Fpm = true; + [[fallthrough]]; case 'n': Predicate = false; Svcount = false; @@ -926,6 +933,12 @@ void SVEType::applyModifier(char Mod) { Float = false; BFloat = false; break; + case '~': + Float = false; + BFloat = false; + MFloat = true; + ElementBitwidth = 8; + break; case '.': llvm_unreachable(". is never a type in itself"); break; diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6a09a8647096f9..a91616b9556828 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3813,6 +3813,15 @@ let TargetPrefix = "aarch64" in { LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SME2_FP8_CVT_X2_Single_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_nxv16i8_ty], + [IntrReadMem, IntrInaccessibleMemOnly]>; + // + // CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector + // + def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; + def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10dad7675f4eaf..f9a4d65eac0922 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -383,6 +383,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPExtPair(SDNode *N, unsigned Opc); void SelectWhilePair(SDNode *N, unsigned Opc); void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode); + void SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, unsigned Opc); @@ -1866,6 +1867,27 @@ void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs, + unsigned Opcode) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SmallVector Ops(N->op_begin() + 2, N->op_end()); + Ops.push_back(/*Chain*/ N->getOperand(0)); + + SDNode *Instruction = + CurDAG->getMachineNode(Opcode, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, @@ -5547,6 +5569,18 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } + case Intrinsic::aarch64_sve_fp8_cvtl1_x2: + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::BF1CVTL_2ZZ_BtoH, AArch64::F1CVTL_2ZZ_BtoH})) + SelectCVTIntrinsicFP8(Node, 2, Opc); + return; + case Intrinsic::aarch64_sve_fp8_cvtl2_x2: + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::BF2CVTL_2ZZ_BtoH, AArch64::F2CVTL_2ZZ_BtoH})) + SelectCVTIntrinsicFP8(Node, 2, Opc); + return; } } break; case ISD::INTRINSIC_WO_CHAIN: { diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 8c256b5818ee88..776472e72af05a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -2412,7 +2412,7 @@ multiclass sme2p1_fp_cvt_vector_vg2_single { // SME2 multi-vec FP8 up convert two registers multiclass sme2p1_fp8_cvt_vector_vg2_single opc, bit L> { - def _NAME : sme2_cvt_unpk_vector_vg2{ + def NAME : sme2_cvt_unpk_vector_vg2{ let Uses = [FPMR, FPCR]; } } diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll new file mode 100644 index 00000000000000..076a3ad34eac3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming < %s | FileCheck %s + +; F1CVTL / F2CVTL + +define { , } @f1cvtl( %zm) { +; CHECK-LABEL: f1cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16( %zm) + ret { , } %res +} + +define { , } @f2cvtl( %zm) { +; CHECK-LABEL: f2cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxvbf16( %zm) + ret { , } %res +} + +; BF1CVTL / BF2CVTL + +define { , } @bf1cvtl( %zm) { +; CHECK-LABEL: bf1cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: bf1cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16( %zm) + ret { , } %res +} + +define { , } @bf2cvtl( %zm) { +; CHECK-LABEL: bf2cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: bf2cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16( %zm) + ret { , } %res +}