Skip to content

Commit

Permalink
[RISCV] Reduce the LMUL for a vrgather operation if legal (llvm#125768)
Browse files Browse the repository at this point in the history
If we're lowering a shuffle to a vrgather (or vcompress), and we know
that only a prefix of the source elements influences the defined
destination lanes, perform the operation at a narrower LMUL.
  • Loading branch information
preames authored Feb 6, 2025
1 parent f5c4f27 commit f2ac265
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 49 deletions.
28 changes: 25 additions & 3 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5645,6 +5645,30 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}

// If only a prefix of the source elements influence a prefix of the
// destination elements, try to see if we can reduce the required LMUL
unsigned MinVLen = Subtarget.getRealMinVLen();
unsigned MinVLMAX = MinVLen / VT.getScalarSizeInBits();
if (NumElts > MinVLMAX) {
unsigned MaxIdx = 0;
for (auto [I, M] : enumerate(Mask)) {
if (M == -1)
continue;
MaxIdx = std::max(std::max((unsigned)I, (unsigned)M), MaxIdx);
}
unsigned NewNumElts =
std::max((uint64_t)MinVLMAX, PowerOf2Ceil(MaxIdx + 1));
if (NewNumElts != NumElts) {
MVT NewVT = MVT::getVectorVT(VT.getVectorElementType(), NewNumElts);
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, V1, ZeroIdx);
SDValue Res = DAG.getVectorShuffle(NewVT, DL, V1, DAG.getUNDEF(NewVT),
Mask.take_front(NewNumElts));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Res,
ZeroIdx);
}
}

// Before hitting generic lowering fallbacks, try to widen the mask
// to a wider SEW.
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
Expand Down Expand Up @@ -5717,9 +5741,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDValue Gather;
// If we have a locally repeating mask, then we can reuse the first register
// in the index register group for all registers within the source register
// group. TODO: This generalizes to m2, and m4. Also, this is currently
// picking up cases with a fully undef tail which could be more directly
// handled with fewer redundant vrgathers
// group. TODO: This generalizes to m2, and m4.
const MVT M1VT = getLMUL1VT(ContainerVT);
auto VLMAX = RISCVTargetLowering::computeVLMAXBounds(M1VT, Subtarget).first;
if (ContainerVT.bitsGT(M1VT) && isLocalRepeatingShuffle(Mask, VLMAX)) {
Expand Down
48 changes: 14 additions & 34 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1341,36 +1341,16 @@ define void @shuffle_i256_splat(ptr %p) nounwind {
}

define <16 x i32> @shuffle_m1_prefix(<16 x i32> %a) {
; RV32-LABEL: shuffle_m1_prefix:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, %hi(.LCPI84_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI84_0)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; RV32-NEXT: vrgatherei16.vv v13, v9, v16
; RV32-NEXT: vrgatherei16.vv v12, v8, v16
; RV32-NEXT: vrgatherei16.vv v14, v10, v16
; RV32-NEXT: vrgatherei16.vv v15, v11, v16
; RV32-NEXT: vmv4r.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: shuffle_m1_prefix:
; RV64: # %bb.0:
; RV64-NEXT: lui a0, 131073
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: addi a0, a0, 3
; RV64-NEXT: slli a0, a0, 16
; RV64-NEXT: addi a0, a0, 2
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vmv.v.x v16, a0
; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; RV64-NEXT: vrgatherei16.vv v13, v9, v16
; RV64-NEXT: vrgatherei16.vv v12, v8, v16
; RV64-NEXT: vrgatherei16.vv v14, v10, v16
; RV64-NEXT: vrgatherei16.vv v15, v11, v16
; RV64-NEXT: vmv4r.v v8, v12
; RV64-NEXT: ret
; CHECK-LABEL: shuffle_m1_prefix:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, 8208
; CHECK-NEXT: addi a0, a0, 770
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v9, a0
; CHECK-NEXT: vsext.vf4 v10, v9
; CHECK-NEXT: vrgather.vv v12, v8, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
ret <16 x i32> %out
}
Expand All @@ -1380,10 +1360,10 @@ define <16 x i32> @shuffle_m2_prefix(<16 x i32> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI85_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI85_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle16.v v14, (a0)
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 5, i32 2, i32 3, i32 5, i32 7, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
ret <16 x i32> %out
Expand Down
20 changes: 8 additions & 12 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -525,29 +525,25 @@ define void @vnsrl_0_i32_single_src_m8(ptr %in, ptr %out) {
; V-NEXT: li a2, 64
; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; V-NEXT: vle32.v v8, (a0)
; V-NEXT: lui a0, 341
; V-NEXT: addiw a0, a0, 1365
; V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; V-NEXT: vmv.s.x v16, a0
; V-NEXT: vsetivli zero, 16, e32, m2, ta, ma
; V-NEXT: vnsrl.wi v16, v8, 0
; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; V-NEXT: vcompress.vm v24, v8, v16
; V-NEXT: vse32.v v24, (a1)
; V-NEXT: vse32.v v16, (a1)
; V-NEXT: ret
;
; ZVE32F-LABEL: vnsrl_0_i32_single_src_m8:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: li a2, 64
; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; ZVE32F-NEXT: vle32.v v8, (a0)
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; ZVE32F-NEXT: vmv.v.i v16, 0
; ZVE32F-NEXT: lui a0, 341
; ZVE32F-NEXT: addi a0, a0, 1365
; ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma
; ZVE32F-NEXT: vmv.s.x v16, a0
; ZVE32F-NEXT: vmv.s.x v12, a0
; ZVE32F-NEXT: li a0, 32
; ZVE32F-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; ZVE32F-NEXT: vcompress.vm v16, v8, v12
; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; ZVE32F-NEXT: vcompress.vm v24, v8, v16
; ZVE32F-NEXT: vse32.v v24, (a1)
; ZVE32F-NEXT: vse32.v v16, (a1)
; ZVE32F-NEXT: ret
entry:
%0 = load <64 x i32>, ptr %in, align 4
Expand Down

0 comments on commit f2ac265

Please sign in to comment.