[AMDGPU] support 64-bit immediates in SIInstrInfo::FoldImmediate #69260

Merged
merged 3 commits into llvm:main from fold-64-imm on Oct 17, 2023

Conversation

rampitec (Collaborator)

This is a part of #67781. Until we select more 64-bit move immediates, the impact is minimal.

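For illustration, a minimal self-contained sketch (plain C++ with hypothetical names, mirroring the `getImmFor` lambda added in the diff below) of the core idea: when a use reads only a sub-register of the 64-bit immediate def, the folded constant is the matching slice, with 16-bit slices sign-extended.

```cpp
#include <cstdint>

// Hypothetical stand-ins for the AMDGPU sub-register indices; in-tree these
// come from the generated AMDGPU register-info enums.
enum SubRegIdx { NoSubRegister, sub0, sub1, lo16, hi16, sub1_lo16, sub1_hi16 };

// Which slice of a 64-bit immediate a partial use reads. 32-bit slices are
// zero-extended into the int64_t return, 16-bit slices sign-extended,
// matching the APInt-based expressions in getImmFor.
int64_t sliceImm(int64_t Imm, SubRegIdx SubReg) {
  switch (SubReg) {
  case sub0:      return static_cast<uint32_t>(Imm);        // bits [31:0]
  case sub1:      return static_cast<uint32_t>(Imm >> 32);  // bits [63:32]
  case lo16:      return static_cast<int16_t>(Imm);         // bits [15:0]
  case hi16:      return static_cast<int16_t>(Imm >> 16);   // bits [31:16]
  case sub1_lo16: return static_cast<int16_t>(Imm >> 32);   // bits [47:32]
  case sub1_hi16: return static_cast<int16_t>(Imm >> 48);   // bits [63:48]
  default:        return Imm;                                // full 64-bit use
  }
}
```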
llvmbot (Member) commented Oct 16, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

This is a part of #67781. Until we select more 64-bit move immediates, the impact is minimal.


Full diff: https://github.com/llvm/llvm-project/pull/69260.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+38-15)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll (+3-5)
  • (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir (+226-1)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+2-4)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..c0af9a9ad4d5855 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3203,11 +3203,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   switch (DefMI.getOpcode()) {
   default:
     return false;
+  case AMDGPU::V_MOV_B64_e32:
   case AMDGPU::S_MOV_B64:
-    // TODO: We could fold 64-bit immediates, but this get complicated
-    // when there are sub-registers.
-    return false;
-
+  case AMDGPU::V_MOV_B64_PSEUDO:
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
   case AMDGPU::V_MOV_B32_e32:
   case AMDGPU::S_MOV_B32:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
@@ -3220,19 +3219,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   if (!ImmOp->isImm())
     return false;
 
+  auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
+    int64_t Imm = ImmOp->getImm();
+    switch (UseOp.getSubReg()) {
+      default:
+        return Imm;
+      case AMDGPU::sub0:
+        return Lo_32(Imm);
+      case AMDGPU::sub1:
+        return Hi_32(Imm);
+      case AMDGPU::lo16:
+        return APInt(16, Imm).getSExtValue();
+      case AMDGPU::hi16:
+        return APInt(32, Imm).ashr(16).getSExtValue();
+      case AMDGPU::sub1_lo16:
+        return APInt(16, Hi_32(Imm)).getSExtValue();
+      case AMDGPU::sub1_hi16:
+        return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
+    }
+  };
+
+  assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
+
   unsigned Opc = UseMI.getOpcode();
   if (Opc == AMDGPU::COPY) {
+    assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
+
     Register DstReg = UseMI.getOperand(0).getReg();
-    bool Is16Bit = getOpSize(UseMI, 0) == 2;
+    unsigned OpSize = getOpSize(UseMI, 0);
+    bool Is16Bit = OpSize == 2;
+    bool Is64Bit = OpSize == 8;
     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
-    APInt Imm(32, ImmOp->getImm());
-
-    if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
-      Imm = Imm.ashr(16);
+    unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
+                                           : AMDGPU::V_MOV_B32_e32
+                                 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
+                                           : AMDGPU::S_MOV_B32;
+    APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
 
     if (RI.isAGPR(*MRI, DstReg)) {
-      if (!isInlineConstant(Imm))
+      if (Is64Bit || !isInlineConstant(Imm))
         return false;
       NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
     }
@@ -3317,7 +3342,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
         return false;
 
-      const int64_t Imm = ImmOp->getImm();
+      const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
 
       // FIXME: This would be a lot easier if we could return a new instruction
       // instead of having to modify in place.
@@ -3401,8 +3426,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
         return false;
 
-      const int64_t Imm = ImmOp->getImm();
-
       // FIXME: This would be a lot easier if we could return a new instruction
       // instead of having to modify in place.
 
@@ -3413,7 +3436,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
 
       // ChangingToImmediate adds Src2 back to the instruction.
-      Src2->ChangeToImmediate(Imm);
+      Src2->ChangeToImmediate(getImmFor(*Src2));
 
       // These come before src2.
       removeModOperands(UseMI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2e4708205523bca..7276e5666af3ef6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -151,6 +151,7 @@ def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
   let SchedRW = [WriteSALU, Write64Bit];
   let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
   let Uses = [];
+  let UseNamedOperandTable = 1;
 }
 
 // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
index ef3c95b17598f38..741164bc045062e 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -28,12 +28,10 @@ declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
 ; Function Attrs: norecurse
 define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %stack, <4 x i32> %node, ptr %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, ptr addrspace(1) %arrayidx.i.i2202, ptr addrspace(1) %retval.0.i.i22089, ptr addrspace(1) %retval.1.i221310, i1 %cmp575, ptr addrspace(1) %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
 ; GCN-LABEL: {{^}}svm_node_closure_bsdf:
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR:v[0-9]+]], s30,
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
-; GCN: s_movk_i32 s30, 0x60
+; GCN-NOT: v_writelane_b32
+; GCN: s_movk_i32 s28, 0x60
 ; GCN-NOT: s31
-; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]],
-; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]],
+; GCN-NOT: v_readlane_b32
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN: s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index ee2e5c7a4ed7094..908fa4e84ae6a3a 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
 ; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], {{[sv]}}[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
 ; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; PACKED-SDAG:    v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], {{[sv]}}[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 099aaa449b1c9e4..204c1367d84d874 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s
 
 ---
 name:            fold_simm_virtual
@@ -120,3 +120,228 @@ body:             |
     SI_RETURN_TO_EPILOG $vgpr0_lo16
 
 ...
+
+---
+name:            fold_sreg_64_sub0_to_vgpr_32
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_sub0_to_vgpr_32
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %1:vgpr_32 = COPY killed %0.sub0
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name:            fold_sreg_64_sub1_to_vgpr_32
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_sub1_to_vgpr_32
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
+    %0:sreg_64 = S_MOV_B64 1311768467750121200
+    %1:vgpr_32 = COPY killed %0.sub1
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name:            fold_vreg_64_sub1_to_vgpr_32
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_vreg_64_sub1_to_vgpr_32
+    ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
+    %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    %1:vgpr_32 = COPY killed %0.sub1
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name:            fold_sreg_64_to_vreg_64
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_to_vreg_64
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B]]
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %1:vreg_64_align2 = COPY killed %0
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name:            fold_sreg_64_to_sreg_64
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_to_sreg_64
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
+    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+    %0:sreg_64 = S_MOV_B64 1311768467750121200
+    %1:sreg_64 = COPY killed %0
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name:            fold_sreg_64_lo16_to_sgpr_lo16
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_lo16_to_sgpr_lo16
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 1
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    $sgpr0_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+
+---
+name:            fold_sreg_64_hi16_to_sgpr_lo16
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_hi16_to_sgpr_lo16
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 2
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    $sgpr0_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+
+---
+name:            fold_sreg_64_sub1_lo16_to_sgpr_lo16
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_sub1_lo16_to_sgpr_lo16
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 3
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    $sgpr0_lo16 = COPY killed %0.sub1_lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+
+---
+name:            fold_sreg_64_sub1_hi16_to_sgpr_lo16
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_sreg_64_sub1_hi16_to_sgpr_lo16
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 4
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
+    $sgpr0_lo16 = COPY killed %0.sub1_hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+
+---
+name:            fmac_sreg_64_sub0_src0_to_fmamk
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fmac_sreg_64_sub0_src0_to_fmamk
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 2882399984, [[DEF1]], implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %3:vgpr_32 = V_FMAC_F32_e64 0, %2.sub0, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %3
+...
+
+---
+name:            fmac_sreg_64_sub1_src0_to_fmamk
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fmac_sreg_64_sub1_src0_to_fmamk
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 305419896, [[DEF1]], implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %3:vgpr_32 = V_FMAC_F32_e64 0, %2.sub1, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %3
+...
+
+---
+name:            fmac_sreg_64_sub1_src1_to_fmaak
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fmac_sreg_64_sub1_src1_to_fmaak
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 305419896, [[DEF1]], implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, %2.sub1, 0, %1, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %3
+...
+
+---
+name:            fma_sreg_64_sub0_to_fmaak
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fma_sreg_64_sub0_to_fmaak
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 2882399984, implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub0, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %3
+...
+
+---
+name:            fma_sreg_64_sub1_to_fmaak
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fma_sreg_64_sub1_to_fmaak
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 305419896, implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
+    %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub1, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a462c19ce645d4a..17b387be7925803 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -573,8 +573,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    v_mov_b32_e32 v3, s35
 ; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
 ; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
-; GFX900-NEXT:    s_movk_i32 s0, 0x5000
-; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, 0x5000, v1
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX900-NEXT:    s_movk_i32 s2, 0x7f
@@ -805,8 +804,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s0, 0x5000
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x5000, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    s_movk_i32 s2, 0x7f
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0

github-actions bot commented Oct 16, 2023

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff fa7d6a0fa5eafabde83f07065eb46eb144b715de a2bcd66c1316dd466429382596d97a347fc959fe -- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ad07550c763..2ec9eb3af973 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3250,10 +3250,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     bool Is16Bit = OpSize == 2;
     bool Is64Bit = OpSize == 8;
     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
-                                           : AMDGPU::V_MOV_B32_e32
-                                 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
-                                           : AMDGPU::S_MOV_B32;
+    unsigned NewOpc =
+        isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32
+        : Is64Bit  ? AMDGPU::S_MOV_B64_IMM_PSEUDO
+                   : AMDGPU::S_MOV_B32;
     APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
 
     if (RI.isAGPR(*MRI, DstReg)) {

I do not like its other suggestion and am going to ignore it.
@Sisyph (Contributor) left a comment:

LGTM, besides nit.

llvm/test/CodeGen/AMDGPU/packed-fp32.ll (outdated review thread, resolved)
rampitec merged commit a22a1fe into llvm:main on Oct 17, 2023
1 of 2 checks passed
rampitec deleted the fold-64-imm branch October 17, 2023 17:53
case AMDGPU::sub1_lo16:
return APInt(16, Hi_32(Imm)).getSExtValue();
case AMDGPU::sub1_hi16:
return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
Contributor:

Nit: seems over-engineered to use APInt. Simpler and faster to return (int16_t)(Imm >> 48).

Collaborator (Author):

It is not the same value though? The cast to int16_t will truncate it?

Contributor:

int16_t is signed so the implicit conversion to the return type will sign extend it.
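As a standalone check of that point (plain integer ops, no LLVM headers; the constant is the one used in the new MIR tests), the two forms agree for the sub1_hi16 case:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Imm = 0x123456789ABCDEF0ULL; // 1311768467750121200

  // What APInt(32, Hi_32(Imm)).ashr(16).getSExtValue() computes: take the
  // high 32 bits, arithmetic-shift right by 16, sign-extend to 64 bits.
  const int64_t ViaAPIntStyle =
      static_cast<int32_t>(static_cast<uint32_t>(Imm >> 32)) >> 16;

  // The suggested simpler form: truncate to int16_t; the implicit widening
  // back to 64 bits sign-extends.
  const int64_t ViaCast = static_cast<int16_t>(Imm >> 48);

  assert(ViaAPIntStyle == ViaCast); // 0x1234 here; the two also agree when
                                    // bit 63 is set and the slice is negative
  return 0;
}
```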

@@ -151,6 +151,7 @@ def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
let SchedRW = [WriteSALU, Write64Bit];
let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
let Uses = [];
let UseNamedOperandTable = 1;
Contributor:

What is this for?

Collaborator (Author):

This was missing, and we are querying the named operand src0 while folding.

Contributor:

Makes sense, thanks. I did not even know there was a flag for this. I thought it happened automatically.
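For context, a hedged sketch of why the flag matters (an assumed, simplified fragment; the real call sites are in SIInstrInfo.cpp and use the same getNamedOperandIdx API visible in the diff above): without a named-operand table for S_MOV_B64_IMM_PSEUDO, a by-name lookup of src0 on the defining move has no entry to find.

```cpp
// Assumed fragment: the folding code locates the immediate operand of the
// defining move by name. If UseNamedOperandTable is unset on the pseudo,
// getNamedOperandIdx() has no table entry for that opcode and returns -1,
// so the fold would bail out even though the operand exists.
int Src0Idx =
    AMDGPU::getNamedOperandIdx(DefMI.getOpcode(), AMDGPU::OpName::src0);
if (Src0Idx < 0)
  return false; // no named-operand table for this opcode
const MachineOperand &ImmOp = DefMI.getOperand(Src0Idx);
```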
