-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Fix postinc operands for Cortex-A510 scheduling #68518
Conversation
Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc loads/stores in the Cortex-A510 scheduling model. I will add the same for other models too; this one will be the most impactful because it is the default CPU scheduling model.
@llvm/pr-subscribers-backend-aarch64
Changes
Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc loads/stores in the Cortex-A510 scheduling model. I will add the same for other models too; this one will be the most impactful because it is the default CPU scheduling model.
Patch is 200.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68518.diff
9 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index fab2cda87807554..1afbc5d9102ca96 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -295,16 +295,16 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
// 2-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
@@ -312,10 +312,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
// 3-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
@@ -323,10 +323,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
// 4-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs.
@@ -334,10 +334,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs.
def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
//---
// Vector Stores
@@ -347,28 +347,28 @@ def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d
def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
//---
// Floating Point Conversions, MAC, DIV, SQRT
diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
index cbda7b027587d9c..07fbe5d7310f60f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
@@ -47,10 +47,10 @@ define void @f_undef_1(<8 x i64> %a, ptr %dst) {
; CHECK-LABEL: f_undef_1:
; CHECK: // %bb.0: // %BB
; CHECK-NEXT: mov v16.16b, v0.16b
-; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: mov v5.16b, v2.16b
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q1_q2
; CHECK-NEXT: // kill: def $q3 killed $q3 def $q3_q4
+; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: mov v4.16b, v3.16b
; CHECK-NEXT: mov v17.16b, v16.16b
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 6657b19d24929d8..7d73e1c6c1d7f41 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14320,8 +14320,8 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) {
; CHECK: ; %bb.0:
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.b { v0 }[1], [x0], #1
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1lane:
@@ -14345,8 +14345,8 @@ define <8 x i8> @test_v8i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <8 x i
; CHECK: ; %bb.0:
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.b { v0 }[1], [x0], x2
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v8i8_post_reg_ld1lane:
@@ -14413,8 +14413,8 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A)
; CHECK: ; %bb.0:
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.h { v0 }[1], [x0], #2
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v4i16_post_imm_ld1lane:
@@ -14439,8 +14439,8 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4 x
; CHECK-NEXT: lsl x8, x2, #1
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v4i16_post_reg_ld1lane:
@@ -14507,8 +14507,8 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A)
; CHECK: ; %bb.0:
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v2i32_post_imm_ld1lane:
@@ -14533,8 +14533,8 @@ define <2 x i32> @test_v2i32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2 x
; CHECK-NEXT: lsl x8, x2, #2
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v2i32_post_reg_ld1lane:
@@ -14644,8 +14644,8 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float>
; CHECK: ; %bb.0:
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v2f32_post_imm_ld1lane:
@@ -14670,8 +14670,8 @@ define <2 x float> @test_v2f32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2
; CHECK-NEXT: lsl x8, x2, #2
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
;
; CHECK-GISEL-LABEL: test_v2f32_post_reg_ld1lane:
@@ -14776,9 +14776,9 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(ptr %bar, ptr %ptr,
; CHECK-NEXT: lsl x8, x2, #1
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
; CHECK-NEXT: ldr d1, [x3]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: cnt.8b v1, v1
; CHECK-NEXT: uaddlp.4h v1, v1
; CHECK-NEXT: uaddlp.2s v1, v1
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 99f573795489a08..849fc7aa00a8e7e 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -365,15 +365,15 @@ define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ldp s3, s2, [x2]
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: ushll v3.8h, v3.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v4.4s, v1.8h, #3
; CHECK-NEXT: ushll v1.4s, v1.4h, #3
+; CHECK-NEXT: ushll v2.4s, v2.4h, #3
; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h
; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v0.8h
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
@@ -407,10 +407,10 @@ define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
-; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b
; CHECK-NEXT: ret
@@ -444,10 +444,10 @@ define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
-; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: shll v3.4s, v2.4h, #16
@@ -489,18 +489,18 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
-; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
-; CHECK-NEXT: ldp s4, s5, [x4]
+; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: usubl v1.8h, v0.8b, v1.8b
+; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
-; CHECK-NEXT: ld1 { v4.s }[1], [x5], #4
-; CHECK-NEXT: ldp s6, s7, [x6]
+; CHECK-NEXT: ldp s4, s5, [x4]
; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: ld1 { v4.s }[1], [x5], #4
; CHECK-NEXT: ld1 { v5.s }[1], [x5]
-; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4
+; CHECK-NEXT: ldp s6, s7, [x6]
; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b
+; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4
; CHECK-NEXT: ld1 { v7.s }[1], [x7]
; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b
; CHECK-NEXT: shll v0.4s, v4.4h, #16
@@ -647,7 +647,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: add x8, x3, #8
-; CHECK-NEXT: add x11, x1, #12
+; CHECK-NEXT: add x11, x3, #12
; CHECK-NEXT: str s1, [x4]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ldp s0, s5, [x2]
@@ -664,16 +664,16 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x9, x1, #4
; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b
; CHECK-NEXT: mov v0.b[11], w10
-; CHECK-NEXT: add x10, x3, #12
+; CHECK-NEXT: add x10, x1, #12
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
; CHECK-NEXT: ldr s4, [x0, #12]
; CHECK-NEXT: ldp s3, s16, [x0, #4]
-; CHECK-NEXT: ldp s6, s7, [x2, #8]
-; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
+; CHECK-NEXT: ldp s6, s7, [x2, #8]
+; CHECK-NEXT: ld1 { v4.s }[1], [x10]
; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v6.s }[1], [x8]
-; CHECK-NEXT: ld1 { v7.s }[1], [x10]
+; CHECK-NEXT: ld1 { v7.s }[1], [x11]
; CHECK-NEXT: add x8, x1, #8
; CHECK-NEXT: ld1 { v16.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b
@@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_shuffle:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp s2, s7, [x0, #8]
-; CHECK-NEXT: add x8, x3, #8
-; CHECK-NEXT: ldr s18, [x1, #12]
-; CHECK-NEXT: ldp s0, s1, [x2]
-; CHECK-NEXT: ldp s3, s16, [x0]
-; CHECK-NEXT: add x9, x1, #8
-; CHECK-NEXT: mov v4.16b, v7.16b
-; CHECK-NEXT: ldp s6, s17, [x2, #8]
+; CHECK-NEXT: ldp s0, s1, [x0, #8]
+; CHECK-NEXT: add x8, x1, #8
+; CHECK-NEXT: ldr s6, [x1, #12]
+; CHECK-NEXT: ldp s17, s18, [x2, #8]
+; CHECK-NEXT: ldp s2, s3, [x2]
+; CHECK-NEXT: add x9, x3, #8
+; CHECK-NEXT: mov v4.16b, v1.16b
+; CHECK-NEXT: ldp s7, s16, [x0]
; CHECK-NEXT: ldr s5, [x3, #12]
-; CHECK-NEXT: mov v7.s[1], v18.s[0]
-; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT: mov v4.s[1], v18.s[0]
-; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4
-; CHECK-NEXT: ld1 { v2.s }[1], [x9]
-; CHECK-NEXT: ld1 { v6.s }[1], [x8]
-; CHECK-NEXT: ld1 { v1.s }[1], [x3]
+; CHECK-NEXT: mov v1.s[1], v6.s[0]
+; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT: mov v4.s[1], v6.s[0]
+; CHECK-NEXT: ld1 { v7.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v16.s }[1], [x1]
-; CHECK-NEXT: mov v4.s[2], v17.s[0]
-; CHECK-NEXT: mov v17.s[1], v5.s[0]
-; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b
-; CHECK-NEXT: uaddl v7.8h, v16.8b, v7.8b
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v17.8b
+; CHECK-NEXT: ld1 { v3.s }[1], [x3]
+; CHECK-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-NEXT: ld1 { v17.s }[1], [x9]
+; CHECK-NEXT: ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for the changes. Looks good
Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc loads/stores in the Cortex-A510 scheduling model.
I will add the same for other models too; this one will be the most impactful because it is the default CPU scheduling model.