From fa4c588e340c5939a3a1a11012ef0b51d26a22d0 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 16 Jan 2024 22:48:49 +0100 Subject: [PATCH] Fix: do not use .cfi_negate_ra_state within the assembly on Arm64 Compiling openzfs on aarch64 with gcc-8 and gcc-9 is failing currently. See issue #14965 for deeper context. I have re-generated the assembly without the "-mbranch-protection=standard" compiler option. I have tested this on Arm64 FreeBSD 13.2 and AlmaLinux-8. Signed-off-by: Tino Reichardt Closes: #14965 --- .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 3569 +++++++-------- .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 3831 ++++++++--------- 2 files changed, 3659 insertions(+), 3741 deletions(-) diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index dc2719d142db..3a923e6a76a1 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -22,7 +22,7 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale - * Copyright (c) 2022-2023 Tino Reichardt + * Copyright (c) 2022-2024 Tino Reichardt * * This is converted assembly: SSE2 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde @@ -32,30 +32,17 @@ */ #if defined(__aarch64__) - .text - .section .note.gnu.property,"a",@note - .p2align 3 - .word 4 - .word 16 - .word 5 - .asciz "GNU" - .word 3221225472 - .word 4 - .word 3 - .word 0 -.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse2 .p2align 2 .type zfs_blake3_compress_in_place_sse2,@function zfs_blake3_compress_in_place_sse2: .cfi_startproc - hint #25 - .cfi_negate_ra_state sub sp, sp, #96 + .cfi_def_cfa_offset 96 stp x29, x30, [sp, #64] - add x29, sp, #64 str x19, [sp, #80] + add x29, sp, #64 .cfi_def_cfa w29, 32 .cfi_offset w19, -16 .cfi_offset w30, -24 @@ -72,18 +59,22 @@ zfs_blake3_compress_in_place_sse2: ldp q2, q3, [sp, #32] eor v0.16b, 
v2.16b, v0.16b eor v1.16b, v3.16b, v1.16b - ldp x29, x30, [sp, #64] stp q0, q1, [x19] + .cfi_def_cfa wsp, 96 + ldp x29, x30, [sp, #64] ldr x19, [sp, #80] add sp, sp, #96 - hint #29 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w30 + .cfi_restore w29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 + .p2align 4, 0x0 .LCPI1_0: .xword -4942790177982912921 .xword -6534734903820487822 @@ -92,458 +83,583 @@ zfs_blake3_compress_in_place_sse2: .type compress_pre,@function compress_pre: .cfi_startproc - hint #34 - fmov s1, w3 - movi d0, #0x0000ff000000ff - ldr q2, [x1] - fmov d3, x4 + fmov s0, w3 + movi d1, #0x0000ff000000ff + ldr q4, [x1] + fmov d2, x4 adrp x8, .LCPI1_0 - mov v1.s[1], w5 - str q2, [x0] - ldr q4, [x8, :lo12:.LCPI1_0] + mov v0.s[1], w5 + str q4, [x0] + ldr q6, [x8, :lo12:.LCPI1_0] add x8, x2, #32 - ldr q5, [x1, #16] - and v0.8b, v1.8b, v0.8b - stp q5, q4, [x0, #16] - mov v3.d[1], v0.d[0] - str q3, [x0, #48] - ldp q0, q6, [x2] - uzp1 v1.4s, v0.4s, v6.4s - uzp2 v0.4s, v0.4s, v6.4s - add v2.4s, v2.4s, v1.4s - uzp1 v18.4s, v1.4s, v1.4s - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v0.4s - rev32 v3.8h, v3.8h - add v4.4s, v3.4s, v4.4s - eor v5.16b, v4.16b, v5.16b - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - ushr v6.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v6.16b - ld2 { v6.4s, v7.4s }, [x8] - add v4.4s, v3.4s, v4.4s - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v2.4s, v6.4s - eor v5.16b, v4.16b, v5.16b - ext v4.16b, v4.16b, v4.16b, #4 - ext v6.16b, v6.16b, v6.16b, #12 - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v16.16b - ext v16.16b, v7.16b, v7.16b, #12 - add v2.4s, v2.4s, v5.4s - mov v7.16b, v16.16b - eor v3.16b, v3.16b, v2.16b - add 
v2.4s, v2.4s, v16.4s - mov v7.s[1], v6.s[2] - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v5.16b, v4.16b, v5.16b - ushr v17.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v17.16b - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - ushr v17.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v17.16b - ext v17.16b, v18.16b, v1.16b, #8 - add v4.4s, v3.4s, v4.4s - uzp2 v17.4s, v17.4s, v0.4s - ext v3.16b, v3.16b, v3.16b, #8 - eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v17.4s - ext v4.16b, v4.16b, v4.16b, #12 - ushr v18.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v5.16b, v5.16b, v18.16b - ext v18.16b, v1.16b, v1.16b, #12 - add v2.4s, v2.4s, v5.4s - ext v1.16b, v1.16b, v18.16b, #12 - zip1 v18.2d, v16.2d, v0.2d - zip2 v0.4s, v0.4s, v16.4s - eor v3.16b, v3.16b, v2.16b - rev64 v1.4s, v1.4s - mov v18.s[3], v6.s[3] - zip1 v16.4s, v0.4s, v6.4s - rev32 v3.8h, v3.8h - trn2 v1.4s, v1.4s, v7.4s - zip1 v0.4s, v6.4s, v0.4s - add v4.4s, v4.4s, v3.4s - add v2.4s, v2.4s, v1.4s - ext v6.16b, v0.16b, v16.16b, #8 - eor v5.16b, v4.16b, v5.16b - ushr v7.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v7.16b - add v7.4s, v2.4s, v5.4s - eor v2.16b, v7.16b, v3.16b - ext v7.16b, v7.16b, v7.16b, #12 - ushr v3.4s, v2.4s, #8 - shl v2.4s, v2.4s, #24 - orr v3.16b, v2.16b, v3.16b - ext v2.16b, v18.16b, v18.16b, #12 - add v4.4s, v3.4s, v4.4s - uzp1 v2.4s, v18.4s, v2.4s - ext v3.16b, v3.16b, v3.16b, #8 - eor v5.16b, v4.16b, v5.16b - add v7.4s, v7.4s, v2.4s - ext v4.16b, v4.16b, v4.16b, #4 - ushr v18.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v18.16b - add v7.4s, v7.4s, v5.4s - eor v3.16b, v3.16b, v7.16b - add v7.4s, v7.4s, v6.4s - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v5.16b, v4.16b, v5.16b - ushr v0.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v0.16b, v5.16b, v0.16b - add v5.4s, v7.4s, v0.4s - ext v7.16b, v17.16b, v17.16b, #4 - eor v3.16b, v5.16b, v3.16b - uzp1 v17.4s, v7.4s, v7.4s - ushr 
v16.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b - ext v16.16b, v17.16b, v7.16b, #8 - add v4.4s, v3.4s, v4.4s - uzp2 v16.4s, v16.4s, v1.4s - ext v3.16b, v3.16b, v3.16b, #8 - eor v0.16b, v4.16b, v0.16b - add v5.4s, v5.4s, v16.4s - ext v4.16b, v4.16b, v4.16b, #12 - ushr v17.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v0.16b, v0.16b, v17.16b - ext v17.16b, v7.16b, v7.16b, #12 - add v5.4s, v5.4s, v0.4s - ext v7.16b, v7.16b, v17.16b, #12 - mov v17.16b, v6.16b - eor v3.16b, v3.16b, v5.16b - rev64 v7.4s, v7.4s - mov v17.s[1], v2.s[2] - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v18.16b, v4.16b, v0.16b - trn2 v0.4s, v7.4s, v17.4s - ushr v7.4s, v18.4s, #12 - shl v17.4s, v18.4s, #20 - add v5.4s, v5.4s, v0.4s - zip1 v18.2d, v6.2d, v1.2d - zip2 v1.4s, v1.4s, v6.4s - orr v7.16b, v17.16b, v7.16b - mov v18.s[3], v2.s[3] - zip1 v6.4s, v1.4s, v2.4s - add v5.4s, v5.4s, v7.4s - zip1 v1.4s, v2.4s, v1.4s - eor v3.16b, v5.16b, v3.16b - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v1.16b, v6.16b, #8 - ushr v17.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v17.16b, v3.16b, v17.16b - ext v3.16b, v18.16b, v18.16b, #12 - add v4.4s, v17.4s, v4.4s - uzp1 v3.4s, v18.4s, v3.4s - ext v17.16b, v17.16b, v17.16b, #8 - eor v7.16b, v4.16b, v7.16b - add v5.4s, v5.4s, v3.4s - ext v4.16b, v4.16b, v4.16b, #4 - ushr v18.4s, v7.4s, #7 - shl v7.4s, v7.4s, #25 - orr v7.16b, v7.16b, v18.16b - add v5.4s, v5.4s, v7.4s - eor v17.16b, v17.16b, v5.16b - add v5.4s, v5.4s, v6.4s - rev32 v17.8h, v17.8h - add v4.4s, v4.4s, v17.4s - eor v2.16b, v4.16b, v7.16b - ext v7.16b, v16.16b, v16.16b, #4 - ushr v1.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v1.16b, v2.16b, v1.16b - add v2.4s, v5.4s, v1.4s - eor v5.16b, v2.16b, v17.16b - uzp1 v17.4s, v7.4s, v7.4s - ushr v16.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - orr v5.16b, v5.16b, v16.16b - ext v16.16b, v17.16b, v7.16b, #8 - add v4.4s, v5.4s, v4.4s - uzp2 v16.4s, v16.4s, v0.4s - ext v5.16b, v5.16b, v5.16b, #8 - eor 
v1.16b, v4.16b, v1.16b - add v2.4s, v2.4s, v16.4s - ext v4.16b, v4.16b, v4.16b, #12 - ushr v17.4s, v1.4s, #7 - shl v1.4s, v1.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v1.16b, v1.16b, v17.16b - ext v17.16b, v7.16b, v7.16b, #12 - add v2.4s, v2.4s, v1.4s - ext v7.16b, v7.16b, v17.16b, #12 - mov v17.16b, v6.16b - eor v5.16b, v5.16b, v2.16b - rev64 v7.4s, v7.4s - mov v17.s[1], v3.s[2] - rev32 v5.8h, v5.8h - add v4.4s, v4.4s, v5.4s - eor v18.16b, v4.16b, v1.16b - trn2 v1.4s, v7.4s, v17.4s - ushr v7.4s, v18.4s, #12 - shl v17.4s, v18.4s, #20 - add v2.4s, v2.4s, v1.4s - zip1 v18.2d, v6.2d, v0.2d - zip2 v0.4s, v0.4s, v6.4s - orr v7.16b, v17.16b, v7.16b - mov v18.s[3], v3.s[3] - add v2.4s, v2.4s, v7.4s - eor v5.16b, v2.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v17.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - orr v5.16b, v5.16b, v17.16b - add v17.4s, v5.4s, v4.4s - ext v4.16b, v18.16b, v18.16b, #12 - ext v5.16b, v5.16b, v5.16b, #8 - eor v7.16b, v17.16b, v7.16b - uzp1 v4.4s, v18.4s, v4.4s - ext v17.16b, v17.16b, v17.16b, #4 - ushr v18.4s, v7.4s, #7 - shl v7.4s, v7.4s, #25 - add v2.4s, v2.4s, v4.4s - orr v7.16b, v7.16b, v18.16b - add v2.4s, v2.4s, v7.4s - eor v5.16b, v5.16b, v2.16b - rev32 v5.8h, v5.8h - add v6.4s, v17.4s, v5.4s - zip1 v17.4s, v0.4s, v3.4s - zip1 v0.4s, v3.4s, v0.4s - eor v3.16b, v6.16b, v7.16b - ext v0.16b, v0.16b, v17.16b, #8 - ushr v7.4s, v3.4s, #12 + ldr q3, [x1, #16] + and v0.8b, v0.8b, v1.8b + mov v2.d[1], v0.d[0] + stp q3, q6, [x0, #16] + add v19.4s, v4.4s, v3.4s + str q2, [x0, #48] + ld2 { v0.4s, v1.4s }, [x8] + ldp q7, q5, [x2] + dup v4.4s, v1.s[3] + dup v18.4s, v0.s[3] + uzp1 v16.4s, v7.4s, v0.4s + dup v17.4s, v7.s[1] + ext v23.16b, v4.16b, v4.16b, #4 + mov v17.s[1], v7.s[3] + ext v21.16b, v18.16b, v18.16b, #4 + zip1 v20.4s, v16.4s, v5.4s + dup v7.4s, v16.s[1] + mov v16.s[0], v5.s[0] + uzp2 v22.4s, v17.4s, v5.4s + uzp1 v5.4s, v20.4s, v5.4s + ext v20.16b, v23.16b, v1.16b, #12 + ext v21.16b, v21.16b, v0.16b, #12 + mov v22.d[0], v17.d[0] + add 
v19.4s, v19.4s, v5.4s + dup v23.4s, v5.s[3] + eor v24.16b, v19.16b, v2.16b + uzp1 v5.4s, v23.4s, v5.4s + rev32 v23.8h, v24.8h + uzp2 v2.4s, v16.4s, v22.4s + add v25.4s, v19.4s, v22.4s + zip2 v22.4s, v22.4s, v20.4s + add v6.4s, v23.4s, v6.4s + mov v2.d[0], v16.d[0] + eor v3.16b, v6.16b, v3.16b + ushr v16.4s, v3.4s, #12 shl v3.4s, v3.4s, #20 - add v2.4s, v2.4s, v0.4s - orr v3.16b, v3.16b, v7.16b - ext v7.16b, v16.16b, v16.16b, #4 - add v2.4s, v2.4s, v3.4s - uzp1 v17.4s, v7.4s, v7.4s - eor v5.16b, v2.16b, v5.16b - ushr v16.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - orr v5.16b, v5.16b, v16.16b - ext v16.16b, v17.16b, v7.16b, #8 - add v6.4s, v5.4s, v6.4s - uzp2 v16.4s, v16.4s, v1.4s - ext v5.16b, v5.16b, v5.16b, #8 + dup v19.4s, v21.s[2] + zip1 v18.4s, v18.4s, v22.4s + orr v3.16b, v3.16b, v16.16b + add v16.4s, v25.4s, v3.4s + zip1 v17.2d, v20.2d, v17.2d + eor v22.16b, v16.16b, v23.16b + dup v23.4s, v16.s[3] + ushr v25.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v23.16b, v23.16b, v23.16b, #4 + orr v22.16b, v22.16b, v25.16b + add v6.4s, v22.4s, v6.4s + ext v25.16b, v22.16b, v22.16b, #8 + ext v16.16b, v23.16b, v16.16b, #12 eor v3.16b, v6.16b, v3.16b - add v2.4s, v2.4s, v16.4s - ext v6.16b, v6.16b, v6.16b, #12 - ushr v17.4s, v3.4s, #7 + dup v22.4s, v22.s[2] + ushr v27.4s, v3.4s, #7 shl v3.4s, v3.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v3.16b, v3.16b, v17.16b - add v17.4s, v2.4s, v3.4s - eor v2.16b, v5.16b, v17.16b - ext v5.16b, v7.16b, v7.16b, #12 - rev32 v18.8h, v2.8h - ext v2.16b, v7.16b, v5.16b, #12 - mov v5.16b, v0.16b - add v6.4s, v6.4s, v18.4s - rev64 v2.4s, v2.4s - mov v5.s[1], v4.s[2] - eor v3.16b, v6.16b, v3.16b - trn2 v2.4s, v2.4s, v5.4s - ushr v5.4s, v3.4s, #12 + ext v23.16b, v6.16b, v6.16b, #4 + dup v6.4s, v6.s[1] + mov v25.s[0], v22.s[0] + add v16.4s, v16.4s, v21.4s + orr v3.16b, v3.16b, v27.16b + add v22.4s, v16.4s, v3.4s + mov v23.s[0], v6.s[0] + eor v6.16b, v25.16b, v22.16b + rev32 v6.8h, v6.8h + mov v19.s[3], v20.s[3] + add v20.4s, v22.4s, v20.4s + 
mov v26.16b, v17.16b + add v23.4s, v23.4s, v6.4s + mov v26.s[3], v21.s[3] + eor v3.16b, v23.16b, v3.16b + ushr v27.4s, v3.4s, #12 shl v3.4s, v3.4s, #20 - add v7.4s, v17.4s, v2.4s - orr v3.16b, v3.16b, v5.16b - add v5.4s, v7.4s, v3.4s - eor v7.16b, v5.16b, v18.16b - zip1 v18.2d, v0.2d, v1.2d - ext v5.16b, v5.16b, v5.16b, #12 - zip2 v0.4s, v1.4s, v0.4s + ext v21.16b, v2.16b, v2.16b, #4 + orr v3.16b, v3.16b, v27.16b + add v20.4s, v20.4s, v3.4s + rev64 v24.4s, v5.4s + eor v6.16b, v20.16b, v6.16b + ext v22.16b, v20.16b, v20.16b, #4 + ushr v27.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + dup v20.4s, v20.s[1] + mov v25.16b, v21.16b + orr v6.16b, v6.16b, v27.16b + mov v25.s[0], v7.s[0] + add v23.4s, v6.4s, v23.4s + mov v22.s[0], v20.s[0] + ext v20.16b, v6.16b, v6.16b, #8 + eor v3.16b, v23.16b, v3.16b + dup v6.4s, v6.s[2] + dup v28.4s, v23.s[3] + ushr v29.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + dup v27.4s, v25.s[2] + mov v20.s[0], v6.s[0] + ext v6.16b, v28.16b, v28.16b, #4 + add v22.4s, v22.4s, v25.4s + orr v28.16b, v3.16b, v29.16b + add v22.4s, v22.4s, v28.4s + ext v3.16b, v6.16b, v23.16b, #12 + eor v6.16b, v20.16b, v22.16b + rev32 v6.8h, v6.8h + mov v27.s[1], v21.s[1] + trn2 v16.4s, v24.4s, v19.4s + ext v24.16b, v26.16b, v26.16b, #12 + add v23.4s, v3.4s, v6.4s + uzp2 v3.4s, v27.4s, v19.4s + eor v19.16b, v23.16b, v28.16b + uzp1 v21.4s, v26.4s, v24.4s + ushr v24.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + add v22.4s, v22.4s, v16.4s + orr v19.16b, v19.16b, v24.16b + add v22.4s, v22.4s, v19.4s + mov v3.d[0], v27.d[0] + dup v24.4s, v22.s[3] + eor v6.16b, v22.16b, v6.16b + ext v24.16b, v24.16b, v24.16b, #4 + ushr v26.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + ext v20.16b, v18.16b, v0.16b, #4 + orr v6.16b, v6.16b, v26.16b + add v23.4s, v6.4s, v23.4s + ext v26.16b, v6.16b, v6.16b, #8 + ext v22.16b, v24.16b, v22.16b, #12 + eor v19.16b, v23.16b, v19.16b + dup v6.4s, v6.s[2] + ushr v27.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + mov v26.s[0], v6.s[0] + add v6.4s, v22.4s, v21.4s + ext 
v24.16b, v23.16b, v23.16b, #4 + orr v19.16b, v19.16b, v27.16b + add v22.4s, v6.4s, v19.4s + dup v23.4s, v23.s[1] + eor v6.16b, v26.16b, v22.16b + mov v24.s[0], v23.s[0] + rev32 v23.8h, v6.8h + ext v6.16b, v20.16b, v18.16b, #8 + ext v20.16b, v3.16b, v3.16b, #4 + mov v2.s[2], v7.s[0] + add v18.4s, v24.4s, v23.4s + rev64 v24.4s, v6.4s + eor v26.16b, v18.16b, v19.16b + dup v6.4s, v25.s[1] + ushr v25.4s, v26.4s, #12 + shl v26.4s, v26.4s, #20 + add v22.4s, v22.4s, v24.4s + dup v7.4s, v21.s[2] + orr v25.16b, v26.16b, v25.16b + add v22.4s, v22.4s, v25.4s + mov v19.16b, v20.16b + eor v23.16b, v22.16b, v23.16b + ext v26.16b, v22.16b, v22.16b, #4 + ushr v27.4s, v23.4s, #8 + shl v23.4s, v23.4s, #24 + dup v22.4s, v22.s[1] + mov v19.s[0], v6.s[0] + orr v23.16b, v23.16b, v27.16b + add v18.4s, v23.4s, v18.4s + mov v26.s[0], v22.s[0] + ext v22.16b, v23.16b, v23.16b, #8 + eor v25.16b, v18.16b, v25.16b + dup v23.4s, v23.s[2] + ushr v27.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + dup v28.4s, v18.s[3] + mov v22.s[0], v23.s[0] + add v26.4s, v26.4s, v19.4s + ext v23.16b, v28.16b, v28.16b, #4 + orr v25.16b, v25.16b, v27.16b + add v26.4s, v26.4s, v25.4s + mov v7.s[3], v24.s[3] + eor v22.16b, v22.16b, v26.16b + ext v18.16b, v23.16b, v18.16b, #12 + rev32 v22.8h, v22.8h + rev64 v23.4s, v2.4s + zip1 v16.2d, v24.2d, v16.2d + zip2 v24.4s, v5.4s, v24.4s + add v27.4s, v18.4s, v22.4s + trn2 v18.4s, v23.4s, v7.4s + eor v23.16b, v27.16b, v25.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + dup v5.4s, v17.s[2] + add v26.4s, v26.4s, v18.4s + orr v23.16b, v23.16b, v25.16b + add v25.4s, v26.4s, v23.4s + zip1 v4.4s, v4.4s, v24.4s + eor v22.16b, v25.16b, v22.16b + ushr v26.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + dup v29.4s, v25.s[3] + ext v24.16b, v4.16b, v5.16b, #4 + orr v22.16b, v22.16b, v26.16b + add v26.4s, v22.4s, v27.4s + mov v27.16b, v16.16b + mov v27.s[3], v21.s[3] + eor v23.16b, v26.16b, v23.16b + ushr v28.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + dup v17.4s, v26.s[1] + ext v4.16b, 
v24.16b, v4.16b, #8 + orr v21.16b, v23.16b, v28.16b + ext v23.16b, v29.16b, v29.16b, #4 + ext v28.16b, v22.16b, v22.16b, #8 + ext v29.16b, v27.16b, v27.16b, #12 + dup v22.4s, v22.s[2] + ext v23.16b, v23.16b, v25.16b, #12 + mov v28.s[0], v22.s[0] + uzp1 v22.4s, v27.4s, v29.4s + ext v25.16b, v26.16b, v26.16b, #4 + dup v26.4s, v19.s[2] + dup v19.4s, v19.s[1] + add v23.4s, v23.4s, v22.4s + mov v25.s[0], v17.s[0] + add v23.4s, v23.4s, v21.4s + mov v26.s[1], v20.s[1] + eor v17.16b, v28.16b, v23.16b + rev32 v17.8h, v17.8h + mov v3.s[2], v6.s[0] + dup v6.4s, v22.s[2] + add v24.4s, v25.4s, v17.4s + rev64 v25.4s, v4.4s + eor v21.16b, v24.16b, v21.16b + ushr v4.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + add v20.4s, v23.4s, v25.4s + orr v21.16b, v21.16b, v4.16b + uzp2 v4.4s, v26.4s, v7.4s + add v23.4s, v20.4s, v21.4s + mov v6.s[3], v25.s[3] + eor v7.16b, v23.16b, v17.16b + mov v4.d[0], v26.d[0] ushr v17.4s, v7.4s, #8 shl v7.4s, v7.4s, #24 - mov v18.s[3], v4.s[3] + ext v20.16b, v4.16b, v4.16b, #4 orr v7.16b, v7.16b, v17.16b - ext v17.16b, v18.16b, v18.16b, #12 - add v6.4s, v7.4s, v6.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v19.16b, v6.16b, v3.16b - uzp1 v3.4s, v18.4s, v17.4s - ext v6.16b, v6.16b, v6.16b, #4 - ushr v17.4s, v19.4s, #7 - shl v18.4s, v19.4s, #25 - add v5.4s, v5.4s, v3.4s - orr v17.16b, v18.16b, v17.16b - add v5.4s, v5.4s, v17.4s - eor v7.16b, v7.16b, v5.16b - rev32 v7.8h, v7.8h - add v1.4s, v6.4s, v7.4s - zip1 v6.4s, v0.4s, v4.4s - zip1 v0.4s, v4.4s, v0.4s - eor v4.16b, v1.16b, v17.16b - ext v6.16b, v0.16b, v6.16b, #8 - ushr v0.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v5.4s, v5.4s, v6.4s - zip1 v20.2d, v6.2d, v2.2d - orr v0.16b, v4.16b, v0.16b - mov v20.s[3], v3.s[3] - add v4.4s, v5.4s, v0.4s - eor v5.16b, v4.16b, v7.16b - ext v7.16b, v16.16b, v16.16b, #4 - ushr v16.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - uzp1 v17.4s, v7.4s, v7.4s - orr v5.16b, v5.16b, v16.16b - ext v16.16b, v17.16b, v7.16b, #8 - add v1.4s, v5.4s, v1.4s - uzp2 v16.4s, v16.4s, v2.4s - zip2 
v2.4s, v2.4s, v6.4s - eor v0.16b, v1.16b, v0.16b - add v4.4s, v4.4s, v16.4s - ext v1.16b, v1.16b, v1.16b, #12 - ext v16.16b, v16.16b, v16.16b, #4 - ushr v17.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v17.16b, v0.16b, v17.16b - ext v0.16b, v5.16b, v5.16b, #8 - ext v5.16b, v7.16b, v7.16b, #12 - add v4.4s, v4.4s, v17.4s - eor v0.16b, v0.16b, v4.16b - rev32 v18.8h, v0.8h - ext v0.16b, v7.16b, v5.16b, #12 - mov v5.16b, v6.16b - add v7.4s, v1.4s, v18.4s + ext v17.16b, v23.16b, v23.16b, #4 + add v24.4s, v7.4s, v24.4s + dup v23.4s, v23.s[1] + eor v26.16b, v24.16b, v21.16b + dup v28.4s, v24.s[3] + mov v21.16b, v20.16b + mov v17.s[0], v23.s[0] + ext v23.16b, v7.16b, v7.16b, #8 + mov v21.s[0], v19.s[0] + dup v7.4s, v7.s[2] + ushr v27.4s, v26.4s, #7 + shl v26.4s, v26.4s, #25 + mov v23.s[0], v7.s[0] + add v17.4s, v17.4s, v21.4s + ext v7.16b, v28.16b, v28.16b, #4 + orr v26.16b, v26.16b, v27.16b + add v17.4s, v17.4s, v26.4s + mov v4.s[2], v19.s[0] + eor v23.16b, v23.16b, v17.16b + ext v7.16b, v7.16b, v24.16b, #12 + rev32 v23.8h, v23.8h + rev64 v24.4s, v3.4s + add v27.4s, v7.4s, v23.4s + trn2 v7.4s, v24.4s, v6.4s + eor v24.16b, v27.16b, v26.16b + ushr v26.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + add v17.4s, v17.4s, v7.4s + orr v24.16b, v24.16b, v26.16b + add v26.4s, v17.4s, v24.4s + eor v17.16b, v26.16b, v23.16b + dup v29.4s, v26.s[3] + ushr v23.4s, v17.4s, #8 + shl v28.4s, v17.4s, #24 + zip1 v17.2d, v25.2d, v18.2d + orr v18.16b, v28.16b, v23.16b + add v23.4s, v18.4s, v27.4s + zip2 v25.4s, v2.4s, v25.4s + mov v27.16b, v17.16b + mov v27.s[3], v22.s[3] + eor v24.16b, v23.16b, v24.16b + ushr v28.4s, v24.4s, #7 + shl v24.4s, v24.4s, #25 + dup v2.4s, v16.s[2] + dup v16.4s, v23.s[1] + orr v22.16b, v24.16b, v28.16b + ext v24.16b, v29.16b, v29.16b, #4 + ext v28.16b, v18.16b, v18.16b, #8 + ext v29.16b, v27.16b, v27.16b, #12 + dup v18.4s, v18.s[2] + ext v24.16b, v24.16b, v26.16b, #12 + mov v28.s[0], v18.s[0] + uzp1 v18.4s, v27.4s, v29.4s + ext v23.16b, 
v23.16b, v23.16b, #4 + zip1 v0.4s, v0.4s, v25.4s + add v24.4s, v24.4s, v18.4s + dup v19.4s, v18.s[2] + mov v23.s[0], v16.s[0] + add v24.4s, v24.4s, v22.4s + ext v16.16b, v0.16b, v2.16b, #4 + eor v1.16b, v28.16b, v24.16b + rev32 v25.8h, v1.8h + ext v0.16b, v16.16b, v0.16b, #8 + add v16.4s, v23.4s, v25.4s + dup v23.4s, v21.s[2] + mov v23.s[1], v20.s[1] + eor v22.16b, v16.16b, v22.16b rev64 v1.4s, v0.4s - mov v5.s[1], v3.s[2] - eor v17.16b, v7.16b, v17.16b - trn2 v1.4s, v1.4s, v5.4s - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - add v4.4s, v4.4s, v1.4s - orr v17.16b, v17.16b, v19.16b - add v19.4s, v4.4s, v17.4s - eor v4.16b, v19.16b, v18.16b - ext v19.16b, v19.16b, v19.16b, #12 - ushr v18.4s, v4.4s, #8 - shl v4.4s, v4.4s, #24 - orr v18.16b, v4.16b, v18.16b - ext v4.16b, v20.16b, v20.16b, #12 - add v7.4s, v18.4s, v7.4s - uzp1 v4.4s, v20.4s, v4.4s - ext v18.16b, v18.16b, v18.16b, #8 - eor v17.16b, v7.16b, v17.16b - add v19.4s, v19.4s, v4.4s - ext v7.16b, v7.16b, v7.16b, #4 - ushr v20.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - orr v17.16b, v17.16b, v20.16b - add v19.4s, v19.4s, v17.4s - eor v18.16b, v18.16b, v19.16b - rev32 v18.8h, v18.8h - add v6.4s, v7.4s, v18.4s - zip1 v7.4s, v2.4s, v3.4s - zip1 v2.4s, v3.4s, v2.4s - eor v3.16b, v6.16b, v17.16b - ext v2.16b, v2.16b, v7.16b, #8 - ushr v7.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v17.4s, v19.4s, v2.4s - zip1 v1.2d, v2.2d, v1.2d - zip2 v0.4s, v0.4s, v2.4s - orr v3.16b, v3.16b, v7.16b - mov v1.s[3], v4.s[3] - add v7.4s, v17.4s, v3.4s - eor v17.16b, v7.16b, v18.16b - ext v7.16b, v7.16b, v7.16b, #4 - ushr v18.4s, v17.4s, #8 + ushr v0.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + orr v22.16b, v22.16b, v0.16b + uzp2 v0.4s, v23.4s, v6.4s + add v20.4s, v24.4s, v1.4s + mov v19.s[3], v1.s[3] + add v24.4s, v20.4s, v22.4s + mov v0.d[0], v23.d[0] + eor v6.16b, v24.16b, v25.16b + ushr v20.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + ext v23.16b, v24.16b, v24.16b, #4 + orr v6.16b, v6.16b, v20.16b + ext v20.16b, v0.16b, v0.16b, #4 
+ add v25.4s, v6.4s, v16.4s + dup v16.4s, v24.s[1] + eor v24.16b, v25.16b, v22.16b + dup v27.4s, v25.s[3] + mov v23.s[0], v16.s[0] + dup v16.4s, v21.s[1] + mov v22.16b, v20.16b + ushr v26.4s, v24.4s, #7 + ext v21.16b, v6.16b, v6.16b, #8 + mov v22.s[0], v16.s[0] + dup v6.4s, v6.s[2] + shl v24.4s, v24.4s, #25 + mov v21.s[0], v6.s[0] + add v23.4s, v23.4s, v22.4s + ext v6.16b, v27.16b, v27.16b, #4 + orr v24.16b, v24.16b, v26.16b + add v23.4s, v23.4s, v24.4s + zip1 v7.2d, v1.2d, v7.2d + eor v21.16b, v21.16b, v23.16b + ext v6.16b, v6.16b, v25.16b, #12 + rev32 v21.8h, v21.8h + rev64 v25.4s, v4.4s + zip2 v1.4s, v3.4s, v1.4s + mov v0.s[2], v16.s[0] + add v26.4s, v6.4s, v21.4s + trn2 v6.4s, v25.4s, v19.4s + eor v24.16b, v26.16b, v24.16b + ushr v25.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + add v23.4s, v23.4s, v6.4s + orr v24.16b, v24.16b, v25.16b + add v23.4s, v23.4s, v24.4s + zip1 v5.4s, v5.4s, v1.4s + eor v21.16b, v23.16b, v21.16b + ushr v25.4s, v21.4s, #8 + shl v21.4s, v21.4s, #24 + dup v1.4s, v17.s[2] + orr v21.16b, v21.16b, v25.16b + add v25.4s, v21.4s, v26.4s + mov v26.16b, v7.16b + mov v26.s[3], v18.s[3] + eor v24.16b, v25.16b, v24.16b + dup v18.4s, v23.s[3] + ushr v27.4s, v24.4s, #7 + shl v24.4s, v24.4s, #25 + ext v18.16b, v18.16b, v18.16b, #4 + dup v17.4s, v25.s[1] + orr v24.16b, v24.16b, v27.16b + ext v27.16b, v26.16b, v26.16b, #12 + ext v28.16b, v21.16b, v21.16b, #8 + ext v23.16b, v18.16b, v23.16b, #12 + dup v21.4s, v21.s[2] + uzp1 v18.4s, v26.4s, v27.4s + mov v28.s[0], v21.s[0] + ext v3.16b, v25.16b, v25.16b, #4 + add v21.4s, v23.4s, v18.4s + add v23.4s, v21.4s, v24.4s + ext v21.16b, v5.16b, v1.16b, #4 + mov v3.s[0], v17.s[0] + eor v17.16b, v28.16b, v23.16b + rev32 v17.8h, v17.8h + ext v5.16b, v21.16b, v5.16b, #8 + add v25.4s, v3.4s, v17.4s + dup v3.4s, v22.s[2] + rev64 v21.4s, v5.4s + eor v5.16b, v25.16b, v24.16b + mov v24.16b, v3.16b + mov v24.s[1], v20.s[1] + ushr v22.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v23.4s, v23.4s, v21.4s + orr v5.16b, v5.16b, 
v22.16b + add v22.4s, v23.4s, v5.4s + uzp2 v23.4s, v24.4s, v19.4s + eor v17.16b, v22.16b, v17.16b + ushr v20.4s, v17.4s, #8 shl v17.4s, v17.4s, #24 - orr v17.16b, v17.16b, v18.16b - ext v18.16b, v16.16b, v16.16b, #8 - add v6.4s, v17.4s, v6.4s - uzp2 v5.4s, v18.4s, v5.4s - eor v3.16b, v6.16b, v3.16b - ext v5.16b, v5.16b, v18.16b, #4 - ext v6.16b, v6.16b, v6.16b, #12 - ushr v18.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - add v5.4s, v7.4s, v5.4s - ext v7.16b, v17.16b, v17.16b, #8 - ext v17.16b, v16.16b, v16.16b, #12 - orr v3.16b, v3.16b, v18.16b - ext v16.16b, v16.16b, v17.16b, #12 - add v5.4s, v3.4s, v5.4s - mov v17.16b, v2.16b - rev64 v16.4s, v16.4s - eor v7.16b, v7.16b, v5.16b - mov v17.s[1], v4.s[2] + mov v23.d[0], v24.d[0] + ext v24.16b, v22.16b, v22.16b, #4 + orr v20.16b, v17.16b, v20.16b + dup v17.4s, v22.s[1] + ext v19.16b, v23.16b, v23.16b, #4 + mov v24.s[0], v17.s[0] + dup v17.4s, v23.s[1] + add v25.4s, v20.4s, v25.4s + ext v22.16b, v20.16b, v20.16b, #8 + mov v19.s[0], v17.s[0] + dup v20.4s, v20.s[2] + dup v26.4s, v25.s[3] + eor v5.16b, v25.16b, v5.16b + ushr v23.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + mov v22.s[0], v20.s[0] + ext v20.16b, v26.16b, v26.16b, #4 + add v24.4s, v24.4s, v19.4s + orr v23.16b, v5.16b, v23.16b + add v24.4s, v24.4s, v23.4s + ext v5.16b, v20.16b, v25.16b, #12 + dup v20.4s, v18.s[2] + eor v22.16b, v22.16b, v24.16b + rev32 v16.8h, v22.8h + mov v20.s[3], v21.s[3] + rev64 v22.4s, v0.4s + zip1 v6.2d, v21.2d, v6.2d + add v25.4s, v5.4s, v16.4s + zip2 v4.4s, v4.4s, v21.4s + trn2 v5.4s, v22.4s, v20.4s + eor v22.16b, v25.16b, v23.16b + ushr v23.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v24.4s, v24.4s, v5.4s + orr v22.16b, v22.16b, v23.16b + add v23.4s, v24.4s, v22.4s + zip1 v2.4s, v2.4s, v4.4s + eor v16.16b, v23.16b, v16.16b + ushr v24.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + dup v27.4s, v23.s[3] + dup v4.4s, v7.s[2] + orr v16.16b, v16.16b, v24.16b + add v24.4s, v16.4s, v25.4s + mov v25.16b, v6.16b + mov v25.s[3], v18.s[3] + eor v22.16b, 
v24.16b, v22.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + dup v7.4s, v24.s[1] + ext v18.16b, v27.16b, v27.16b, #4 + dup v27.4s, v16.s[2] + orr v22.16b, v22.16b, v26.16b + ext v26.16b, v25.16b, v25.16b, #12 + ext v21.16b, v24.16b, v24.16b, #4 + ext v18.16b, v18.16b, v23.16b, #12 + ext v23.16b, v16.16b, v16.16b, #8 + uzp1 v16.4s, v25.4s, v26.4s + mov v21.s[0], v7.s[0] + ext v4.16b, v2.16b, v4.16b, #4 + mov v23.s[0], v27.s[0] + add v18.4s, v18.4s, v16.4s + mov v3.s[2], v17.s[0] + add v18.4s, v18.4s, v22.4s + ext v2.16b, v4.16b, v2.16b, #8 + eor v7.16b, v23.16b, v18.16b rev32 v7.8h, v7.8h - trn2 v16.4s, v16.4s, v17.4s - add v6.4s, v6.4s, v7.4s - add v5.4s, v5.4s, v16.4s - eor v3.16b, v6.16b, v3.16b - ushr v17.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v17.16b - add v5.4s, v5.4s, v3.4s - eor v7.16b, v5.16b, v7.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v16.4s, v7.4s, #8 + dup v17.4s, v16.s[2] + rev64 v2.4s, v2.4s + rev64 v3.4s, v3.4s + add v4.4s, v21.4s, v7.4s + eor v21.16b, v4.16b, v22.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + add v18.4s, v18.4s, v2.4s + orr v21.16b, v21.16b, v22.16b + dup v22.4s, v19.s[2] + add v18.4s, v18.4s, v21.4s + zip1 v22.4s, v20.4s, v22.4s + eor v7.16b, v18.16b, v7.16b + ushr v23.4s, v7.4s, #8 shl v7.4s, v7.4s, #24 - orr v7.16b, v7.16b, v16.16b - ext v16.16b, v1.16b, v1.16b, #12 - add v6.4s, v7.4s, v6.4s - uzp1 v1.4s, v1.4s, v16.4s - eor v3.16b, v6.16b, v3.16b - add v1.4s, v5.4s, v1.4s - ext v5.16b, v7.16b, v7.16b, #8 - ext v6.16b, v6.16b, v6.16b, #4 - ushr v16.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - orr v3.16b, v3.16b, v16.16b - add v1.4s, v1.4s, v3.4s - eor v5.16b, v5.16b, v1.16b - rev32 v5.8h, v5.8h - add v2.4s, v6.4s, v5.4s - zip1 v6.4s, v0.4s, v4.4s - zip1 v0.4s, v4.4s, v0.4s - eor v3.16b, v2.16b, v3.16b - ext v0.16b, v0.16b, v6.16b, #8 - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v0.4s, v1.4s, v0.4s - orr v1.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v1.4s - eor v3.16b, v0.16b, 
v5.16b - ext v0.16b, v0.16b, v0.16b, #4 + uzp2 v20.4s, v22.4s, v20.4s + ext v22.16b, v18.16b, v18.16b, #4 + dup v18.4s, v18.s[1] + orr v7.16b, v7.16b, v23.16b + ext v20.16b, v20.16b, v20.16b, #4 + mov v22.s[0], v18.s[0] + dup v18.4s, v19.s[1] + add v4.4s, v7.4s, v4.4s + ext v19.16b, v7.16b, v7.16b, #8 + mov v20.s[0], v18.s[0] + eor v21.16b, v4.16b, v21.16b + dup v7.4s, v7.s[2] + dup v23.4s, v4.s[3] + ushr v18.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + mov v19.s[0], v7.s[0] + ext v7.16b, v23.16b, v23.16b, #4 + add v20.4s, v22.4s, v20.4s + orr v18.16b, v21.16b, v18.16b + add v20.4s, v20.4s, v18.4s + ext v4.16b, v7.16b, v4.16b, #12 + eor v7.16b, v19.16b, v20.16b + rev32 v7.8h, v7.8h + mov v17.s[3], v2.s[3] + zip1 v5.2d, v2.2d, v5.2d + zip2 v0.4s, v0.4s, v2.4s + add v4.4s, v4.4s, v7.4s + trn2 v3.4s, v3.4s, v17.4s + eor v17.16b, v4.16b, v18.16b + ushr v18.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v3.4s, v20.4s, v3.4s + orr v17.16b, v17.16b, v18.16b + add v3.4s, v3.4s, v17.4s + mov v5.s[3], v16.s[3] + eor v7.16b, v3.16b, v7.16b + ushr v18.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + dup v19.4s, v3.s[3] + zip1 v0.4s, v1.4s, v0.4s + dup v1.4s, v6.s[2] + orr v7.16b, v7.16b, v18.16b + add v4.4s, v7.4s, v4.4s + ext v16.16b, v19.16b, v19.16b, #4 + eor v17.16b, v4.16b, v17.16b + ushr v18.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v3.16b, v16.16b, v3.16b, #12 + orr v17.16b, v17.16b, v18.16b + ext v18.16b, v5.16b, v5.16b, #12 + ext v16.16b, v7.16b, v7.16b, #8 + dup v7.4s, v7.s[2] + ext v2.16b, v4.16b, v4.16b, #4 + dup v4.4s, v4.s[1] + uzp1 v5.4s, v5.4s, v18.4s + mov v16.s[0], v7.s[0] + ext v1.16b, v0.16b, v1.16b, #4 + mov v2.s[0], v4.s[0] + add v3.4s, v3.4s, v5.4s + add v3.4s, v3.4s, v17.4s + ext v0.16b, v1.16b, v0.16b, #8 + eor v4.16b, v16.16b, v3.16b + rev32 v4.8h, v4.8h + rev64 v0.4s, v0.4s + add v1.4s, v2.4s, v4.4s + eor v2.16b, v1.16b, v17.16b + ushr v5.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v0.4s, v3.4s, v0.4s + orr v2.16b, v2.16b, v5.16b + add v0.4s, v0.4s, 
v2.4s + eor v3.16b, v0.16b, v4.16b ushr v4.4s, v3.4s, #8 shl v3.4s, v3.4s, #24 orr v3.16b, v3.16b, v4.16b - add v2.4s, v3.4s, v2.4s + add v1.4s, v3.4s, v1.4s + ext v4.16b, v0.16b, v0.16b, #4 + dup v7.4s, v1.s[3] + dup v6.4s, v3.s[2] + dup v0.4s, v0.s[1] ext v3.16b, v3.16b, v3.16b, #8 - eor v1.16b, v2.16b, v1.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v4.4s, v1.4s, #7 - shl v1.4s, v1.4s, #25 - stp q2, q3, [x0, #32] - orr v1.16b, v1.16b, v4.16b - stp q0, q1, [x0] + ext v7.16b, v7.16b, v7.16b, #4 + eor v2.16b, v1.16b, v2.16b + mov v4.s[0], v0.s[0] + mov v3.s[0], v6.s[0] + ext v0.16b, v7.16b, v1.16b, #12 + ushr v5.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v5.16b + stp q0, q3, [x0, #32] + stp q4, q2, [x0] ret .Lfunc_end1: .size compress_pre, .Lfunc_end1-compress_pre @@ -554,12 +670,11 @@ compress_pre: .type zfs_blake3_compress_xof_sse2,@function zfs_blake3_compress_xof_sse2: .cfi_startproc - hint #25 - .cfi_negate_ra_state sub sp, sp, #96 + .cfi_def_cfa_offset 96 stp x29, x30, [sp, #64] - add x29, sp, #64 stp x20, x19, [sp, #80] + add x29, sp, #64 .cfi_def_cfa w29, 32 .cfi_offset w19, -8 .cfi_offset w20, -16 @@ -578,7 +693,6 @@ zfs_blake3_compress_xof_sse2: ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b eor v1.16b, v3.16b, v1.16b - ldp x29, x30, [sp, #64] stp q0, q1, [x19] ldr q0, [x20] eor v0.16b, v0.16b, v2.16b @@ -586,16 +700,22 @@ zfs_blake3_compress_xof_sse2: ldr q0, [x20, #16] eor v0.16b, v0.16b, v3.16b str q0, [x19, #48] + .cfi_def_cfa wsp, 96 ldp x20, x19, [sp, #80] + ldp x29, x30, [sp, #64] add sp, sp, #96 - hint #29 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w20 + .cfi_restore w30 + .cfi_restore w29 ret .Lfunc_end2: .size zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 + .p2align 4, 0x0 .LCPI3_0: .word 0 .word 1 @@ -607,20 +727,18 @@ zfs_blake3_compress_xof_sse2: .type zfs_blake3_hash_many_sse2,@function zfs_blake3_hash_many_sse2: 
.cfi_startproc - hint #25 - .cfi_negate_ra_state stp d15, d14, [sp, #-160]! + .cfi_def_cfa_offset 160 stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] stp x29, x30, [sp, #64] - add x29, sp, #64 stp x28, x27, [sp, #80] stp x26, x25, [sp, #96] stp x24, x23, [sp, #112] stp x22, x21, [sp, #128] stp x20, x19, [sp, #144] - sub sp, sp, #464 + add x29, sp, #64 .cfi_def_cfa w29, 96 .cfi_offset w19, -8 .cfi_offset w20, -16 @@ -642,57 +760,58 @@ zfs_blake3_hash_many_sse2: .cfi_offset b13, -144 .cfi_offset b14, -152 .cfi_offset b15, -160 + sub sp, sp, #480 mov w19, w6 mov x20, x4 - mov x24, x1 + mov x23, x1 + mov x24, x0 ldr x26, [x29, #104] - ldrb w27, [x29, #96] cmp x1, #4 + ldrb w8, [x29, #96] str x3, [sp, #40] b.lo .LBB3_6 - adrp x8, .LCPI3_0 - sbfx w9, w5, #0, #1 - mov w10, #44677 - mov w11, #62322 - movk w10, #47975, lsl #16 - movk w11, #15470, lsl #16 - ldr q0, [x8, :lo12:.LCPI3_0] - dup v1.4s, w9 - mov w9, #58983 - orr w8, w7, w19 - movk w9, #27145, lsl #16 - and v0.16b, v1.16b, v0.16b + adrp x10, .LCPI3_0 + sbfx w11, w5, #0, #1 + mov w12, #62322 + orr w9, w7, w19 + movk w12, #15470, lsl #16 + ldr q0, [x10, :lo12:.LCPI3_0] + mov w10, #58983 + movk w10, #27145, lsl #16 dup v1.4s, w11 - movi v24.4s, #64 - dup v2.4s, w9 - mov w9, #62778 - movk w9, #42319, lsl #16 + mov w11, #44677 + movk w11, #47975, lsl #16 + dup v29.4s, w10 + mov w10, #62778 + movk w10, #42319, lsl #16 + and v0.16b, v1.16b, v0.16b + dup v26.4s, w11 + dup v1.4s, w12 str q0, [sp, #16] orr v0.4s, #128, lsl #24 - stp q2, q1, [sp, #48] + stp q26, q29, [sp, #64] str q0, [sp] - dup v0.4s, w10 - str q0, [sp, #80] + str q1, [sp, #48] b .LBB3_3 .LBB3_2: - zip1 v0.4s, v12.4s, v31.4s - add x10, x20, #4 - zip1 v1.4s, v29.4s, v30.4s + zip1 v0.4s, v7.4s, v8.4s + add x11, x20, #4 + zip1 v1.4s, v9.4s, v25.4s tst w5, #0x1 - zip1 v2.4s, v28.4s, v23.4s - csel x20, x10, x20, ne - zip1 v3.4s, v13.4s, v25.4s - add x0, x0, #32 - zip2 v6.4s, v12.4s, v31.4s - sub x24, x24, #4 + zip1 v2.4s, v24.4s, v13.4s + 
csel x20, x11, x20, ne + zip1 v3.4s, v14.4s, v11.4s + add x24, x24, #32 + zip2 v6.4s, v7.4s, v8.4s + sub x23, x23, #4 zip1 v4.2d, v0.2d, v1.2d - cmp x24, #3 - zip2 v7.4s, v29.4s, v30.4s + cmp x23, #3 + zip2 v7.4s, v9.4s, v25.4s zip1 v5.2d, v2.2d, v3.2d zip2 v0.2d, v0.2d, v1.2d zip2 v1.2d, v2.2d, v3.2d - zip2 v2.4s, v28.4s, v23.4s - zip2 v3.4s, v13.4s, v25.4s + zip2 v2.4s, v24.4s, v13.4s + zip2 v3.4s, v14.4s, v11.4s stp q4, q5, [x26] zip2 v4.2d, v6.2d, v7.2d stp q0, q1, [x26, #32] @@ -704,1316 +823,1334 @@ zfs_blake3_hash_many_sse2: add x26, x26, #128 b.ls .LBB3_6 .LBB3_3: - ldr x14, [sp, #40] - mov x10, x14 - add x11, x14, #8 - add x12, x14, #12 - add x13, x14, #16 - ld1r { v12.4s }, [x10], #4 - ld1r { v29.4s }, [x11] - add x11, x14, #20 - ld1r { v30.4s }, [x12] - add x12, x14, #24 - ld1r { v28.4s }, [x13] - ld1r { v23.4s }, [x11] - add x11, x14, #28 + ldr x15, [sp, #40] + mov x11, x15 + add x12, x15, #8 + add x13, x15, #12 + add x14, x15, #16 + ld1r { v7.4s }, [x11], #4 + ld1r { v9.4s }, [x12] + add x12, x15, #20 + ld1r { v25.4s }, [x13] + add x13, x15, #24 + ld1r { v24.4s }, [x14] ld1r { v13.4s }, [x12] - ld1r { v31.4s }, [x10] - ld1r { v25.4s }, [x11] + add x12, x15, #28 + ld1r { v14.4s }, [x13] + ld1r { v8.4s }, [x11] + ld1r { v11.4s }, [x12] cbz x2, .LBB3_2 ldr q1, [sp, #16] dup v0.4s, w20 - lsr x12, x20, #32 - mov x10, xzr - ldp x13, x14, [x0, #16] + lsr x15, x20, #32 + mov x11, xzr + ldp x12, x13, [x24] add v1.4s, v0.4s, v1.4s - mov x15, x2 + mov x16, x2 movi v0.4s, #128, lsl #24 - mov w4, w8 + mov w3, w9 str q1, [sp, #112] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w12 - ldp x11, x12, [x0] + dup v1.4s, w15 + ldp x14, x15, [x24, #16] sub v0.4s, v1.4s, v0.4s str q0, [sp, #96] .LBB3_5: - add x17, x11, x10 - add x21, x12, x10 - add x16, x13, x10 - add x6, x14, x10 - subs x15, x15, #1 - add x10, x10, #64 - ldp q0, q1, [x17] - csel w3, w27, wzr, eq - orr w3, w3, w4 - mov w4, w19 + subs x16, x16, #1 + add x1, x12, x11 + add x6, 
x13, x11 + add x17, x14, x11 + add x0, x15, x11 + csel w4, w8, wzr, eq + orr w3, w4, w3 + add x11, x11, #64 and w3, w3, #0xff - ldp q3, q6, [x21] - dup v2.4s, w3 - zip1 v21.4s, v0.4s, v3.4s - zip2 v19.4s, v0.4s, v3.4s - ldp q5, q7, [x16] - zip1 v17.4s, v1.4s, v6.4s - zip2 v22.4s, v1.4s, v6.4s - ldp q16, q18, [x6] - zip1 v4.4s, v5.4s, v16.4s - zip2 v0.4s, v5.4s, v16.4s - ldp q26, q27, [x17, #32] - zip1 v1.4s, v7.4s, v18.4s - zip2 v3.4s, v7.4s, v18.4s - zip2 v20.2d, v19.2d, v0.2d - mov v19.d[1], v0.d[0] - dup v18.4s, w9 - ldp q8, q9, [x21, #32] - stur q19, [x29, #-208] - zip2 v7.4s, v26.4s, v8.4s - zip1 v10.4s, v26.4s, v8.4s - ldp q11, q5, [x16, #32] - zip2 v26.2d, v17.2d, v1.2d - stp q7, q26, [sp, #192] - mov v17.d[1], v1.d[0] - add v1.4s, v23.4s, v31.4s - ldp q16, q6, [x6, #32] + ldp q0, q3, [x1] + mov v1.16b, v7.16b + dup v5.4s, w3 + mov w3, w19 + mov v23.16b, v26.16b + str q5, [sp, #192] + ldp q2, q4, [x6] + zip1 v21.4s, v0.4s, v2.4s + zip2 v19.4s, v0.4s, v2.4s + ldp q6, q16, [x17] + zip2 v22.4s, v3.4s, v4.4s + zip1 v17.4s, v3.4s, v4.4s + ldp q7, q18, [x0] + zip1 v5.4s, v6.4s, v7.4s + zip2 v2.4s, v6.4s, v7.4s + ldp q10, q6, [x17, #32] + zip1 v3.4s, v16.4s, v18.4s + zip2 v4.4s, v16.4s, v18.4s + zip2 v30.2d, v19.2d, v2.2d + mov v19.d[1], v2.d[0] + zip2 v28.2d, v17.2d, v3.2d + ldp q12, q7, [x0, #32] + mov v17.d[1], v3.d[0] + zip2 v31.2d, v22.2d, v4.2d + add v3.4s, v9.4s, v14.4s + stur q28, [x29, #-224] + add v2.4s, v8.4s, v13.4s stur q17, [x29, #-256] - add v1.4s, v1.4s, v19.4s - zip1 v8.4s, v11.4s, v16.4s - zip2 v7.4s, v11.4s, v16.4s - zip1 v11.4s, v27.4s, v9.4s - zip2 v9.4s, v27.4s, v9.4s - zip2 v27.2d, v21.2d, v4.2d - mov v21.d[1], v4.d[0] - str q7, [sp, #224] - add v4.4s, v28.4s, v12.4s - zip1 v15.4s, v5.4s, v6.4s - zip2 v14.4s, v5.4s, v6.4s - stur q27, [x29, #-192] - zip2 v16.2d, v22.2d, v3.2d - stp q20, q21, [x29, #-240] - add v0.4s, v4.4s, v21.4s - ldp q6, q4, [sp, #96] - mov v22.d[1], v3.d[0] - add v5.4s, v25.4s, v30.4s - add v3.4s, v13.4s, v29.4s - eor 
v6.16b, v1.16b, v6.16b - add v1.4s, v1.4s, v20.4s - str q22, [sp, #256] - eor v4.16b, v0.16b, v4.16b - add v5.4s, v5.4s, v22.4s - add v3.4s, v3.4s, v17.4s - ldr q17, [sp, #48] - rev32 v6.8h, v6.8h + ldp q20, q26, [x1, #32] + zip2 v15.4s, v6.4s, v7.4s + add v2.4s, v2.4s, v19.4s + ldp q27, q0, [x6, #32] + zip1 v18.4s, v20.4s, v27.4s + zip2 v16.4s, v20.4s, v27.4s + zip1 v20.4s, v6.4s, v7.4s + mov v6.16b, v22.16b + mov v6.d[1], v4.d[0] + add v4.4s, v25.4s, v11.4s + str q16, [sp, #240] + zip1 v27.4s, v10.4s, v12.4s + zip2 v16.4s, v10.4s, v12.4s + zip1 v12.4s, v26.4s, v0.4s + stur q6, [x29, #-192] + zip2 v10.4s, v26.4s, v0.4s + zip2 v26.2d, v21.2d, v5.2d + mov v21.d[1], v5.d[0] + stp q16, q18, [sp, #256] + add v5.4s, v3.4s, v17.4s + add v6.4s, v4.4s, v6.4s + ldp q4, q3, [sp, #96] + add v0.4s, v1.4s, v24.4s + dup v18.4s, w10 + movi v7.4s, #64 + str q21, [sp, #160] + add v0.4s, v0.4s, v21.4s + stur q26, [x29, #-240] + eor v4.16b, v2.16b, v4.16b rev32 v4.8h, v4.8h - eor v2.16b, v5.16b, v2.16b - eor v7.16b, v3.16b, v24.16b - add v0.4s, v0.4s, v27.4s - add v21.4s, v4.4s, v17.4s - rev32 v31.8h, v2.8h - ldr q2, [sp, #80] - rev32 v7.8h, v7.8h - mov v27.16b, v16.16b - eor v17.16b, v21.16b, v28.16b - add v29.4s, v6.4s, v2.4s - ldr q2, [sp, #64] - add v24.4s, v31.4s, v18.4s - str q27, [sp, #176] - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - add v30.4s, v7.4s, v2.4s - eor v18.16b, v29.16b, v23.16b - orr v12.16b, v17.16b, v19.16b - eor v17.16b, v30.16b, v13.16b - eor v19.16b, v24.16b, v25.16b - ushr v23.4s, v18.4s, #12 - shl v18.4s, v18.4s, #20 - ushr v25.4s, v17.4s, #12 + eor v3.16b, v0.16b, v3.16b + ldr q1, [sp, #192] + rev32 v3.8h, v3.8h + eor v7.16b, v5.16b, v7.16b + rev32 v16.8h, v7.8h + add v8.4s, v4.4s, v23.4s + add v29.4s, v3.4s, v29.4s + eor v1.16b, v6.16b, v1.16b + eor v7.16b, v29.16b, v24.16b + ushr v17.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + rev32 v1.8h, v1.8h + orr v23.16b, v7.16b, v17.16b + eor v7.16b, v8.16b, v13.16b + ushr v17.4s, v7.4s, #12 + shl v7.4s, 
v7.4s, #20 + add v9.4s, v1.4s, v18.4s + orr v13.16b, v7.16b, v17.16b + ldr q7, [sp, #48] + add v2.4s, v2.4s, v30.4s + add v0.4s, v0.4s, v26.4s + add v25.4s, v16.4s, v7.4s + mov v22.16b, v19.16b + eor v7.16b, v25.16b, v14.16b + str q22, [sp, #224] + ushr v18.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + add v19.4s, v0.4s, v23.4s + orr v14.16b, v7.16b, v18.16b + add v18.4s, v2.4s, v13.4s + eor v17.16b, v9.16b, v11.16b + eor v2.16b, v18.16b, v4.16b + ushr v4.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + ushr v24.4s, v17.4s, #12 shl v17.4s, v17.4s, #20 - ushr v28.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v13.16b, v18.16b, v23.16b - orr v25.16b, v17.16b, v25.16b - orr v2.16b, v19.16b, v28.16b - add v28.4s, v0.4s, v12.4s - add v0.4s, v3.4s, v26.4s - add v18.4s, v1.4s, v13.4s - add v3.4s, v5.4s, v16.4s - eor v1.16b, v28.16b, v4.16b - add v17.4s, v0.4s, v25.4s - eor v0.16b, v18.16b, v6.16b - add v19.4s, v3.4s, v2.4s - ushr v16.4s, v1.4s, #8 - shl v3.4s, v1.4s, #24 - eor v4.16b, v17.16b, v7.16b - ushr v6.4s, v0.4s, #8 - shl v1.4s, v0.4s, #24 - eor v5.16b, v19.16b, v31.16b - ushr v23.4s, v4.4s, #8 - shl v4.4s, v4.4s, #24 - orr v7.16b, v3.16b, v16.16b - orr v6.16b, v1.16b, v6.16b - ushr v31.4s, v5.4s, #8 - shl v0.4s, v5.4s, #24 - orr v5.16b, v4.16b, v23.16b - add v4.4s, v7.4s, v21.4s - ldr q21, [sp, #192] - add v3.4s, v6.4s, v29.4s - orr v31.16b, v0.16b, v31.16b - add v23.4s, v5.4s, v30.4s - eor v0.16b, v4.16b, v12.16b - eor v1.16b, v3.16b, v13.16b - add v16.4s, v31.4s, v24.4s - eor v20.16b, v23.16b, v25.16b - ushr v24.4s, v0.4s, #7 + eor v0.16b, v19.16b, v3.16b + ushr v3.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + orr v4.16b, v2.16b, v4.16b + add v2.4s, v6.4s, v31.4s + orr v7.16b, v17.16b, v24.16b + orr v17.16b, v0.16b, v3.16b + add v0.4s, v5.4s, v28.4s + add v24.4s, v2.4s, v7.4s + add v3.4s, v0.4s, v14.4s + eor v1.16b, v24.16b, v1.16b + ushr v5.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + eor v0.16b, v3.16b, v16.16b + mov v21.16b, v31.16b + ushr v2.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + 
orr v31.16b, v1.16b, v5.16b + add v5.4s, v17.4s, v29.4s + mov v11.16b, v26.16b + mov v26.16b, v30.16b + orr v30.16b, v0.16b, v2.16b + stur q26, [x29, #-208] + add v6.4s, v4.4s, v8.4s + eor v0.16b, v5.16b, v23.16b + ushr v2.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v29.4s, v1.4s, #7 + eor v1.16b, v6.16b, v13.16b + ldr q13, [sp, #160] + ushr v23.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v30.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v25.16b, v0.16b, v24.16b - orr v0.16b, v1.16b, v29.16b - mov v29.16b, v10.16b - orr v1.16b, v20.16b, v30.16b - mov v20.16b, v10.16b - mov v24.16b, v21.16b - ldr q20, [sp, #224] - mov v29.d[1], v8.d[0] - mov v13.16b, v9.16b - zip2 v30.2d, v10.2d, v8.2d - zip2 v8.2d, v21.2d, v20.2d - mov v26.16b, v11.16b - mov v24.d[1], v20.d[0] - add v20.4s, v28.4s, v29.4s - mov v13.d[1], v14.d[0] - str q8, [sp, #128] - eor v2.16b, v16.16b, v2.16b - mov v26.d[1], v15.d[0] - str q24, [sp, #192] - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v13.4s - ushr v12.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - zip2 v10.2d, v9.2d, v14.2d - add v18.4s, v18.4s, v24.4s - add v17.4s, v17.4s, v26.4s - mov v14.16b, v26.16b - eor v26.16b, v20.16b, v31.16b - stp q10, q30, [sp, #224] - add v19.4s, v19.4s, v25.4s - orr v2.16b, v2.16b, v12.16b - add v18.4s, v18.4s, v1.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - eor v7.16b, v18.16b, v7.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - eor v6.16b, v17.16b, v6.16b - rev32 v7.8h, v7.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - rev32 v6.8h, v6.8h - add v16.4s, v16.4s, v7.4s + orr v16.16b, v0.16b, v2.16b + add v28.4s, v30.4s, v25.4s + ldr q25, [sp, #272] + add v2.4s, v31.4s, v9.4s + orr v8.16b, v1.16b, v23.16b + eor v0.16b, v28.16b, v14.16b + eor v1.16b, v2.16b, v7.16b + ushr v7.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v23.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v0.16b, v0.16b, v7.16b + mov v7.16b, v25.16b + orr v1.16b, v1.16b, v23.16b + mov v14.16b, v7.16b 
+ ldp q23, q7, [sp, #240] + zip2 v25.2d, v25.2d, v27.2d + mov v9.16b, v23.16b + mov v9.d[1], v7.d[0] + mov v14.d[1], v27.d[0] + zip2 v29.2d, v23.2d, v7.2d + mov v27.16b, v12.16b + zip2 v23.2d, v12.2d, v20.2d + str q9, [sp, #240] + mov v12.16b, v10.16b + mov v12.d[1], v15.d[0] + add v18.4s, v18.4s, v9.4s + add v7.4s, v19.4s, v14.4s + stp q23, q25, [sp, #128] + add v18.4s, v18.4s, v0.4s + add v19.4s, v24.4s, v12.4s + stp q12, q14, [sp, #256] + eor v17.16b, v18.16b, v17.16b + ldur q12, [x29, #-224] + add v19.4s, v19.4s, v16.4s + rev32 v17.8h, v17.8h + eor v24.16b, v19.16b, v30.16b + rev32 v24.8h, v24.8h + mov v27.d[1], v20.d[0] + add v2.4s, v2.4s, v17.4s + add v7.4s, v7.4s, v8.4s + zip2 v10.2d, v10.2d, v15.2d + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s + add v3.4s, v3.4s, v27.4s + eor v20.16b, v7.16b, v31.16b + str q10, [sp, #176] ushr v31.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - add v4.4s, v4.4s, v6.4s - eor v1.16b, v16.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v29.4s + add v3.4s, v3.4s, v1.4s orr v0.16b, v0.16b, v31.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - add v20.4s, v20.4s, v30.4s - zip2 v21.2d, v11.2d, v15.2d - ushr v11.4s, v1.4s, #12 - shl v1.4s, v1.4s, #20 - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v31.16b + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + rev32 v20.8h, v20.8h + add v18.4s, v18.4s, v0.4s add v19.4s, v19.4s, v10.4s - add v20.4s, v20.4s, v0.4s - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v18.4s, v18.4s, v8.4s - add v19.4s, v19.4s, v25.4s - eor v26.16b, v20.16b, v26.16b - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v21.4s - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - ushr v31.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v17.4s, v17.4s, v2.4s - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, v18.16b, v7.16b - orr v26.16b, v26.16b, v31.16b - eor v6.16b, v17.16b, v6.16b - orr 
v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - add v23.4s, v26.4s, v23.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - eor v0.16b, v23.16b, v0.16b - ldp q28, q12, [x29, #-256] - orr v6.16b, v6.16b, v11.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b + eor v4.16b, v3.16b, v4.16b + orr v16.16b, v16.16b, v31.16b + rev32 v4.8h, v4.8h + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + mov v14.16b, v27.16b + add v27.4s, v28.4s, v20.4s + eor v24.16b, v19.16b, v24.16b + orr v17.16b, v17.16b, v31.16b + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v27.16b, v8.16b + add v5.4s, v5.4s, v4.4s + add v2.4s, v17.4s, v2.4s + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v31.16b + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v25.4s + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v4.4s, v6.4s, v4.4s - ushr v11.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b + add v7.4s, v7.4s, v28.4s + add v3.4s, v3.4s, v23.4s + eor v16.16b, v6.16b, v16.16b + orr v1.16b, v1.16b, v30.16b orr v0.16b, v0.16b, v31.16b - add v18.4s, v18.4s, v12.4s - mov v15.16b, v29.16b - ldur q29, [x29, #-208] - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + eor v20.16b, v7.16b, v20.16b + add v3.4s, v3.4s, v1.4s + add v7.4s, v7.4s, v22.4s + orr v16.16b, v16.16b, v31.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + eor v4.16b, v3.16b, v4.16b + add v3.4s, v3.4s, v21.4s + add v7.4s, v7.4s, v16.4s + orr v20.16b, v20.16b, v30.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v3.4s, v3.4s, v0.4s + eor v17.16b, v17.16b, v7.16b + add v27.4s, 
v20.4s, v27.4s + rev32 v17.8h, v17.8h + orr v4.16b, v4.16b, v30.16b + eor v24.16b, v3.16b, v24.16b + eor v28.16b, v27.16b, v28.16b + add v5.4s, v4.4s, v5.4s + rev32 v24.8h, v24.8h + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v5.16b, v1.16b + mov v15.16b, v21.16b + ldur q21, [x29, #-256] + add v5.4s, v5.4s, v17.4s + str q15, [sp, #208] + mov v8.16b, v10.16b + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - str q15, [sp, #160] - add v20.4s, v20.4s, v29.4s - add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v27.4s - eor v6.16b, v6.16b, v18.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v28.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, v5.16b - add v3.4s, v3.4s, v6.4s - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - eor v0.16b, v3.16b, v0.16b - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - ushr v11.4s, v0.4s, #12 + mov v10.16b, v26.16b + add v18.4s, v18.4s, v26.4s + ldur q26, [x29, #-192] + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + orr v1.16b, v1.16b, v30.16b + add v19.4s, v19.4s, v21.4s + add v18.4s, v18.4s, v28.4s + ushr v30.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v0.16b, v27.16b, v0.16b + add v7.4s, v7.4s, v26.4s + add v19.4s, v19.4s, v1.4s + eor v4.16b, v4.16b, v18.16b + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v16.4s, v16.4s, v26.4s - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v18.4s, v18.4s, v24.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + rev32 v4.8h, v4.8h + add v7.4s, v7.4s, v16.4s + add v3.4s, v3.4s, v13.4s + eor v20.16b, v19.16b, v20.16b + orr v0.16b, v0.16b, 
v30.16b + rev32 v20.8h, v20.8h + eor v17.16b, v7.16b, v17.16b + add v3.4s, v3.4s, v0.4s + ushr v30.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v6.4s, v6.4s, v4.4s + eor v24.16b, v3.16b, v24.16b + orr v17.16b, v17.16b, v30.16b + ushr v30.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v6.16b, v28.16b + add v2.4s, v2.4s, v20.4s + add v5.4s, v17.4s, v5.4s + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v30.16b + eor v1.16b, v2.16b, v1.16b + add v18.4s, v18.4s, v9.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v24.4s, v27.4s + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v20.4s, v20.4s, v22.4s - add v18.4s, v18.4s, v0.4s - mov v9.16b, v30.16b - mov v30.16b, v21.16b - ldur q21, [x29, #-224] - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + ushr v30.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + add v18.4s, v18.4s, v28.4s + add v19.4s, v19.4s, v23.4s + eor v0.16b, v27.16b, v0.16b orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - str q30, [sp, #144] - add v17.4s, v17.4s, v21.4s - ldur q21, [x29, #-192] - eor v6.16b, v18.16b, v6.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v30.4s - eor v7.16b, v20.16b, v7.16b - add v17.4s, v17.4s, v1.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v19.4s, v19.4s, v2.4s - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - eor v5.16b, v17.16b, v5.16b - orr v6.16b, v6.16b, v11.16b - eor v26.16b, v19.16b, v26.16b - orr v7.16b, v7.16b, v31.16b - ushr v31.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - add v3.4s, v6.4s, v3.4s - ushr v11.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v4.4s, v7.4s, v4.4s - orr v5.16b, v5.16b, v31.16b - eor v0.16b, v3.16b, v0.16b - orr v26.16b, v26.16b, v11.16b - eor v25.16b, v4.16b, v25.16b - add v23.4s, v5.4s, v23.4s - ushr v11.4s, v0.4s, #7 + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v16.4s, v26.4s, v16.4s - ushr v31.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, 
v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v20.4s, v20.4s, v21.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + eor v4.16b, v18.16b, v4.16b + add v19.4s, v19.4s, v1.4s + add v18.4s, v18.4s, v14.4s + orr v0.16b, v0.16b, v30.16b + ushr v31.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + eor v20.16b, v19.16b, v20.16b + add v19.4s, v19.4s, v8.4s + add v18.4s, v18.4s, v0.4s + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v19.4s, v19.4s, v16.4s + eor v17.16b, v18.16b, v17.16b + add v6.4s, v4.4s, v6.4s + rev32 v17.8h, v17.8h + orr v20.16b, v20.16b, v31.16b + eor v24.16b, v19.16b, v24.16b + eor v28.16b, v6.16b, v28.16b + add v2.4s, v20.4s, v2.4s + rev32 v24.8h, v24.8h + ushr v31.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v2.16b, v1.16b + add v7.4s, v7.4s, v11.4s + add v2.4s, v2.4s, v17.4s + ldp q11, q8, [sp, #256] + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v10.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s orr v1.16b, v1.16b, v31.16b - add v18.4s, v18.4s, v14.4s - eor v26.16b, v20.16b, v26.16b - add v19.4s, v19.4s, v25.4s - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v9.4s - ldr q9, [sp, #208] - add v18.4s, v18.4s, v1.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - eor v7.16b, v18.16b, v7.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - eor v6.16b, v17.16b, v6.16b - rev32 v7.8h, v7.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - rev32 v6.8h, v6.8h - add v16.4s, v16.4s, v7.4s + add v3.4s, v3.4s, v25.4s + add v7.4s, v7.4s, v28.4s ushr v31.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - add v4.4s, v4.4s, v6.4s - eor v1.16b, v16.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v12.4s + add v3.4s, v3.4s, v1.4s + eor v20.16b, v7.16b, v20.16b orr v0.16b, v0.16b, 
v31.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - add v20.4s, v20.4s, v8.4s - ushr v11.4s, v1.4s, #12 + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + rev32 v20.8h, v20.8h + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v8.4s + eor v4.16b, v3.16b, v4.16b + orr v16.16b, v16.16b, v31.16b + rev32 v4.8h, v4.8h + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v27.4s, v27.4s, v20.4s + eor v24.16b, v19.16b, v24.16b + orr v17.16b, v17.16b, v31.16b + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v27.16b, v28.16b + add v5.4s, v5.4s, v4.4s + add v2.4s, v17.4s, v2.4s + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v31.16b + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v29.4s + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v31.16b - add v19.4s, v19.4s, v15.4s - add v20.4s, v20.4s, v0.4s - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v18.4s, v18.4s, v9.4s - add v19.4s, v19.4s, v25.4s - eor v26.16b, v20.16b, v26.16b - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v13.4s - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - ushr v31.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v17.4s, v17.4s, v2.4s - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, v18.16b, v7.16b - orr v26.16b, v26.16b, v31.16b - eor v6.16b, v17.16b, v6.16b - orr v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - add v23.4s, v26.4s, v23.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - eor v0.16b, v23.16b, v0.16b - orr v6.16b, v6.16b, v11.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v4.4s, v6.4s, v4.4s - ushr v11.4s, v25.4s, 
#7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b + add v7.4s, v7.4s, v28.4s + add v3.4s, v3.4s, v11.4s + eor v16.16b, v6.16b, v16.16b + orr v1.16b, v1.16b, v30.16b orr v0.16b, v0.16b, v31.16b - add v18.4s, v18.4s, v24.4s - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + eor v20.16b, v7.16b, v20.16b + add v3.4s, v3.4s, v1.4s + add v7.4s, v7.4s, v10.4s + orr v16.16b, v16.16b, v31.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + eor v4.16b, v3.16b, v4.16b + add v3.4s, v3.4s, v23.4s + add v7.4s, v7.4s, v16.4s + orr v20.16b, v20.16b, v30.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v3.4s, v3.4s, v0.4s + eor v17.16b, v17.16b, v7.16b + add v27.4s, v20.4s, v27.4s + rev32 v17.8h, v17.8h + orr v4.16b, v4.16b, v30.16b + eor v24.16b, v3.16b, v24.16b + eor v28.16b, v27.16b, v28.16b + add v5.4s, v4.4s, v5.4s + rev32 v24.8h, v24.8h + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v5.16b, v1.16b + add v5.4s, v5.4s, v17.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v12.4s - add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v30.4s - eor v6.16b, v6.16b, v18.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v27.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, v5.16b - add v3.4s, v3.4s, v6.4s - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - eor v0.16b, v3.16b, v0.16b - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - ushr v11.4s, v0.4s, #12 + add v18.4s, v18.4s, v9.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + orr v1.16b, v1.16b, v30.16b + add v19.4s, v19.4s, v15.4s + add v18.4s, v18.4s, v28.4s + 
ushr v30.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v0.16b, v27.16b, v0.16b + add v7.4s, v7.4s, v21.4s + add v19.4s, v19.4s, v1.4s + eor v4.16b, v4.16b, v18.16b + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v16.4s, v16.4s, v26.4s - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b + rev32 v4.8h, v4.8h + add v7.4s, v7.4s, v16.4s + add v3.4s, v3.4s, v22.4s + ldur q22, [x29, #-192] + eor v20.16b, v19.16b, v20.16b + orr v0.16b, v0.16b, v30.16b + rev32 v20.8h, v20.8h + eor v17.16b, v7.16b, v17.16b + add v3.4s, v3.4s, v0.4s + ushr v30.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v6.4s, v6.4s, v4.4s + eor v24.16b, v3.16b, v24.16b + orr v17.16b, v17.16b, v30.16b + ushr v30.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v6.16b, v28.16b + add v2.4s, v2.4s, v20.4s + add v5.4s, v17.4s, v5.4s + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v30.16b + eor v1.16b, v2.16b, v1.16b add v18.4s, v18.4s, v14.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + eor v16.16b, v5.16b, v16.16b + add v27.4s, v24.4s, v27.4s + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v20.4s, v20.4s, v28.4s - add v18.4s, v18.4s, v0.4s - mov v10.16b, v13.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + ushr v30.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + add v18.4s, v18.4s, v28.4s + add v19.4s, v19.4s, v11.4s + eor v0.16b, v27.16b, v0.16b orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v29.4s - eor v6.16b, v18.16b, v6.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v10.4s - eor v7.16b, v20.16b, v7.16b - add v17.4s, v17.4s, v1.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v19.4s, v19.4s, v2.4s - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - eor v5.16b, v17.16b, v5.16b - orr v6.16b, v6.16b, v11.16b - eor v26.16b, v19.16b, v26.16b - orr v7.16b, v7.16b, v31.16b 
- ushr v31.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - add v3.4s, v6.4s, v3.4s - ushr v11.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v4.4s, v7.4s, v4.4s - orr v5.16b, v5.16b, v31.16b - eor v0.16b, v3.16b, v0.16b - mov v22.16b, v8.16b - ldp q8, q28, [sp, #240] - orr v26.16b, v26.16b, v11.16b - eor v25.16b, v4.16b, v25.16b - add v23.4s, v5.4s, v23.4s - ushr v11.4s, v0.4s, #7 + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v16.4s, v26.4s, v16.4s - ushr v31.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v20.4s, v20.4s, v28.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + eor v4.16b, v18.16b, v4.16b + add v19.4s, v19.4s, v1.4s + add v18.4s, v18.4s, v25.4s + orr v0.16b, v0.16b, v30.16b + ushr v31.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + eor v20.16b, v19.16b, v20.16b + add v19.4s, v19.4s, v8.4s + add v18.4s, v18.4s, v0.4s + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v19.4s, v19.4s, v16.4s + eor v17.16b, v18.16b, v17.16b + add v6.4s, v4.4s, v6.4s + rev32 v17.8h, v17.8h + orr v20.16b, v20.16b, v31.16b + eor v24.16b, v19.16b, v24.16b + eor v28.16b, v6.16b, v28.16b + add v2.4s, v20.4s, v2.4s + rev32 v24.8h, v24.8h + ushr v31.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v2.16b, v1.16b + add v2.4s, v2.4s, v17.4s + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v15.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + add v7.4s, v7.4s, v26.4s + ldur q26, [x29, #-240] + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s orr v1.16b, v1.16b, v31.16b - add v18.4s, v18.4s, v8.4s - eor v26.16b, v20.16b, v26.16b - add v19.4s, v19.4s, v25.4s - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v22.4s - ldur q22, [x29, #-256] - add v18.4s, v18.4s, v1.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - eor 
v7.16b, v18.16b, v7.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - eor v6.16b, v17.16b, v6.16b - rev32 v7.8h, v7.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - rev32 v6.8h, v6.8h - add v16.4s, v16.4s, v7.4s + add v3.4s, v3.4s, v29.4s + add v7.4s, v7.4s, v28.4s ushr v31.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - add v4.4s, v4.4s, v6.4s - eor v1.16b, v16.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v13.4s + add v3.4s, v3.4s, v1.4s + eor v20.16b, v7.16b, v20.16b orr v0.16b, v0.16b, v31.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - add v20.4s, v20.4s, v9.4s - mov v13.16b, v12.16b - mov v12.16b, v27.16b - mov v27.16b, v9.16b - ldur q9, [x29, #-192] - mov v21.16b, v15.16b - ldr q15, [sp, #224] - ushr v11.4s, v1.4s, #12 - ldur q21, [x29, #-224] + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + rev32 v20.8h, v20.8h + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v26.4s + eor v4.16b, v3.16b, v4.16b + orr v16.16b, v16.16b, v31.16b + rev32 v4.8h, v4.8h + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v27.4s, v27.4s, v20.4s + eor v24.16b, v19.16b, v24.16b + orr v17.16b, v17.16b, v31.16b + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v27.16b, v28.16b + add v5.4s, v5.4s, v4.4s + add v2.4s, v17.4s, v2.4s + mov v10.16b, v21.16b + mov v21.16b, v14.16b + mov v14.16b, v11.16b + ldr q11, [sp, #176] + ushr v30.4s, v28.4s, #12 + str q21, [sp, #192] + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v31.16b + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v12.4s + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v31.16b - add v19.4s, v19.4s, v9.4s - add v20.4s, v20.4s, v0.4s - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v18.4s, v18.4s, 
v21.4s - add v19.4s, v19.4s, v25.4s - eor v26.16b, v20.16b, v26.16b - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v15.4s - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - ushr v31.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v17.4s, v17.4s, v2.4s - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, v18.16b, v7.16b - orr v26.16b, v26.16b, v31.16b - eor v6.16b, v17.16b, v6.16b - orr v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - add v23.4s, v26.4s, v23.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - eor v0.16b, v23.16b, v0.16b - orr v6.16b, v6.16b, v11.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v4.4s, v6.4s, v4.4s - ushr v11.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b + add v7.4s, v7.4s, v28.4s + add v3.4s, v3.4s, v11.4s + eor v16.16b, v6.16b, v16.16b + orr v1.16b, v1.16b, v30.16b orr v0.16b, v0.16b, v31.16b - add v18.4s, v18.4s, v14.4s - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + eor v20.16b, v7.16b, v20.16b + add v3.4s, v3.4s, v1.4s + add v7.4s, v7.4s, v9.4s + orr v16.16b, v16.16b, v31.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + eor v4.16b, v3.16b, v4.16b + add v3.4s, v3.4s, v14.4s + add v7.4s, v7.4s, v16.4s + orr v20.16b, v20.16b, v30.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v3.4s, v3.4s, v0.4s + eor v17.16b, v17.16b, v7.16b + add v27.4s, v20.4s, v27.4s + rev32 v17.8h, v17.8h + orr v4.16b, v4.16b, v30.16b + eor v24.16b, v3.16b, v24.16b + eor v28.16b, v27.16b, v28.16b + add v5.4s, v4.4s, v5.4s + rev32 v24.8h, v24.8h + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v5.16b, v1.16b + add v5.4s, v5.4s, v17.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v24.4s 
- add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v10.4s - eor v6.16b, v6.16b, v18.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v30.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, v5.16b - add v3.4s, v3.4s, v6.4s - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - eor v0.16b, v3.16b, v0.16b - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - ushr v11.4s, v0.4s, #12 + add v18.4s, v18.4s, v21.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + mov v8.16b, v13.16b + orr v1.16b, v1.16b, v30.16b + mov v13.16b, v9.16b + mov v9.16b, v21.16b + ldur q21, [x29, #-208] + add v19.4s, v19.4s, v23.4s + add v18.4s, v18.4s, v28.4s + ushr v30.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v0.16b, v27.16b, v0.16b + add v7.4s, v7.4s, v15.4s + add v19.4s, v19.4s, v1.4s + eor v4.16b, v4.16b, v18.16b + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v16.4s, v16.4s, v26.4s - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v18.4s, v18.4s, v8.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + rev32 v4.8h, v4.8h + add v7.4s, v7.4s, v16.4s + add v3.4s, v3.4s, v21.4s + ldur q21, [x29, #-208] + eor v20.16b, v19.16b, v20.16b + orr v0.16b, v0.16b, v30.16b + rev32 v20.8h, v20.8h + eor v17.16b, v7.16b, v17.16b + add v3.4s, v3.4s, v0.4s + ushr v30.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v6.4s, v6.4s, v4.4s + eor v24.16b, v3.16b, v24.16b + orr v17.16b, v17.16b, v30.16b + ushr v30.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v6.16b, v28.16b + add v2.4s, v2.4s, v20.4s + add v5.4s, v17.4s, v5.4s + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, 
v30.16b + eor v1.16b, v2.16b, v1.16b + add v18.4s, v18.4s, v25.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v24.4s, v27.4s + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v20.4s, v20.4s, v12.4s - add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + ushr v30.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + add v18.4s, v18.4s, v28.4s + add v19.4s, v19.4s, v11.4s + eor v0.16b, v27.16b, v0.16b orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v13.4s - ldr q13, [sp, #160] - eor v6.16b, v18.16b, v6.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v15.4s - eor v7.16b, v20.16b, v7.16b - add v17.4s, v17.4s, v1.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v19.4s, v19.4s, v2.4s - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - eor v5.16b, v17.16b, v5.16b - orr v6.16b, v6.16b, v11.16b - eor v26.16b, v19.16b, v26.16b - orr v7.16b, v7.16b, v31.16b - ushr v31.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - add v3.4s, v6.4s, v3.4s - ushr v11.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v4.4s, v7.4s, v4.4s - orr v5.16b, v5.16b, v31.16b - eor v0.16b, v3.16b, v0.16b - orr v26.16b, v26.16b, v11.16b - eor v25.16b, v4.16b, v25.16b - add v23.4s, v5.4s, v23.4s - ushr v11.4s, v0.4s, #7 + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v16.4s, v26.4s, v16.4s - ushr v31.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v20.4s, v20.4s, v22.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + eor v4.16b, v18.16b, v4.16b + add v19.4s, v19.4s, v1.4s + add v18.4s, v18.4s, v29.4s + orr v0.16b, v0.16b, v30.16b + ushr v31.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + eor v20.16b, v19.16b, v20.16b + add v19.4s, v19.4s, v26.4s + ldr q26, [sp, #224] + add v18.4s, v18.4s, v0.4s + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v19.4s, v19.4s, v16.4s + eor v17.16b, 
v18.16b, v17.16b + add v6.4s, v4.4s, v6.4s + rev32 v17.8h, v17.8h + orr v20.16b, v20.16b, v31.16b + eor v24.16b, v19.16b, v24.16b + eor v28.16b, v6.16b, v28.16b + add v2.4s, v20.4s, v2.4s + rev32 v24.8h, v24.8h + ushr v31.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v2.16b, v1.16b + add v2.4s, v2.4s, v17.4s + orr v28.16b, v28.16b, v31.16b ushr v31.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v9.4s - mov v29.16b, v14.16b - ldr q14, [sp, #128] - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + add v7.4s, v7.4s, v10.4s + ldr q10, [sp, #128] + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s orr v1.16b, v1.16b, v31.16b - add v18.4s, v18.4s, v14.4s - eor v26.16b, v20.16b, v26.16b - add v19.4s, v19.4s, v25.4s - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v27.4s - add v18.4s, v18.4s, v1.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - eor v7.16b, v18.16b, v7.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - eor v6.16b, v17.16b, v6.16b - rev32 v7.8h, v7.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - rev32 v6.8h, v6.8h - add v16.4s, v16.4s, v7.4s + add v3.4s, v3.4s, v12.4s + ldr q12, [sp, #272] + add v7.4s, v7.4s, v28.4s ushr v31.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - add v4.4s, v4.4s, v6.4s - eor v1.16b, v16.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v26.4s + add v3.4s, v3.4s, v1.4s + eor v20.16b, v7.16b, v20.16b orr v0.16b, v0.16b, v31.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - add v20.4s, v20.4s, v21.4s - ushr v11.4s, v1.4s, #12 + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + rev32 v20.8h, v20.8h + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v22.4s + eor v4.16b, v3.16b, v4.16b + orr v16.16b, v16.16b, v31.16b + rev32 v4.8h, v4.8h + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + add v27.4s, v27.4s, v20.4s + eor 
v24.16b, v19.16b, v24.16b + orr v17.16b, v17.16b, v31.16b + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v28.16b, v27.16b, v28.16b + add v5.4s, v5.4s, v4.4s + add v2.4s, v17.4s, v2.4s + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + orr v24.16b, v24.16b, v31.16b + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v8.4s + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v31.16b - add v19.4s, v19.4s, v28.4s - add v20.4s, v20.4s, v0.4s - mov v12.16b, v27.16b - ldur q27, [x29, #-208] - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v18.4s, v18.4s, v27.4s - add v19.4s, v19.4s, v25.4s - eor v26.16b, v20.16b, v26.16b - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v13.4s - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - ushr v31.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v17.4s, v17.4s, v2.4s - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, v18.16b, v7.16b - orr v26.16b, v26.16b, v31.16b - eor v6.16b, v17.16b, v6.16b - orr v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - add v23.4s, v26.4s, v23.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - eor v0.16b, v23.16b, v0.16b - orr v6.16b, v6.16b, v11.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v4.4s, v6.4s, v4.4s - ushr v11.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b + add v7.4s, v7.4s, v28.4s + add v3.4s, v3.4s, v12.4s + eor v16.16b, v6.16b, v16.16b + orr v1.16b, v1.16b, v30.16b orr v0.16b, v0.16b, v31.16b - add v18.4s, v18.4s, v8.4s - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + eor v20.16b, v7.16b, v20.16b + add v3.4s, v3.4s, v1.4s + 
add v7.4s, v7.4s, v9.4s + orr v16.16b, v16.16b, v31.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + eor v4.16b, v3.16b, v4.16b + add v3.4s, v3.4s, v11.4s + add v7.4s, v7.4s, v16.4s + orr v20.16b, v20.16b, v30.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v3.4s, v3.4s, v0.4s + eor v17.16b, v17.16b, v7.16b + add v27.4s, v20.4s, v27.4s + rev32 v17.8h, v17.8h + orr v4.16b, v4.16b, v30.16b + eor v24.16b, v3.16b, v24.16b + eor v28.16b, v27.16b, v28.16b + add v5.4s, v4.4s, v5.4s + rev32 v24.8h, v24.8h + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v1.16b, v5.16b, v1.16b + add v18.4s, v18.4s, v25.4s + add v5.4s, v5.4s, v17.4s + orr v28.16b, v28.16b, v30.16b + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v29.4s - add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v15.4s - eor v6.16b, v6.16b, v18.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v10.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, v5.16b - add v3.4s, v3.4s, v6.4s - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - eor v0.16b, v3.16b, v0.16b - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - ushr v11.4s, v0.4s, #12 + add v18.4s, v18.4s, v28.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + orr v1.16b, v1.16b, v30.16b + add v19.4s, v19.4s, v14.4s + ushr v30.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v4.16b, v4.16b, v18.16b + eor v0.16b, v27.16b, v0.16b + add v7.4s, v7.4s, v10.4s + add v19.4s, v19.4s, v1.4s + rev32 v4.8h, v4.8h + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v16.4s, v16.4s, v26.4s - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v18.4s, 
v18.4s, v14.4s - mov v30.16b, v29.16b - mov v29.16b, v15.16b - ldr q15, [sp, #144] - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + add v7.4s, v7.4s, v16.4s + add v3.4s, v3.4s, v13.4s + ldur q13, [x29, #-224] + eor v20.16b, v19.16b, v20.16b + orr v0.16b, v0.16b, v30.16b + rev32 v20.8h, v20.8h + eor v17.16b, v7.16b, v17.16b + add v3.4s, v3.4s, v0.4s + add v6.4s, v6.4s, v4.4s + ushr v30.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + eor v24.16b, v3.16b, v24.16b + eor v28.16b, v6.16b, v28.16b + orr v17.16b, v17.16b, v30.16b + ushr v30.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v2.4s, v2.4s, v20.4s + add v5.4s, v17.4s, v5.4s + add v18.4s, v18.4s, v29.4s + orr v24.16b, v24.16b, v30.16b + orr v28.16b, v28.16b, v31.16b + eor v1.16b, v2.16b, v1.16b + eor v16.16b, v5.16b, v16.16b + add v27.4s, v24.4s, v27.4s ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v20.4s, v20.4s, v15.4s - add v18.4s, v18.4s, v0.4s - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + ushr v30.4s, v16.4s, #7 + add v18.4s, v18.4s, v28.4s + shl v16.4s, v16.4s, #25 + add v19.4s, v19.4s, v12.4s + eor v0.16b, v27.16b, v0.16b orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v24.4s - eor v6.16b, v18.16b, v6.16b - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v13.4s - eor v7.16b, v20.16b, v7.16b - add v17.4s, v17.4s, v1.4s - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v19.4s, v19.4s, v2.4s - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - eor v5.16b, v17.16b, v5.16b - orr v6.16b, v6.16b, v11.16b - eor v26.16b, v19.16b, v26.16b - orr v7.16b, v7.16b, v31.16b - ushr v31.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - add v3.4s, v6.4s, v3.4s - ushr v11.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v4.4s, v7.4s, v4.4s - orr v5.16b, v5.16b, v31.16b - eor v0.16b, v3.16b, v0.16b - orr v26.16b, v26.16b, v11.16b - eor v25.16b, v4.16b, v25.16b - add v23.4s, v5.4s, v23.4s - ushr v11.4s, v0.4s, #7 + eor v4.16b, 
v18.16b, v4.16b + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - mov v9.16b, v28.16b - mov v28.16b, v10.16b - ldr q10, [sp, #176] - add v16.4s, v26.4s, v16.4s - ushr v31.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v23.16b, v1.16b - orr v0.16b, v0.16b, v11.16b - add v20.4s, v20.4s, v10.4s - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v19.4s, v19.4s, v1.4s + add v18.4s, v18.4s, v13.4s + orr v0.16b, v0.16b, v30.16b + orr v4.16b, v4.16b, v31.16b + eor v20.16b, v19.16b, v20.16b + add v19.4s, v19.4s, v22.4s + add v18.4s, v18.4s, v0.4s + add v6.4s, v4.4s, v6.4s + ushr v31.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v19.4s, v19.4s, v16.4s + eor v17.16b, v18.16b, v17.16b + eor v28.16b, v6.16b, v28.16b + rev32 v17.8h, v17.8h + mov v23.16b, v15.16b + orr v20.16b, v20.16b, v31.16b + ushr v31.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + eor v24.16b, v19.16b, v24.16b + add v2.4s, v20.4s, v2.4s + add v7.4s, v7.4s, v23.4s + rev32 v24.8h, v24.8h + orr v28.16b, v28.16b, v31.16b + eor v1.16b, v2.16b, v1.16b + add v7.4s, v7.4s, v28.4s + add v2.4s, v2.4s, v17.4s + mov v15.16b, v8.16b ushr v31.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v9.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + mov v23.16b, v15.16b + add v3.4s, v3.4s, v15.4s + eor v20.16b, v7.16b, v20.16b + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s + ldp q15, q9, [x29, #-256] orr v1.16b, v1.16b, v31.16b - add v18.4s, v18.4s, v12.4s - eor v26.16b, v20.16b, v26.16b - add v19.4s, v19.4s, v25.4s - orr v2.16b, v2.16b, v11.16b - add v17.4s, v17.4s, v21.4s - add v18.4s, v18.4s, v1.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - eor v7.16b, v18.16b, v7.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - eor v6.16b, v17.16b, v6.16b - rev32 v7.8h, v7.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - rev32 v6.8h, 
v6.8h - add v16.4s, v16.4s, v7.4s ushr v31.4s, v0.4s, #12 + rev32 v20.8h, v20.8h shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - add v4.4s, v4.4s, v6.4s - eor v1.16b, v16.16b, v1.16b + add v3.4s, v3.4s, v1.4s + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v21.4s orr v0.16b, v0.16b, v31.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - ushr v11.4s, v1.4s, #12 + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v4.16b, v3.16b, v4.16b + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v15.4s + rev32 v4.8h, v4.8h + add v27.4s, v27.4s, v20.4s + orr v16.16b, v16.16b, v31.16b + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + eor v28.16b, v27.16b, v28.16b + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + eor v24.16b, v19.16b, v24.16b + add v5.4s, v5.4s, v4.4s + add v7.4s, v7.4s, v26.4s + orr v17.16b, v17.16b, v31.16b + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + orr v28.16b, v28.16b, v30.16b + add v2.4s, v17.4s, v2.4s + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v28.4s + orr v24.16b, v24.16b, v31.16b + ushr v30.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - eor v2.16b, v4.16b, v2.16b - add v20.4s, v20.4s, v27.4s - orr v25.16b, v25.16b, v31.16b - add v19.4s, v19.4s, v22.4s - mov v9.16b, v22.16b - ldur q22, [x29, #-240] - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v20.4s, v20.4s, v0.4s - add v18.4s, v18.4s, v22.4s - add v19.4s, v19.4s, v25.4s - mov v24.16b, v21.16b - ldur q21, [x29, #-192] - orr v2.16b, v2.16b, v11.16b - eor v26.16b, v20.16b, v26.16b - add v17.4s, v17.4s, v21.4s - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - ushr v31.4s, v26.4s, #8 - add v17.4s, v17.4s, v2.4s - shl v26.4s, v26.4s, #24 - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, v18.16b, v7.16b - orr v26.16b, v26.16b, v31.16b - eor v6.16b, v17.16b, v6.16b - orr v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - 
ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v23.4s, v26.4s, v23.4s - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - orr v6.16b, v6.16b, v11.16b - eor v0.16b, v23.16b, v0.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b - add v4.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + eor v20.16b, v7.16b, v20.16b + add v3.4s, v3.4s, v9.4s ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v11.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b + orr v1.16b, v1.16b, v30.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + eor v16.16b, v6.16b, v16.16b + mov v8.16b, v25.16b + add v3.4s, v3.4s, v1.4s orr v0.16b, v0.16b, v31.16b - eor v2.16b, v4.16b, v2.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + orr v20.16b, v20.16b, v30.16b + add v7.4s, v7.4s, v8.4s + ldr q8, [sp, #192] + eor v4.16b, v3.16b, v4.16b + add v27.4s, v20.4s, v27.4s + orr v16.16b, v16.16b, v31.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + add v3.4s, v3.4s, v12.4s + add v7.4s, v7.4s, v16.4s + eor v28.16b, v27.16b, v28.16b + orr v4.16b, v4.16b, v30.16b + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + add v3.4s, v3.4s, v0.4s + eor v17.16b, v17.16b, v7.16b + add v5.4s, v4.4s, v5.4s + add v18.4s, v18.4s, v29.4s + rev32 v17.8h, v17.8h + orr v28.16b, v28.16b, v30.16b + eor v24.16b, v3.16b, v24.16b + eor v1.16b, v5.16b, v1.16b + add v18.4s, v18.4s, v28.4s + rev32 v24.8h, v24.8h + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v8.4s - add v18.4s, v18.4s, v14.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v13.4s - add v18.4s, v18.4s, v0.4s - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v29.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - eor v6.16b, v6.16b, v18.16b - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, 
v5.16b - rev32 v6.8h, v6.8h - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - add v3.4s, v3.4s, v6.4s - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - eor v0.16b, v3.16b, v0.16b - add v16.4s, v16.4s, v26.4s - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - ushr v11.4s, v0.4s, #12 + add v19.4s, v19.4s, v11.4s + eor v4.16b, v4.16b, v18.16b + add v5.4s, v5.4s, v17.4s + orr v1.16b, v1.16b, v30.16b + rev32 v4.8h, v4.8h + add v19.4s, v19.4s, v1.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + ushr v30.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + eor v20.16b, v19.16b, v20.16b + eor v0.16b, v27.16b, v0.16b + add v7.4s, v7.4s, v14.4s + rev32 v20.8h, v20.8h + add v6.4s, v6.4s, v4.4s + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v1.16b, v23.16b, v1.16b - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b - orr v0.16b, v0.16b, v11.16b + add v7.4s, v7.4s, v16.4s + add v3.4s, v3.4s, v8.4s + eor v28.16b, v6.16b, v28.16b + orr v0.16b, v0.16b, v30.16b + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + eor v17.16b, v7.16b, v17.16b + add v3.4s, v3.4s, v0.4s + add v2.4s, v2.4s, v20.4s + add v18.4s, v18.4s, v13.4s + ushr v30.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + orr v28.16b, v28.16b, v31.16b + eor v24.16b, v3.16b, v24.16b + eor v1.16b, v2.16b, v1.16b + add v18.4s, v18.4s, v28.4s + orr v17.16b, v17.16b, v30.16b + ushr v30.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v20.4s, v20.4s, v28.4s - add v18.4s, v18.4s, v12.4s - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + add v5.4s, v17.4s, v5.4s + eor v4.16b, v18.16b, v4.16b + add v19.4s, v19.4s, v9.4s + orr v24.16b, v24.16b, v30.16b orr v1.16b, v1.16b, v31.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v30.4s - add v18.4s, v18.4s, v0.4s - orr v2.16b, v2.16b, v11.16b - add v19.4s, v19.4s, v21.4s - eor v7.16b, v20.16b, v7.16b - add v17.4s, 
v17.4s, v1.4s - eor v6.16b, v18.16b, v6.16b - add v19.4s, v19.4s, v2.4s - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - eor v5.16b, v17.16b, v5.16b - orr v7.16b, v7.16b, v31.16b - eor v26.16b, v19.16b, v26.16b - orr v6.16b, v6.16b, v11.16b - ushr v31.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - ushr v11.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - add v4.4s, v7.4s, v4.4s - orr v5.16b, v5.16b, v31.16b - add v3.4s, v6.4s, v3.4s - orr v26.16b, v26.16b, v11.16b - eor v25.16b, v4.16b, v25.16b - add v23.4s, v5.4s, v23.4s - eor v0.16b, v3.16b, v0.16b - add v16.4s, v26.4s, v16.4s - ushr v31.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - ushr v11.4s, v0.4s, #7 + ushr v31.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + eor v16.16b, v5.16b, v16.16b + add v27.4s, v24.4s, v27.4s + add v19.4s, v19.4s, v1.4s + ushr v30.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + orr v4.16b, v4.16b, v31.16b + eor v0.16b, v27.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + add v6.4s, v4.4s, v6.4s + orr v16.16b, v16.16b, v30.16b + ushr v30.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v1.16b, v23.16b, v1.16b - orr v25.16b, v25.16b, v31.16b - eor v2.16b, v16.16b, v2.16b - orr v0.16b, v0.16b, v11.16b + ushr v31.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v18.4s, v18.4s, v23.4s + eor v28.16b, v6.16b, v28.16b + orr v0.16b, v0.16b, v30.16b + orr v20.16b, v20.16b, v31.16b + ushr v31.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + add v19.4s, v19.4s, v15.4s + add v18.4s, v18.4s, v0.4s + add v2.4s, v20.4s, v2.4s + add v7.4s, v7.4s, v10.4s + orr v28.16b, v28.16b, v31.16b + add v19.4s, v19.4s, v16.4s + eor v17.16b, v18.16b, v17.16b + eor v1.16b, v2.16b, v1.16b + add v7.4s, v7.4s, v28.4s + rev32 v17.8h, v17.8h + mov v25.16b, v26.16b ushr v31.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v20.4s, v20.4s, v15.4s - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + eor v24.16b, v19.16b, v24.16b + add v3.4s, v3.4s, v25.4s + eor v20.16b, v7.16b, v20.16b + rev32 v24.8h, v24.8h orr v1.16b, v1.16b, 
v31.16b - add v18.4s, v18.4s, v24.4s - add v20.4s, v20.4s, v0.4s - add v19.4s, v19.4s, v9.4s - mov v8.16b, v13.16b - ldur q13, [x29, #-208] - orr v2.16b, v2.16b, v11.16b - add v18.4s, v18.4s, v1.4s - add v17.4s, v17.4s, v13.4s - eor v26.16b, v20.16b, v26.16b - add v19.4s, v19.4s, v25.4s - eor v7.16b, v18.16b, v7.16b - add v17.4s, v17.4s, v2.4s - rev32 v26.8h, v26.8h - eor v5.16b, v19.16b, v5.16b - rev32 v7.8h, v7.8h - eor v6.16b, v17.16b, v6.16b - add v23.4s, v23.4s, v26.4s - rev32 v5.8h, v5.8h - add v16.4s, v16.4s, v7.4s - rev32 v6.8h, v6.8h - eor v0.16b, v23.16b, v0.16b - add v3.4s, v3.4s, v5.4s - eor v1.16b, v16.16b, v1.16b - add v4.4s, v4.4s, v6.4s + rev32 v20.8h, v20.8h + add v3.4s, v3.4s, v1.4s + add v2.4s, v2.4s, v17.4s + mov v26.16b, v9.16b + ldr q9, [sp, #240] + eor v4.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + add v6.4s, v6.4s, v24.4s + mov v22.16b, v23.16b + ldr q23, [sp, #208] + rev32 v4.8h, v4.8h + add v27.4s, v27.4s, v20.4s ushr v31.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v25.16b, v3.16b, v25.16b - ushr v11.4s, v1.4s, #12 - shl v1.4s, v1.4s, #20 + eor v16.16b, v6.16b, v16.16b + add v18.4s, v18.4s, v9.4s + eor v28.16b, v27.16b, v28.16b orr v0.16b, v0.16b, v31.16b - eor v2.16b, v4.16b, v2.16b - ushr v31.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - orr v1.16b, v1.16b, v11.16b - ushr v11.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v20.4s, v20.4s, v22.4s - orr v25.16b, v25.16b, v31.16b - add v19.4s, v19.4s, v10.4s - mov v27.16b, v12.16b - mov v12.16b, v30.16b - mov v29.16b, v21.16b - mov v21.16b, v24.16b - ldr q24, [sp, #192] - mov v30.16b, v22.16b - ldr q22, [sp, #256] - orr v2.16b, v2.16b, v11.16b - add v20.4s, v20.4s, v0.4s - add v18.4s, v18.4s, v24.4s - add v19.4s, v19.4s, v25.4s - add v17.4s, v17.4s, v22.4s - eor v26.16b, v20.16b, v26.16b - add v18.4s, v18.4s, v1.4s - eor v5.16b, v19.16b, v5.16b - add v17.4s, v17.4s, v2.4s - ushr v31.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - ushr v11.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - eor v7.16b, 
v18.16b, v7.16b - eor v6.16b, v17.16b, v6.16b - orr v26.16b, v26.16b, v31.16b - orr v5.16b, v5.16b, v11.16b - ushr v31.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - ushr v11.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - add v23.4s, v26.4s, v23.4s - orr v7.16b, v7.16b, v31.16b - add v3.4s, v5.4s, v3.4s - orr v6.16b, v6.16b, v11.16b - eor v0.16b, v23.16b, v0.16b - add v16.4s, v7.4s, v16.4s - eor v25.16b, v3.16b, v25.16b - add v4.4s, v6.4s, v4.4s + ushr v31.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v23.4s + add v5.4s, v5.4s, v4.4s + orr v16.16b, v16.16b, v31.16b + add v7.4s, v7.4s, v21.4s + mov v14.16b, v10.16b + mov v10.16b, v25.16b + orr v28.16b, v28.16b, v30.16b + mov v25.16b, v21.16b + ldur q21, [x29, #-192] + eor v17.16b, v18.16b, v17.16b + add v19.4s, v19.4s, v16.4s + eor v1.16b, v5.16b, v1.16b + add v7.4s, v7.4s, v28.4s + ushr v31.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + ushr v30.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v24.16b, v19.16b, v24.16b + eor v20.16b, v7.16b, v20.16b + orr v17.16b, v17.16b, v31.16b + add v3.4s, v3.4s, v21.4s + ushr v31.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + orr v1.16b, v1.16b, v30.16b + ushr v30.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + add v2.4s, v17.4s, v2.4s + add v3.4s, v3.4s, v1.4s + orr v24.16b, v24.16b, v31.16b + orr v20.16b, v20.16b, v30.16b + eor v0.16b, v2.16b, v0.16b + add v6.4s, v24.4s, v6.4s + eor v4.16b, v3.16b, v4.16b + add v27.4s, v20.4s, v27.4s ushr v31.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v11.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - eor v1.16b, v16.16b, v1.16b - eor v2.16b, v4.16b, v2.16b + ushr v30.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + eor v16.16b, v6.16b, v16.16b + eor v28.16b, v27.16b, v28.16b orr v0.16b, v0.16b, v31.16b - orr v25.16b, v25.16b, v11.16b - ushr v31.4s, v1.4s, #7 + ushr v31.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + orr v4.16b, v4.16b, v30.16b + ushr v30.4s, v28.4s, #7 + shl v28.4s, v28.4s, 
#25 + add v7.4s, v7.4s, v29.4s + add v5.4s, v4.4s, v5.4s + orr v16.16b, v16.16b, v31.16b + add v18.4s, v18.4s, v13.4s + orr v28.16b, v28.16b, v30.16b + add v7.4s, v7.4s, v16.4s + eor v1.16b, v5.16b, v1.16b + add v3.4s, v3.4s, v26.4s + ldr q26, [sp, #256] + add v18.4s, v18.4s, v28.4s + ushr v30.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v11.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v20.4s, v20.4s, v14.4s - add v18.4s, v18.4s, v27.4s - ldr q27, [sp, #224] - orr v1.16b, v1.16b, v31.16b - orr v2.16b, v2.16b, v11.16b - add v20.4s, v20.4s, v25.4s - add v17.4s, v17.4s, v29.4s - add v18.4s, v18.4s, v0.4s - add v19.4s, v19.4s, v8.4s - eor v7.16b, v7.16b, v20.16b - add v17.4s, v17.4s, v1.4s - eor v6.16b, v6.16b, v18.16b - add v19.4s, v19.4s, v2.4s - rev32 v7.8h, v7.8h - eor v5.16b, v17.16b, v5.16b - rev32 v6.8h, v6.8h - eor v26.16b, v19.16b, v26.16b - add v4.4s, v4.4s, v7.4s - rev32 v5.8h, v5.8h - add v3.4s, v3.4s, v6.4s - rev32 v26.8h, v26.8h - eor v25.16b, v4.16b, v25.16b - add v23.4s, v23.4s, v5.4s - eor v0.16b, v3.16b, v0.16b - add v16.4s, v16.4s, v26.4s - ushr v29.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - ushr v31.4s, v0.4s, #12 + eor v17.16b, v17.16b, v7.16b + add v19.4s, v19.4s, v12.4s + add v3.4s, v3.4s, v0.4s + eor v4.16b, v4.16b, v18.16b + rev32 v17.8h, v17.8h + orr v1.16b, v1.16b, v30.16b + rev32 v4.8h, v4.8h + add v19.4s, v19.4s, v1.4s + eor v24.16b, v3.16b, v24.16b + rev32 v24.8h, v24.8h + eor v20.16b, v19.16b, v20.16b + add v5.4s, v5.4s, v17.4s + rev32 v20.8h, v20.8h + add v6.4s, v6.4s, v4.4s + eor v16.16b, v5.16b, v16.16b + add v27.4s, v27.4s, v24.4s + eor v28.16b, v6.16b, v28.16b + ushr v29.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + ushr v30.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v7.4s, v7.4s, v11.4s + add v18.4s, v18.4s, v22.4s + ldr q22, [sp, #144] + add v2.4s, v2.4s, v20.4s + orr v16.16b, v16.16b, v29.16b + eor v0.16b, v27.16b, v0.16b + orr v28.16b, v28.16b, v30.16b + ushr v29.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v1.16b, v23.16b, v1.16b 
- eor v2.16b, v16.16b, v2.16b - orr v25.16b, v25.16b, v29.16b - orr v0.16b, v0.16b, v31.16b - ushr v29.4s, v1.4s, #12 + add v7.4s, v7.4s, v16.4s + eor v1.16b, v2.16b, v1.16b + add v3.4s, v3.4s, v22.4s + add v18.4s, v18.4s, v28.4s + ushr v30.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - ushr v31.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v18.4s, v18.4s, v21.4s - ldr q21, [sp, #240] - add v20.4s, v20.4s, v27.4s - prfm pldl1keep, [x17, #256] - orr v1.16b, v1.16b, v29.16b - prfm pldl1keep, [x21, #256] - orr v2.16b, v2.16b, v31.16b - prfm pldl1keep, [x16, #256] - add v18.4s, v18.4s, v0.4s - prfm pldl1keep, [x6, #256] - add v17.4s, v17.4s, v21.4s - add v19.4s, v19.4s, v22.4s - add v20.4s, v20.4s, v25.4s - eor v6.16b, v18.16b, v6.16b - add v17.4s, v17.4s, v1.4s - add v19.4s, v19.4s, v2.4s - eor v7.16b, v20.16b, v7.16b - ushr v22.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - eor v5.16b, v17.16b, v5.16b - eor v26.16b, v19.16b, v26.16b - ushr v21.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - orr v6.16b, v6.16b, v22.16b - ushr v22.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - ushr v29.4s, v26.4s, #8 - shl v26.4s, v26.4s, #24 - orr v7.16b, v7.16b, v21.16b - orr v5.16b, v5.16b, v22.16b - add v3.4s, v6.4s, v3.4s - orr v21.16b, v26.16b, v29.16b - add v4.4s, v7.4s, v4.4s - add v22.4s, v5.4s, v23.4s - eor v0.16b, v3.16b, v0.16b - add v16.4s, v21.4s, v16.4s - eor v23.16b, v4.16b, v25.16b - eor v1.16b, v22.16b, v1.16b - ushr v25.4s, v0.4s, #7 + orr v0.16b, v0.16b, v29.16b + ldr q29, [sp, #80] + eor v17.16b, v7.16b, v17.16b + add v19.4s, v19.4s, v21.4s + add v3.4s, v3.4s, v0.4s + eor v4.16b, v18.16b, v4.16b + ushr v21.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + ushr v22.4s, v4.4s, #8 + orr v1.16b, v1.16b, v30.16b + shl v4.4s, v4.4s, #24 + add v19.4s, v19.4s, v1.4s + orr v17.16b, v17.16b, v21.16b + eor v21.16b, v3.16b, v24.16b + orr v4.16b, v4.16b, v22.16b + ushr v22.4s, v21.4s, #8 + shl v21.4s, v21.4s, #24 + eor v20.16b, v19.16b, v20.16b + add v5.4s, v17.4s, v5.4s + ushr v24.4s, v20.4s, #8 + shl v20.4s, 
v20.4s, #24 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v4.4s, v6.4s + add v22.4s, v21.4s, v27.4s + eor v16.16b, v5.16b, v16.16b + orr v20.16b, v20.16b, v24.16b + ushr v27.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + add v2.4s, v20.4s, v2.4s + eor v24.16b, v6.16b, v28.16b + eor v0.16b, v22.16b, v0.16b + ushr v28.4s, v24.4s, #7 + shl v24.4s, v24.4s, #25 + orr v16.16b, v16.16b, v27.16b + ushr v27.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v2.16b, v16.16b, v2.16b - ushr v26.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - orr v0.16b, v0.16b, v25.16b - ushr v25.4s, v1.4s, #7 + eor v1.16b, v2.16b, v1.16b + add v7.4s, v7.4s, v26.4s + add v18.4s, v18.4s, v10.4s + orr v24.16b, v24.16b, v28.16b + ushr v28.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v29.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v20.4s, v20.4s, v28.4s - orr v23.16b, v23.16b, v26.16b - orr v1.16b, v1.16b, v25.16b - orr v2.16b, v2.16b, v29.16b - add v20.4s, v20.4s, v0.4s - add v18.4s, v18.4s, v13.4s - add v17.4s, v17.4s, v30.4s - add v19.4s, v19.4s, v10.4s - eor v21.16b, v20.16b, v21.16b - add v18.4s, v18.4s, v1.4s - add v17.4s, v17.4s, v2.4s + orr v0.16b, v0.16b, v27.16b + add v3.4s, v3.4s, v25.4s add v19.4s, v19.4s, v23.4s + add v7.4s, v7.4s, v24.4s + add v18.4s, v18.4s, v0.4s + orr v1.16b, v1.16b, v28.16b + add v3.4s, v3.4s, v1.4s + add v19.4s, v19.4s, v16.4s + eor v20.16b, v7.16b, v20.16b + eor v17.16b, v18.16b, v17.16b + rev32 v20.8h, v20.8h + rev32 v17.8h, v17.8h + eor v4.16b, v3.16b, v4.16b + eor v21.16b, v19.16b, v21.16b + rev32 v4.8h, v4.8h rev32 v21.8h, v21.8h - eor v7.16b, v18.16b, v7.16b - eor v6.16b, v17.16b, v6.16b - eor v5.16b, v19.16b, v5.16b - add v22.4s, v22.4s, v21.4s - rev32 v7.8h, v7.8h - rev32 v6.8h, v6.8h - rev32 v5.8h, v5.8h - eor v0.16b, v22.16b, v0.16b - add v16.4s, v16.4s, v7.4s - add v4.4s, v4.4s, v6.4s - add v3.4s, v3.4s, v5.4s - ushr v25.4s, v0.4s, #12 + add v22.4s, v22.4s, v20.4s + add v2.4s, v2.4s, v17.4s + add v5.4s, v5.4s, v4.4s + add v6.4s, v6.4s, v21.4s + eor v24.16b, v22.16b, 
v24.16b + eor v0.16b, v2.16b, v0.16b + ushr v26.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + ushr v27.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v1.16b, v16.16b, v1.16b - eor v2.16b, v4.16b, v2.16b - eor v23.16b, v3.16b, v23.16b - orr v0.16b, v0.16b, v25.16b - ushr v25.4s, v1.4s, #12 + eor v1.16b, v5.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + add v7.4s, v7.4s, v9.4s + add v18.4s, v18.4s, v8.4s + orr v24.16b, v24.16b, v26.16b + orr v0.16b, v0.16b, v27.16b + ushr v26.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - ushr v26.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ushr v27.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v1.16b, v1.16b, v25.16b - add v20.4s, v20.4s, v24.4s - orr v2.16b, v2.16b, v26.16b - orr v23.16b, v23.16b, v27.16b - add v18.4s, v18.4s, v12.4s - add v17.4s, v17.4s, v9.4s - add v19.4s, v19.4s, v15.4s - add v20.4s, v20.4s, v0.4s - add v18.4s, v18.4s, v1.4s - add v17.4s, v17.4s, v2.4s - add v19.4s, v19.4s, v23.4s - eor v21.16b, v20.16b, v21.16b - eor v7.16b, v18.16b, v7.16b - eor v6.16b, v17.16b, v6.16b - eor v5.16b, v19.16b, v5.16b - ushr v24.4s, v21.4s, #8 + ushr v27.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v3.4s, v3.4s, v15.4s + add v19.4s, v19.4s, v14.4s + add v7.4s, v7.4s, v24.4s + add v18.4s, v18.4s, v0.4s + orr v1.16b, v1.16b, v26.16b + ldr q26, [sp, #64] + orr v16.16b, v16.16b, v27.16b + prfm pldl1keep, [x1, #256] + add v3.4s, v3.4s, v1.4s + prfm pldl1keep, [x6, #256] + add v19.4s, v19.4s, v16.4s + prfm pldl1keep, [x17, #256] + eor v20.16b, v7.16b, v20.16b + prfm pldl1keep, [x0, #256] + eor v17.16b, v18.16b, v17.16b + ushr v23.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + ushr v25.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + eor v4.16b, v3.16b, v4.16b + eor v21.16b, v19.16b, v21.16b + orr v20.16b, v20.16b, v23.16b + orr v17.16b, v17.16b, v25.16b + ushr v23.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + ushr v25.4s, v21.4s, #8 shl v21.4s, v21.4s, #24 - ushr v25.4s, v7.4s, #8 - shl v7.4s, v7.4s, #24 - ushr v26.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - ushr 
v27.4s, v5.4s, #8 - shl v5.4s, v5.4s, #24 - orr v21.16b, v21.16b, v24.16b - orr v7.16b, v7.16b, v25.16b - orr v6.16b, v6.16b, v26.16b - orr v5.16b, v5.16b, v27.16b - add v22.4s, v21.4s, v22.4s - add v16.4s, v7.4s, v16.4s - add v4.4s, v6.4s, v4.4s - add v3.4s, v5.4s, v3.4s - eor v0.16b, v22.16b, v0.16b - eor v1.16b, v16.16b, v1.16b - eor v2.16b, v4.16b, v2.16b - eor v23.16b, v3.16b, v23.16b - ushr v24.4s, v0.4s, #7 + add v22.4s, v20.4s, v22.4s + add v2.4s, v17.4s, v2.4s + orr v4.16b, v4.16b, v23.16b + orr v21.16b, v21.16b, v25.16b + add v5.4s, v4.4s, v5.4s + add v6.4s, v21.4s, v6.4s + eor v23.16b, v22.16b, v24.16b + eor v0.16b, v2.16b, v0.16b + ushr v24.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ushr v25.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v25.4s, v1.4s, #7 + eor v1.16b, v5.16b, v1.16b + eor v16.16b, v6.16b, v16.16b + orr v23.16b, v23.16b, v24.16b + orr v0.16b, v0.16b, v25.16b + ushr v24.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v26.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ushr v27.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - orr v0.16b, v0.16b, v24.16b - orr v1.16b, v1.16b, v25.16b - orr v2.16b, v2.16b, v26.16b - orr v23.16b, v23.16b, v27.16b - movi v24.4s, #64 - eor v12.16b, v4.16b, v20.16b - eor v31.16b, v18.16b, v3.16b - eor v29.16b, v17.16b, v22.16b - eor v30.16b, v16.16b, v19.16b - eor v28.16b, v7.16b, v23.16b - eor v23.16b, v6.16b, v0.16b - eor v13.16b, v1.16b, v5.16b - eor v25.16b, v2.16b, v21.16b - cbnz x15, .LBB3_5 + ushr v25.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + orr v1.16b, v1.16b, v24.16b + orr v16.16b, v16.16b, v25.16b + eor v7.16b, v5.16b, v7.16b + eor v8.16b, v18.16b, v6.16b + eor v9.16b, v3.16b, v22.16b + eor v25.16b, v2.16b, v19.16b + eor v24.16b, v17.16b, v16.16b + eor v13.16b, v4.16b, v23.16b + eor v14.16b, v0.16b, v21.16b + eor v11.16b, v1.16b, v20.16b + cbnz x16, .LBB3_5 b .LBB3_2 .LBB3_6: - cbz x24, .LBB3_14 - orr w8, w7, w19 - and x22, x5, #0x1 - stur w8, [x29, #-192] + cbz x23, .LBB3_17 + orr w9, w7, w19 + orr w22, w8, w19 + and 
x28, x5, #0x1 + stur w9, [x29, #-192] + orr w9, w9, w8 + stur w9, [x29, #-208] + b .LBB3_9 .LBB3_8: + ldp q1, q0, [x29, #-176] + add x20, x20, x28 + add x24, x24, #8 + subs x23, x23, #1 + mov x2, x27 + stp q1, q0, [x26], #32 + b.eq .LBB3_17 +.LBB3_9: ldr x8, [sp, #40] - mov x28, x0 - ldr x25, [x0] - mov x23, x2 - ldur w5, [x29, #-192] + mov x27, x2 + ldr x25, [x24] ldp q0, q1, [x8] - mov x8, x2 - b .LBB3_11 -.LBB3_9: - orr w5, w5, w27 -.LBB3_10: + stp q0, q1, [x29, #-176] + cbz x2, .LBB3_8 + ldur w5, [x29, #-192] + mov x21, x27 + cmp x27, #1 + b.ne .LBB3_12 + ldur w5, [x29, #-208] +.LBB3_12: sub x0, x29, #144 sub x1, x29, #176 mov x2, x25 @@ -2022,26 +2159,34 @@ zfs_blake3_hash_many_sse2: bl compress_pre ldp q0, q1, [x29, #-144] add x25, x25, #64 - mov x8, x21 - mov w5, w19 ldp q2, q3, [x29, #-112] + b .LBB3_15 +.LBB3_13: + mov w5, w22 +.LBB3_14: + sub x0, x29, #144 + sub x1, x29, #176 + mov x2, x25 + mov w3, #64 + mov x4, x20 + bl compress_pre + ldp q0, q1, [x29, #-144] + add x25, x25, #64 + sub x21, x21, #1 + ldp q2, q3, [x29, #-112] +.LBB3_15: eor v0.16b, v2.16b, v0.16b + cmp x21, #2 eor v1.16b, v3.16b, v1.16b -.LBB3_11: - subs x21, x8, #1 stp q0, q1, [x29, #-176] - b.eq .LBB3_9 - cbnz x8, .LBB3_10 - ldp q1, q0, [x29, #-176] - mov x0, x28 - add x20, x20, x22 - add x0, x28, #8 - subs x24, x24, #1 - mov x2, x23 - stp q1, q0, [x26], #32 - b.ne .LBB3_8 -.LBB3_14: - add sp, sp, #464 + b.eq .LBB3_13 + mov w5, w19 + cmp x21, #1 + b.ne .LBB3_14 + b .LBB3_8 +.LBB3_17: + add sp, sp, #480 + .cfi_def_cfa wsp, 160 ldp x20, x19, [sp, #144] ldp x22, x21, [sp, #128] ldp x24, x23, [sp, #112] @@ -2052,7 +2197,27 @@ zfs_blake3_hash_many_sse2: ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] ldp d15, d14, [sp], #160 - hint #29 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w20 + .cfi_restore w21 + .cfi_restore w22 + .cfi_restore w23 + .cfi_restore w24 + .cfi_restore w25 + .cfi_restore w26 + .cfi_restore w27 + .cfi_restore w28 + .cfi_restore w30 + .cfi_restore w29 + 
.cfi_restore b8 + .cfi_restore b9 + .cfi_restore b10 + .cfi_restore b11 + .cfi_restore b12 + .cfi_restore b13 + .cfi_restore b14 + .cfi_restore b15 ret .Lfunc_end3: .size zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2 diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index c4c2dfc5bcde..40417e663565 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -22,7 +22,7 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves - * Copyright (c) 2022-2023 Tino Reichardt + * Copyright (c) 2022-2024 Tino Reichardt * * This is converted assembly: SSE4.1 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde @@ -32,30 +32,17 @@ */ #if defined(__aarch64__) - .text - .section .note.gnu.property,"a",@note - .p2align 3 - .word 4 - .word 16 - .word 5 - .asciz "GNU" - .word 3221225472 - .word 4 - .word 3 - .word 0 -.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse41 .p2align 2 .type zfs_blake3_compress_in_place_sse41,@function zfs_blake3_compress_in_place_sse41: .cfi_startproc - hint #25 - .cfi_negate_ra_state sub sp, sp, #96 + .cfi_def_cfa_offset 96 stp x29, x30, [sp, #64] - add x29, sp, #64 str x19, [sp, #80] + add x29, sp, #64 .cfi_def_cfa w29, 32 .cfi_offset w19, -16 .cfi_offset w30, -24 @@ -72,18 +59,22 @@ zfs_blake3_compress_in_place_sse41: ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b eor v1.16b, v3.16b, v1.16b - ldp x29, x30, [sp, #64] stp q0, q1, [x19] + .cfi_def_cfa wsp, 96 + ldp x29, x30, [sp, #64] ldr x19, [sp, #80] add sp, sp, #96 - hint #29 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w30 + .cfi_restore w29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 + .p2align 4, 0x0 .LCPI1_0: .xword -4942790177982912921 
.xword -6534734903820487822 @@ -126,434 +117,559 @@ zfs_blake3_compress_in_place_sse41: .type compress_pre,@function compress_pre: .cfi_startproc - hint #34 - fmov s1, w3 - movi d0, #0x0000ff000000ff - ldr q2, [x1] + fmov s0, w3 + movi d2, #0x0000ff000000ff + ldr q6, [x1] + fmov d7, x4 adrp x8, .LCPI1_0 - mov v1.s[1], w5 - str q2, [x0] + adrp x9, .LCPI1_1 + mov v0.s[1], w5 + str q6, [x0] ldr q4, [x8, :lo12:.LCPI1_0] - ldr q5, [x1, #16] - adrp x8, .LCPI1_1 - and v0.8b, v1.8b, v0.8b - fmov d1, x4 - stp q5, q4, [x0, #16] - mov v1.d[1], v0.d[0] - str q1, [x0, #48] - ldp q6, q7, [x2] - uzp1 v3.4s, v6.4s, v7.4s - add v0.4s, v2.4s, v3.4s - uzp2 v2.4s, v6.4s, v7.4s - add v16.4s, v0.4s, v5.4s - ldr q0, [x8, :lo12:.LCPI1_1] + add x8, x2, #32 + ldr q1, [x1, #16] + and v0.8b, v0.8b, v2.8b + mov v7.d[1], v0.d[0] + stp q1, q4, [x0, #16] + add v20.4s, v6.4s, v1.4s + str q7, [x0, #48] + ldp q0, q19, [x2] + uzp1 v5.4s, v0.4s, v0.4s + dup v16.4s, v0.s[1] + ld2 { v2.4s, v3.4s }, [x8] adrp x8, .LCPI1_2 - eor v1.16b, v16.16b, v1.16b - add v7.4s, v16.4s, v2.4s - tbl v1.16b, { v1.16b }, v0.16b - add v4.4s, v1.4s, v4.4s - eor v5.16b, v4.16b, v5.16b - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b - add v6.4s, v7.4s, v5.4s - eor v7.16b, v1.16b, v6.16b + mov v16.s[1], v0.s[3] + zip1 v21.4s, v5.4s, v19.4s + dup v17.4s, v5.s[1] + ldr q0, [x9, :lo12:.LCPI1_1] + uzp2 v22.4s, v16.4s, v19.4s + uzp1 v21.4s, v21.4s, v19.4s + dup v6.4s, v3.s[3] + dup v18.4s, v2.s[3] + ext v23.16b, v6.16b, v6.16b, #4 + add v24.4s, v20.4s, v21.4s + mov v22.d[0], v16.d[0] + eor v7.16b, v24.16b, v7.16b + tbl v25.16b, { v7.16b }, v0.16b + ext v20.16b, v23.16b, v3.16b, #12 + add v23.4s, v24.4s, v22.4s + dup v26.4s, v21.s[3] + add v24.4s, v25.4s, v4.4s + mov v5.s[0], v19.s[0] + eor v7.16b, v24.16b, v1.16b ldr q1, [x8, :lo12:.LCPI1_2] - add x8, x2, #32 - tbl v7.16b, { v7.16b }, v1.16b - ld2 { v16.4s, v17.4s }, [x8] - add v4.4s, v4.4s, v7.4s - ext v7.16b, v7.16b, v7.16b, #8 - add v6.4s, v6.4s, 
v16.4s - eor v5.16b, v4.16b, v5.16b - ext v4.16b, v4.16b, v4.16b, #4 - ext v16.16b, v16.16b, v16.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ushr v18.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v18.16b - ext v18.16b, v17.16b, v17.16b, #12 - add v6.4s, v6.4s, v5.4s - mov v17.16b, v18.16b - eor v7.16b, v7.16b, v6.16b - add v6.4s, v6.4s, v18.4s - mov v17.s[1], v16.s[2] - tbl v7.16b, { v7.16b }, v0.16b - add v4.4s, v4.4s, v7.4s - eor v5.16b, v4.16b, v5.16b - ushr v19.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v19.16b - uzp1 v19.4s, v3.4s, v3.4s - add v6.4s, v6.4s, v5.4s - ext v19.16b, v19.16b, v3.16b, #8 - eor v7.16b, v7.16b, v6.16b - uzp2 v19.4s, v19.4s, v2.4s - tbl v7.16b, { v7.16b }, v1.16b - add v6.4s, v6.4s, v19.4s - add v4.4s, v4.4s, v7.4s - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #8 - eor v5.16b, v4.16b, v5.16b - ext v4.16b, v4.16b, v4.16b, #12 - ushr v20.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v20.16b - ext v20.16b, v3.16b, v3.16b, #12 - add v6.4s, v6.4s, v5.4s - ext v3.16b, v3.16b, v20.16b, #12 - eor v7.16b, v7.16b, v6.16b - rev64 v3.4s, v3.4s - tbl v7.16b, { v7.16b }, v0.16b - trn2 v3.4s, v3.4s, v17.4s - add v4.4s, v4.4s, v7.4s - add v6.4s, v6.4s, v3.4s - eor v5.16b, v4.16b, v5.16b - ushr v17.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v17.16b - zip1 v17.2d, v18.2d, v2.2d - zip2 v2.4s, v2.4s, v18.4s - add v6.4s, v6.4s, v5.4s - mov v17.s[3], v16.s[3] - zip1 v18.4s, v2.4s, v16.4s - zip1 v2.4s, v16.4s, v2.4s - eor v7.16b, v7.16b, v6.16b - ext v6.16b, v6.16b, v6.16b, #12 - ext v16.16b, v2.16b, v18.16b, #8 - tbl v7.16b, { v7.16b }, v1.16b - add v20.4s, v4.4s, v7.4s - ext v4.16b, v17.16b, v17.16b, #12 - ext v7.16b, v7.16b, v7.16b, #8 - eor v5.16b, v20.16b, v5.16b - uzp1 v4.4s, v17.4s, v4.4s - ushr v17.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v5.16b, v5.16b, v17.16b - ext v17.16b, v20.16b, v20.16b, #4 - add v6.4s, v6.4s, v5.4s - eor v7.16b, 
v7.16b, v6.16b - add v6.4s, v6.4s, v16.4s - tbl v7.16b, { v7.16b }, v0.16b - add v17.4s, v17.4s, v7.4s - eor v5.16b, v17.16b, v5.16b - ushr v2.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v2.16b, v5.16b, v2.16b - add v5.4s, v6.4s, v2.4s - ext v6.16b, v19.16b, v19.16b, #4 - eor v7.16b, v7.16b, v5.16b - uzp1 v18.4s, v6.4s, v6.4s - tbl v7.16b, { v7.16b }, v1.16b - ext v18.16b, v18.16b, v6.16b, #8 - add v17.4s, v17.4s, v7.4s - uzp2 v18.4s, v18.4s, v3.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v2.16b, v17.16b, v2.16b - add v5.4s, v5.4s, v18.4s - ext v17.16b, v17.16b, v17.16b, #12 - ushr v19.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v2.16b, v2.16b, v19.16b - ext v19.16b, v6.16b, v6.16b, #12 - add v5.4s, v5.4s, v2.4s - ext v6.16b, v6.16b, v19.16b, #12 - mov v19.16b, v16.16b - eor v7.16b, v7.16b, v5.16b - rev64 v6.4s, v6.4s - mov v19.s[1], v4.s[2] - tbl v7.16b, { v7.16b }, v0.16b - add v17.4s, v17.4s, v7.4s - eor v20.16b, v17.16b, v2.16b - trn2 v2.4s, v6.4s, v19.4s - ushr v6.4s, v20.4s, #12 - shl v19.4s, v20.4s, #20 - add v5.4s, v5.4s, v2.4s - orr v6.16b, v19.16b, v6.16b - add v19.4s, v5.4s, v6.4s - eor v5.16b, v7.16b, v19.16b - zip1 v7.2d, v16.2d, v3.2d - zip2 v3.4s, v3.4s, v16.4s - tbl v20.16b, { v5.16b }, v1.16b - mov v7.s[3], v4.s[3] - add v17.4s, v17.4s, v20.4s - ext v5.16b, v7.16b, v7.16b, #12 - eor v6.16b, v17.16b, v6.16b - uzp1 v5.4s, v7.4s, v5.4s - ext v7.16b, v19.16b, v19.16b, #12 - ext v17.16b, v17.16b, v17.16b, #4 - ushr v19.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v7.4s, v7.4s, v5.4s - orr v6.16b, v6.16b, v19.16b - ext v19.16b, v20.16b, v20.16b, #8 - add v7.4s, v7.4s, v6.4s - eor v19.16b, v19.16b, v7.16b + ushr v27.4s, v7.4s, #12 + shl v28.4s, v7.4s, #20 + uzp1 v7.4s, v26.4s, v21.4s + orr v21.16b, v28.16b, v27.16b + add v23.4s, v23.4s, v21.4s + ext v19.16b, v18.16b, v18.16b, #4 + dup v27.4s, v23.s[3] + eor v25.16b, v25.16b, v23.16b + tbl v25.16b, { v25.16b }, v1.16b + ext v27.16b, v27.16b, v27.16b, #4 + ext v19.16b, 
v19.16b, v2.16b, #12 + uzp2 v4.4s, v5.4s, v22.4s + add v24.4s, v24.4s, v25.4s + ext v28.16b, v25.16b, v25.16b, #8 + ext v23.16b, v27.16b, v23.16b, #12 + eor v21.16b, v24.16b, v21.16b + dup v25.4s, v25.s[2] + ushr v29.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v27.16b, v24.16b, v24.16b, #4 + dup v24.4s, v24.s[1] + mov v28.s[0], v25.s[0] + add v23.4s, v23.4s, v19.4s + orr v21.16b, v21.16b, v29.16b + add v23.4s, v23.4s, v21.4s + mov v27.s[0], v24.s[0] + eor v24.16b, v28.16b, v23.16b + zip2 v22.4s, v22.4s, v20.4s + tbl v24.16b, { v24.16b }, v0.16b + dup v26.4s, v19.s[2] + zip1 v16.2d, v20.2d, v16.2d + zip1 v18.4s, v18.4s, v22.4s + add v22.4s, v27.4s, v24.4s + mov v26.s[3], v20.s[3] + eor v21.16b, v22.16b, v21.16b + ushr v28.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + mov v4.d[0], v5.d[0] + add v20.4s, v23.4s, v20.4s + orr v21.16b, v21.16b, v28.16b + add v20.4s, v20.4s, v21.4s + mov v27.16b, v16.16b + ext v25.16b, v4.16b, v4.16b, #4 + mov v27.s[3], v19.s[3] + eor v19.16b, v24.16b, v20.16b + ext v24.16b, v20.16b, v20.16b, #4 + tbl v19.16b, { v19.16b }, v1.16b + dup v20.4s, v20.s[1] + mov v23.16b, v25.16b + mov v23.s[0], v17.s[0] + mov v24.s[0], v20.s[0] + add v20.4s, v22.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #8 + eor v21.16b, v20.16b, v21.16b + dup v19.4s, v19.s[2] + dup v28.4s, v20.s[3] + ushr v29.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v24.4s, v24.4s, v23.4s + mov v22.s[0], v19.s[0] + ext v28.16b, v28.16b, v28.16b, #4 + orr v21.16b, v21.16b, v29.16b + add v24.4s, v24.4s, v21.4s + rev64 v5.4s, v7.4s + eor v19.16b, v22.16b, v24.16b + ext v20.16b, v28.16b, v20.16b, #12 + tbl v22.16b, { v19.16b }, v0.16b + trn2 v19.4s, v5.4s, v26.4s + ext v28.16b, v18.16b, v2.16b, #4 + ext v5.16b, v27.16b, v27.16b, #12 + add v20.4s, v20.4s, v22.4s + add v24.4s, v24.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v29.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + ext v18.16b, v28.16b, v18.16b, #8 + dup v28.4s, v23.s[2] + orr v21.16b, v21.16b, v29.16b + add v24.4s, v24.4s, 
v21.4s + mov v28.s[1], v25.s[1] + dup v25.4s, v24.s[3] + eor v22.16b, v22.16b, v24.16b + tbl v22.16b, { v22.16b }, v1.16b + ext v25.16b, v25.16b, v25.16b, #4 + uzp1 v27.4s, v27.4s, v5.4s + uzp2 v5.4s, v28.4s, v26.4s + add v20.4s, v20.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #8 + ext v24.16b, v25.16b, v24.16b, #12 + eor v21.16b, v20.16b, v21.16b + dup v22.4s, v22.s[2] + ushr v29.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v25.16b, v20.16b, v20.16b, #4 + dup v20.4s, v20.s[1] + mov v26.s[0], v22.s[0] + add v22.4s, v24.4s, v27.4s + orr v21.16b, v21.16b, v29.16b + add v24.4s, v22.4s, v21.4s + mov v25.s[0], v20.s[0] + eor v20.16b, v26.16b, v24.16b + tbl v26.16b, { v20.16b }, v0.16b + mov v5.d[0], v28.d[0] + mov v4.s[2], v17.s[0] + rev64 v28.4s, v18.4s + add v17.4s, v25.4s, v26.4s + ext v22.16b, v5.16b, v5.16b, #4 + eor v20.16b, v17.16b, v21.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + dup v18.4s, v23.s[1] + add v24.4s, v24.4s, v28.4s + dup v23.4s, v27.s[2] + orr v21.16b, v20.16b, v21.16b + add v24.4s, v24.4s, v21.4s + mov v20.16b, v22.16b + eor v25.16b, v26.16b, v24.16b + ext v26.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v1.16b + dup v24.4s, v24.s[1] + mov v20.s[0], v18.s[0] + mov v26.s[0], v24.s[0] + add v17.4s, v17.4s, v25.4s + ext v24.16b, v25.16b, v25.16b, #8 + eor v21.16b, v17.16b, v21.16b + dup v25.4s, v25.s[2] + dup v29.4s, v17.s[3] + ushr v30.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v26.4s, v26.4s, v20.4s + mov v24.s[0], v25.s[0] + ext v29.16b, v29.16b, v29.16b, #4 + orr v21.16b, v21.16b, v30.16b + add v25.4s, v26.4s, v21.4s + mov v23.s[3], v28.s[3] + eor v24.16b, v24.16b, v25.16b + ext v26.16b, v29.16b, v17.16b, #12 + zip1 v17.2d, v28.2d, v19.2d + rev64 v19.4s, v4.4s + tbl v24.16b, { v24.16b }, v0.16b + zip2 v28.4s, v7.4s, v28.4s + mov v29.16b, v17.16b + trn2 v19.4s, v19.4s, v23.4s + add v26.4s, v26.4s, v24.4s + mov v29.s[3], v27.s[3] + eor v21.16b, v26.16b, v21.16b + ushr v27.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + dup 
v7.4s, v16.s[2] + add v25.4s, v25.4s, v19.4s + orr v27.16b, v21.16b, v27.16b + add v25.4s, v25.4s, v27.4s + ext v21.16b, v29.16b, v29.16b, #12 + dup v30.4s, v25.s[3] + eor v24.16b, v24.16b, v25.16b + tbl v24.16b, { v24.16b }, v1.16b + ext v30.16b, v30.16b, v30.16b, #4 + uzp1 v21.4s, v29.4s, v21.4s + zip1 v6.4s, v6.4s, v28.4s + add v26.4s, v26.4s, v24.4s + ext v29.16b, v24.16b, v24.16b, #8 + ext v25.16b, v30.16b, v25.16b, #12 + eor v27.16b, v26.16b, v27.16b + dup v24.4s, v24.s[2] + ushr v30.4s, v27.4s, #7 + shl v27.4s, v27.4s, #25 + mov v29.s[0], v24.s[0] + dup v16.4s, v26.s[1] + ext v24.16b, v26.16b, v26.16b, #4 + add v25.4s, v25.4s, v21.4s + orr v27.16b, v27.16b, v30.16b + ext v26.16b, v6.16b, v7.16b, #4 + add v25.4s, v25.4s, v27.4s + mov v24.s[0], v16.s[0] + eor v16.16b, v29.16b, v25.16b + dup v28.4s, v20.s[2] + tbl v16.16b, { v16.16b }, v0.16b + ext v6.16b, v26.16b, v6.16b, #8 + mov v28.s[1], v22.s[1] + mov v5.s[2], v18.s[0] + add v24.4s, v24.4s, v16.4s + rev64 v29.4s, v6.4s + uzp2 v6.4s, v28.4s, v23.4s + eor v26.16b, v24.16b, v27.16b + ushr v27.4s, v26.4s, #12 + shl v26.4s, v26.4s, #20 + dup v18.4s, v21.s[2] + add v25.4s, v25.4s, v29.4s + orr v22.16b, v26.16b, v27.16b + mov v6.d[0], v28.d[0] + add v23.4s, v25.4s, v22.4s + mov v18.s[3], v29.s[3] + ext v26.16b, v23.16b, v23.16b, #4 + eor v16.16b, v16.16b, v23.16b + ext v27.16b, v6.16b, v6.16b, #4 + tbl v25.16b, { v16.16b }, v1.16b + dup v16.4s, v23.s[1] + zip1 v19.2d, v29.2d, v19.2d + mov v26.s[0], v16.s[0] + dup v16.4s, v20.s[1] + mov v23.16b, v27.16b + add v24.4s, v24.4s, v25.4s + ext v20.16b, v25.16b, v25.16b, #8 + mov v23.s[0], v16.s[0] + eor v22.16b, v24.16b, v22.16b + dup v25.4s, v25.s[2] + ushr v28.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + dup v30.4s, v24.s[3] + mov v20.s[0], v25.s[0] + add v26.4s, v26.4s, v23.4s + ext v25.16b, v30.16b, v30.16b, #4 + orr v22.16b, v22.16b, v28.16b + add v26.4s, v26.4s, v22.4s + mov v6.s[2], v16.s[0] + eor v20.16b, v20.16b, v26.16b + ext v24.16b, v25.16b, v24.16b, #12 + tbl 
v25.16b, { v20.16b }, v0.16b + rev64 v20.4s, v5.4s + rev64 v16.4s, v6.4s + add v24.4s, v24.4s, v25.4s + trn2 v20.4s, v20.4s, v18.4s + eor v22.16b, v24.16b, v22.16b + ushr v28.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v26.4s, v26.4s, v20.4s + orr v22.16b, v22.16b, v28.16b + add v26.4s, v26.4s, v22.4s + mov v28.16b, v19.16b + eor v25.16b, v25.16b, v26.16b + mov v28.s[3], v21.s[3] + tbl v25.16b, { v25.16b }, v1.16b + dup v30.4s, v26.s[3] + ext v21.16b, v30.16b, v30.16b, #4 + ext v30.16b, v28.16b, v28.16b, #12 + add v24.4s, v24.4s, v25.4s + ext v31.16b, v25.16b, v25.16b, #8 + eor v22.16b, v24.16b, v22.16b + ext v26.16b, v21.16b, v26.16b, #12 + uzp1 v21.4s, v28.4s, v30.4s + ushr v28.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + dup v25.4s, v25.s[2] + orr v22.16b, v22.16b, v28.16b + zip2 v28.4s, v4.4s, v29.4s + mov v31.s[0], v25.s[0] + ext v25.16b, v24.16b, v24.16b, #4 + add v26.4s, v26.4s, v21.4s + zip1 v2.4s, v2.4s, v28.4s + dup v4.4s, v17.s[2] + dup v17.4s, v24.s[1] + add v24.4s, v26.4s, v22.4s + dup v26.4s, v23.s[2] + mov v25.s[0], v17.s[0] + ext v17.16b, v2.16b, v4.16b, #4 + eor v3.16b, v31.16b, v24.16b + tbl v3.16b, { v3.16b }, v0.16b + mov v26.s[1], v27.s[1] + ext v2.16b, v17.16b, v2.16b, #8 + add v25.4s, v25.4s, v3.4s + rev64 v28.4s, v2.4s + eor v17.16b, v25.16b, v22.16b + uzp2 v2.4s, v26.4s, v18.4s + ushr v22.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v22.16b + add v22.4s, v24.4s, v28.4s + mov v2.d[0], v26.d[0] + add v18.4s, v22.4s, v17.4s + eor v3.16b, v3.16b, v18.16b + ext v26.16b, v18.16b, v18.16b, #4 + tbl v3.16b, { v3.16b }, v1.16b + ext v22.16b, v2.16b, v2.16b, #4 + dup v18.4s, v18.s[1] + mov v26.s[0], v18.s[0] + dup v18.4s, v23.s[1] + add v25.4s, v25.4s, v3.4s + mov v24.16b, v22.16b + ext v23.16b, v3.16b, v3.16b, #8 + mov v24.s[0], v18.s[0] + dup v3.4s, v3.s[2] + dup v29.4s, v25.s[3] + eor v17.16b, v25.16b, v17.16b + ushr v27.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + mov v23.s[0], v3.s[0] + ext v3.16b, v29.16b, v29.16b, #4 + add 
v26.4s, v26.4s, v24.4s + orr v17.16b, v17.16b, v27.16b + add v26.4s, v26.4s, v17.4s + ext v3.16b, v3.16b, v25.16b, #12 + dup v25.4s, v21.s[2] + eor v23.16b, v23.16b, v26.16b + tbl v23.16b, { v23.16b }, v0.16b + mov v25.s[3], v28.s[3] + mov v2.s[2], v18.s[0] + add v3.4s, v3.4s, v23.4s + trn2 v16.4s, v16.4s, v25.4s + eor v17.16b, v3.16b, v17.16b + ushr v27.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v26.4s, v26.4s, v16.4s + orr v27.16b, v17.16b, v27.16b + add v26.4s, v26.4s, v27.4s + eor v17.16b, v23.16b, v26.16b + tbl v23.16b, { v17.16b }, v1.16b + zip1 v17.2d, v28.2d, v20.2d + add v20.4s, v3.4s, v23.4s + mov v3.16b, v17.16b + mov v3.s[3], v21.s[3] + eor v27.16b, v20.16b, v27.16b + dup v21.4s, v26.s[3] + ushr v29.4s, v27.4s, #7 + shl v27.4s, v27.4s, #25 + ext v21.16b, v21.16b, v21.16b, #4 + orr v27.16b, v27.16b, v29.16b + ext v29.16b, v3.16b, v3.16b, #12 + ext v30.16b, v23.16b, v23.16b, #8 + ext v26.16b, v21.16b, v26.16b, #12 + dup v23.4s, v23.s[2] + uzp1 v21.4s, v3.4s, v29.4s + zip2 v3.4s, v5.4s, v28.4s + mov v30.s[0], v23.s[0] + ext v5.16b, v20.16b, v20.16b, #4 + add v23.4s, v26.4s, v21.4s + zip1 v7.4s, v7.4s, v3.4s + dup v3.4s, v19.s[2] + dup v19.4s, v20.s[1] + add v20.4s, v23.4s, v27.4s + ext v23.16b, v7.16b, v3.16b, #4 + mov v5.s[0], v19.s[0] + eor v19.16b, v30.16b, v20.16b tbl v19.16b, { v19.16b }, v0.16b - add v16.4s, v17.4s, v19.4s - zip1 v17.4s, v3.4s, v4.4s - zip1 v3.4s, v4.4s, v3.4s - eor v4.16b, v16.16b, v6.16b - ext v17.16b, v3.16b, v17.16b, #8 - ushr v3.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v6.4s, v7.4s, v17.4s - orr v3.16b, v4.16b, v3.16b - add v4.4s, v6.4s, v3.4s - ext v6.16b, v18.16b, v18.16b, #4 - eor v7.16b, v19.16b, v4.16b - uzp1 v18.4s, v6.4s, v6.4s - tbl v7.16b, { v7.16b }, v1.16b - ext v18.16b, v18.16b, v6.16b, #8 - add v16.4s, v16.4s, v7.4s - uzp2 v18.4s, v18.4s, v2.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v3.16b, v16.16b, v3.16b - add v4.4s, v4.4s, v18.4s - ext v16.16b, v16.16b, v16.16b, #12 - ushr v19.4s, v3.4s, #7 - shl v3.4s, 
v3.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v3.16b, v3.16b, v19.16b - ext v19.16b, v6.16b, v6.16b, #12 - add v4.4s, v4.4s, v3.4s - ext v6.16b, v6.16b, v19.16b, #12 - mov v19.16b, v17.16b - eor v7.16b, v7.16b, v4.16b - rev64 v6.4s, v6.4s - mov v19.s[1], v5.s[2] - tbl v7.16b, { v7.16b }, v0.16b - add v16.4s, v16.4s, v7.4s - eor v20.16b, v16.16b, v3.16b - trn2 v3.4s, v6.4s, v19.4s - ushr v6.4s, v20.4s, #12 - shl v19.4s, v20.4s, #20 - add v4.4s, v4.4s, v3.4s - orr v6.16b, v19.16b, v6.16b - zip1 v19.2d, v17.2d, v2.2d - zip2 v2.4s, v2.4s, v17.4s - add v4.4s, v4.4s, v6.4s - mov v19.s[3], v5.s[3] - zip1 v17.4s, v2.4s, v5.4s - zip1 v2.4s, v5.4s, v2.4s - eor v7.16b, v7.16b, v4.16b - ext v20.16b, v19.16b, v19.16b, #12 - ext v4.16b, v4.16b, v4.16b, #12 - ext v2.16b, v2.16b, v17.16b, #8 - tbl v7.16b, { v7.16b }, v1.16b - add v16.4s, v16.4s, v7.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v21.16b, v16.16b, v6.16b - uzp1 v6.4s, v19.4s, v20.4s - ext v16.16b, v16.16b, v16.16b, #4 - ushr v19.4s, v21.4s, #7 - shl v20.4s, v21.4s, #25 - add v4.4s, v4.4s, v6.4s - orr v19.16b, v20.16b, v19.16b - add v4.4s, v4.4s, v19.4s - eor v7.16b, v7.16b, v4.16b - add v4.4s, v4.4s, v2.4s - tbl v7.16b, { v7.16b }, v0.16b - add v16.4s, v16.4s, v7.4s - eor v5.16b, v16.16b, v19.16b - ushr v17.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v17.16b - ext v17.16b, v18.16b, v18.16b, #4 - add v4.4s, v4.4s, v5.4s - uzp1 v18.4s, v17.4s, v17.4s - eor v7.16b, v7.16b, v4.16b - ext v18.16b, v18.16b, v17.16b, #8 - tbl v7.16b, { v7.16b }, v1.16b - uzp2 v18.4s, v18.4s, v3.4s - add v16.4s, v16.4s, v7.4s - add v4.4s, v4.4s, v18.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v5.16b, v16.16b, v5.16b - ext v4.16b, v4.16b, v4.16b, #4 - ext v16.16b, v16.16b, v16.16b, #12 - ushr v19.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v19.16b - add v19.4s, v4.4s, v5.4s - eor v4.16b, v7.16b, v19.16b - ext v7.16b, v17.16b, v17.16b, #12 - tbl v20.16b, { v4.16b }, v0.16b - ext v4.16b, v17.16b, v7.16b, #12 - mov 
v7.16b, v2.16b - add v16.4s, v16.4s, v20.4s + ext v7.16b, v23.16b, v7.16b, #8 + add v26.4s, v5.4s, v19.4s + dup v5.4s, v24.s[2] + rev64 v23.4s, v7.4s + eor v7.16b, v26.16b, v27.16b + ushr v24.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + add v20.4s, v20.4s, v23.4s + mov v27.16b, v5.16b + orr v7.16b, v7.16b, v24.16b + mov v27.s[1], v22.s[1] + add v20.4s, v20.4s, v7.4s + zip1 v16.2d, v23.2d, v16.2d + eor v19.16b, v19.16b, v20.16b + tbl v22.16b, { v19.16b }, v1.16b + uzp2 v19.4s, v27.4s, v25.4s + ext v25.16b, v20.16b, v20.16b, #4 + zip2 v6.4s, v6.4s, v23.4s + add v24.4s, v26.4s, v22.4s + mov v19.d[0], v27.d[0] + dup v26.4s, v20.s[1] + dup v28.4s, v24.s[3] + mov v25.s[0], v26.s[0] + ext v20.16b, v19.16b, v19.16b, #4 + dup v19.4s, v19.s[1] + ext v26.16b, v22.16b, v22.16b, #8 + dup v22.4s, v22.s[2] + eor v7.16b, v24.16b, v7.16b + mov v20.s[0], v19.s[0] + ushr v27.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + mov v26.s[0], v22.s[0] + ext v22.16b, v28.16b, v28.16b, #4 + add v25.4s, v25.4s, v20.4s + orr v27.16b, v7.16b, v27.16b + add v25.4s, v25.4s, v27.4s + ext v7.16b, v22.16b, v24.16b, #12 + dup v24.4s, v21.s[2] + eor v22.16b, v26.16b, v25.16b + tbl v18.16b, { v22.16b }, v0.16b + mov v24.s[3], v23.s[3] + rev64 v22.4s, v2.4s + zip1 v4.4s, v4.4s, v6.4s + add v26.4s, v7.4s, v18.4s + dup v6.4s, v17.s[2] + trn2 v7.4s, v22.4s, v24.4s + eor v22.16b, v26.16b, v27.16b + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v25.4s, v25.4s, v7.4s + orr v22.16b, v22.16b, v27.16b + add v25.4s, v25.4s, v22.4s + mov v27.16b, v16.16b + eor v18.16b, v18.16b, v25.16b + tbl v18.16b, { v18.16b }, v1.16b + mov v27.s[3], v21.s[3] + dup v29.4s, v25.s[3] + ext v6.16b, v4.16b, v6.16b, #4 + add v26.4s, v26.4s, v18.4s + ext v21.16b, v29.16b, v29.16b, #4 + eor v22.16b, v26.16b, v22.16b + ushr v28.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + dup v29.4s, v18.s[2] + ext v21.16b, v21.16b, v25.16b, #12 + dup v17.4s, v26.s[1] + orr v22.16b, v22.16b, v28.16b + ext v28.16b, v27.16b, v27.16b, #12 + ext v25.16b, 
v18.16b, v18.16b, #8 + ext v23.16b, v26.16b, v26.16b, #4 + ext v4.16b, v6.16b, v4.16b, #8 + uzp1 v18.4s, v27.4s, v28.4s + mov v25.s[0], v29.s[0] + mov v23.s[0], v17.s[0] rev64 v4.4s, v4.4s - mov v7.s[1], v6.s[2] - eor v5.16b, v16.16b, v5.16b - trn2 v4.4s, v4.4s, v7.4s - ushr v7.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v19.4s, v4.4s - zip1 v19.2d, v2.2d, v3.2d - zip2 v2.4s, v3.4s, v2.4s - orr v5.16b, v5.16b, v7.16b - mov v19.s[3], v6.s[3] - add v7.4s, v17.4s, v5.4s - eor v17.16b, v20.16b, v7.16b - ext v20.16b, v19.16b, v19.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - tbl v17.16b, { v17.16b }, v1.16b - add v16.4s, v16.4s, v17.4s - ext v17.16b, v17.16b, v17.16b, #8 - eor v21.16b, v16.16b, v5.16b - uzp1 v5.4s, v19.4s, v20.4s - ext v16.16b, v16.16b, v16.16b, #4 - ushr v19.4s, v21.4s, #7 - shl v20.4s, v21.4s, #25 - add v7.4s, v7.4s, v5.4s - orr v19.16b, v20.16b, v19.16b - add v7.4s, v7.4s, v19.4s - eor v17.16b, v17.16b, v7.16b + add v21.4s, v21.4s, v18.4s + mov v5.s[2], v19.s[0] + add v21.4s, v21.4s, v22.4s + dup v19.4s, v18.s[2] + eor v17.16b, v25.16b, v21.16b tbl v17.16b, { v17.16b }, v0.16b - add v3.4s, v16.4s, v17.4s - zip1 v16.4s, v2.4s, v6.4s - zip1 v2.4s, v6.4s, v2.4s - eor v6.16b, v3.16b, v19.16b - ext v16.16b, v2.16b, v16.16b, #8 - ushr v2.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v7.4s, v7.4s, v16.4s - orr v2.16b, v6.16b, v2.16b - add v6.4s, v7.4s, v2.4s - ext v7.16b, v18.16b, v18.16b, #4 - eor v17.16b, v17.16b, v6.16b - uzp1 v18.4s, v7.4s, v7.4s + add v21.4s, v21.4s, v4.4s + mov v19.s[3], v4.s[3] + rev64 v5.4s, v5.4s + add v6.4s, v23.4s, v17.4s + zip1 v7.2d, v4.2d, v7.2d + eor v22.16b, v6.16b, v22.16b + ushr v23.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + trn2 v5.4s, v5.4s, v19.4s + orr v22.16b, v22.16b, v23.16b + dup v23.4s, v20.s[2] + add v21.4s, v21.4s, v22.4s + dup v20.4s, v20.s[1] + zip1 v23.4s, v24.4s, v23.4s + eor v17.16b, v17.16b, v21.16b tbl v17.16b, { v17.16b }, v1.16b - ext v18.16b, v18.16b, v7.16b, #8 - add v3.4s, v3.4s, v17.4s - 
uzp2 v18.4s, v18.4s, v4.4s - eor v2.16b, v3.16b, v2.16b - add v6.4s, v6.4s, v18.4s - ext v3.16b, v3.16b, v3.16b, #12 - ext v18.16b, v18.16b, v18.16b, #4 - ushr v19.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v19.16b, v2.16b, v19.16b - ext v2.16b, v17.16b, v17.16b, #8 - ext v17.16b, v7.16b, v7.16b, #12 - add v6.4s, v6.4s, v19.4s - eor v2.16b, v2.16b, v6.16b - tbl v20.16b, { v2.16b }, v0.16b - ext v2.16b, v7.16b, v17.16b, #12 - mov v7.16b, v16.16b - add v17.4s, v3.4s, v20.4s - rev64 v3.4s, v2.4s - mov v7.s[1], v5.s[2] - eor v19.16b, v17.16b, v19.16b - trn2 v3.4s, v3.4s, v7.4s - ushr v21.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - add v6.4s, v6.4s, v3.4s - orr v19.16b, v19.16b, v21.16b - add v21.4s, v6.4s, v19.4s - eor v6.16b, v20.16b, v21.16b - zip1 v20.2d, v16.2d, v4.2d - zip2 v4.4s, v4.4s, v16.4s - tbl v22.16b, { v6.16b }, v1.16b - mov v20.s[3], v5.s[3] - add v17.4s, v17.4s, v22.4s - ext v6.16b, v20.16b, v20.16b, #12 - eor v19.16b, v17.16b, v19.16b - uzp1 v6.4s, v20.4s, v6.4s - ext v20.16b, v21.16b, v21.16b, #12 - ext v17.16b, v17.16b, v17.16b, #4 - ushr v21.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - add v20.4s, v20.4s, v6.4s - orr v19.16b, v19.16b, v21.16b - ext v21.16b, v22.16b, v22.16b, #8 - add v20.4s, v20.4s, v19.4s - eor v21.16b, v21.16b, v20.16b - tbl v21.16b, { v21.16b }, v0.16b - add v16.4s, v17.4s, v21.4s - zip1 v17.4s, v4.4s, v5.4s - zip1 v4.4s, v5.4s, v4.4s - eor v5.16b, v16.16b, v19.16b - ext v4.16b, v4.16b, v17.16b, #8 - ushr v17.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v19.4s, v20.4s, v4.4s - ext v20.16b, v18.16b, v18.16b, #8 - zip1 v3.2d, v4.2d, v3.2d - orr v5.16b, v5.16b, v17.16b - zip2 v2.4s, v2.4s, v4.4s - uzp2 v7.4s, v20.4s, v7.4s - mov v3.s[3], v6.s[3] - add v17.4s, v19.4s, v5.4s - ext v7.16b, v7.16b, v20.16b, #4 - eor v19.16b, v21.16b, v17.16b - ext v17.16b, v17.16b, v17.16b, #4 - tbl v19.16b, { v19.16b }, v1.16b - add v7.4s, v17.4s, v7.4s - add v16.4s, v16.4s, v19.4s - ext v17.16b, v19.16b, v19.16b, #8 - 
ext v19.16b, v18.16b, v18.16b, #12 - eor v5.16b, v16.16b, v5.16b - ext v16.16b, v16.16b, v16.16b, #12 - ext v18.16b, v18.16b, v19.16b, #12 - mov v19.16b, v4.16b - ushr v20.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - rev64 v18.4s, v18.4s - mov v19.s[1], v6.s[2] - orr v5.16b, v5.16b, v20.16b - trn2 v18.4s, v18.4s, v19.4s - add v7.4s, v5.4s, v7.4s - eor v17.16b, v17.16b, v7.16b - add v7.4s, v7.4s, v18.4s - ext v18.16b, v3.16b, v3.16b, #12 + mov v7.s[3], v18.s[3] + uzp2 v23.4s, v23.4s, v24.4s + ext v24.16b, v21.16b, v21.16b, #4 + dup v21.4s, v21.s[1] + add v6.4s, v6.4s, v17.4s + ext v23.16b, v23.16b, v23.16b, #4 + mov v24.s[0], v21.s[0] + ext v21.16b, v17.16b, v17.16b, #8 + eor v22.16b, v6.16b, v22.16b + mov v23.s[0], v20.s[0] + dup v17.4s, v17.s[2] + dup v25.4s, v6.s[3] + ushr v20.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + mov v21.s[0], v17.s[0] + ext v17.16b, v25.16b, v25.16b, #4 + add v23.4s, v24.4s, v23.4s + orr v20.16b, v22.16b, v20.16b + add v22.4s, v23.4s, v20.4s + ext v6.16b, v17.16b, v6.16b, #12 + eor v17.16b, v21.16b, v22.16b tbl v17.16b, { v17.16b }, v0.16b - uzp1 v3.4s, v3.4s, v18.4s - add v16.4s, v16.4s, v17.4s - eor v5.16b, v16.16b, v5.16b - ushr v19.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v19.16b - add v7.4s, v7.4s, v5.4s - eor v17.16b, v17.16b, v7.16b - ext v7.16b, v7.16b, v7.16b, #12 + add v5.4s, v22.4s, v5.4s + zip2 v2.4s, v2.4s, v4.4s + add v6.4s, v6.4s, v17.4s + eor v19.16b, v6.16b, v20.16b + ushr v20.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + zip1 v2.4s, v3.4s, v2.4s + dup v3.4s, v16.s[2] + orr v19.16b, v19.16b, v20.16b + add v5.4s, v5.4s, v19.4s + eor v17.16b, v17.16b, v5.16b + dup v21.4s, v5.s[3] tbl v17.16b, { v17.16b }, v1.16b - add v3.4s, v7.4s, v3.4s - add v16.4s, v16.4s, v17.4s - ext v7.16b, v17.16b, v17.16b, #8 - eor v5.16b, v16.16b, v5.16b - ext v16.16b, v16.16b, v16.16b, #4 - ushr v18.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v18.16b - add v3.4s, v3.4s, v5.4s - eor v7.16b, v7.16b, v3.16b - tbl v0.16b, { 
v7.16b }, v0.16b - zip1 v7.4s, v2.4s, v6.4s - zip1 v2.4s, v6.4s, v2.4s - add v4.4s, v16.4s, v0.4s - ext v2.16b, v2.16b, v7.16b, #8 - eor v5.16b, v4.16b, v5.16b - add v2.4s, v3.4s, v2.4s - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v3.16b, v5.16b, v6.16b - add v2.4s, v2.4s, v3.4s + ext v18.16b, v21.16b, v21.16b, #4 + ext v3.16b, v2.16b, v3.16b, #4 + add v6.4s, v6.4s, v17.4s + ext v5.16b, v18.16b, v5.16b, #12 + eor v19.16b, v6.16b, v19.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v18.16b, v17.16b, v17.16b, #8 + dup v17.4s, v17.s[2] + orr v19.16b, v19.16b, v20.16b + ext v20.16b, v7.16b, v7.16b, #12 + ext v4.16b, v6.16b, v6.16b, #4 + mov v18.s[0], v17.s[0] + dup v6.4s, v6.s[1] + uzp1 v7.4s, v7.4s, v20.4s + mov v4.s[0], v6.s[0] + ext v2.16b, v3.16b, v2.16b, #8 + add v5.4s, v5.4s, v7.4s + add v5.4s, v5.4s, v19.4s + rev64 v2.4s, v2.4s + eor v6.16b, v18.16b, v5.16b + tbl v0.16b, { v6.16b }, v0.16b + add v2.4s, v5.4s, v2.4s + add v3.4s, v4.4s, v0.4s + eor v4.16b, v3.16b, v19.16b + ushr v6.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v6.16b + add v2.4s, v2.4s, v4.4s eor v0.16b, v0.16b, v2.16b - ext v2.16b, v2.16b, v2.16b, #4 tbl v0.16b, { v0.16b }, v1.16b - add v1.4s, v4.4s, v0.4s + add v1.4s, v3.4s, v0.4s + dup v6.4s, v0.s[2] ext v0.16b, v0.16b, v0.16b, #8 - eor v3.16b, v1.16b, v3.16b - ext v1.16b, v1.16b, v1.16b, #12 - ushr v4.4s, v3.4s, #7 + dup v7.4s, v1.s[3] + eor v3.16b, v1.16b, v4.16b + ext v4.16b, v2.16b, v2.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + dup v2.4s, v2.s[1] + mov v0.s[0], v6.s[0] + ushr v5.4s, v3.4s, #7 + mov v4.s[0], v2.s[0] + ext v1.16b, v7.16b, v1.16b, #12 shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v5.16b + stp q4, q3, [x0] stp q1, q0, [x0, #32] - orr v3.16b, v3.16b, v4.16b - stp q2, q3, [x0] ret .Lfunc_end1: .size compress_pre, .Lfunc_end1-compress_pre @@ -564,12 +680,11 @@ compress_pre: .type zfs_blake3_compress_xof_sse41,@function zfs_blake3_compress_xof_sse41: .cfi_startproc - hint #25 - 
.cfi_negate_ra_state sub sp, sp, #96 + .cfi_def_cfa_offset 96 stp x29, x30, [sp, #64] - add x29, sp, #64 stp x20, x19, [sp, #80] + add x29, sp, #64 .cfi_def_cfa w29, 32 .cfi_offset w19, -8 .cfi_offset w20, -16 @@ -588,7 +703,6 @@ zfs_blake3_compress_xof_sse41: ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b eor v1.16b, v3.16b, v1.16b - ldp x29, x30, [sp, #64] stp q0, q1, [x19] ldr q0, [x20] eor v0.16b, v0.16b, v2.16b @@ -596,16 +710,22 @@ zfs_blake3_compress_xof_sse41: ldr q0, [x20, #16] eor v0.16b, v0.16b, v3.16b str q0, [x19, #48] + .cfi_def_cfa wsp, 96 ldp x20, x19, [sp, #80] + ldp x29, x30, [sp, #64] add sp, sp, #96 - hint #29 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w20 + .cfi_restore w30 + .cfi_restore w29 ret .Lfunc_end2: .size zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 + .p2align 4, 0x0 .LCPI3_0: .word 0 .word 1 @@ -645,29 +765,25 @@ zfs_blake3_compress_xof_sse41: .byte 14 .byte 15 .byte 12 -.LCPI3_3: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 .text .globl zfs_blake3_hash_many_sse41 .p2align 2 .type zfs_blake3_hash_many_sse41,@function zfs_blake3_hash_many_sse41: .cfi_startproc - hint #34 - stp d15, d14, [sp, #-144]! + stp d15, d14, [sp, #-160]! 
+ .cfi_def_cfa_offset 160 stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] - stp x29, x27, [sp, #64] - stp x26, x25, [sp, #80] - stp x24, x23, [sp, #96] - stp x22, x21, [sp, #112] - stp x20, x19, [sp, #128] - sub sp, sp, #368 - .cfi_def_cfa_offset 512 + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + add x29, sp, #64 + .cfi_def_cfa w29, 96 .cfi_offset w19, -8 .cfi_offset w20, -16 .cfi_offset w21, -24 @@ -677,1719 +793,1356 @@ zfs_blake3_hash_many_sse41: .cfi_offset w25, -56 .cfi_offset w26, -64 .cfi_offset w27, -72 - .cfi_offset w29, -80 - .cfi_offset b8, -88 - .cfi_offset b9, -96 - .cfi_offset b10, -104 - .cfi_offset b11, -112 - .cfi_offset b12, -120 - .cfi_offset b13, -128 - .cfi_offset b14, -136 - .cfi_offset b15, -144 - ldr x8, [sp, #520] - adrp x11, .LCPI3_1 - ldrb w9, [sp, #512] - adrp x10, .LCPI3_2 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + sub sp, sp, #432 + mov w19, w6 + mov x20, x4 + mov x21, x3 + mov x3, x2 + mov x23, x1 + mov x24, x0 + ldr x26, [x29, #104] cmp x1, #4 - b.lo .LBB3_6 - adrp x12, .LCPI3_0 - sbfx w13, w5, #0, #1 - mov w15, #58983 - mov w16, #44677 - movk w15, #27145, lsl #16 - movk w16, #47975, lsl #16 - ldr q0, [x12, :lo12:.LCPI3_0] - dup v1.4s, w13 - movi v13.4s, #64 + ldrb w8, [x29, #96] + str x2, [sp, #40] + b.lo .LBB3_7 + adrp x10, .LCPI3_0 + sbfx w11, w5, #0, #1 + mov w12, #44677 mov w13, #62322 mov w14, #62778 - orr w12, w7, w6 - and v0.16b, v1.16b, v0.16b - ldr q1, [x11, :lo12:.LCPI3_1] + orr w9, w7, w19 + ldr q0, [x10, :lo12:.LCPI3_0] + dup v1.4s, w11 + mov w11, #58983 + adrp x10, .LCPI3_1 + movk w11, #27145, lsl #16 + movk w12, #47975, lsl #16 movk w13, #15470, lsl #16 movk w14, #42319, 
lsl #16 - dup v14.4s, w15 - stp q0, q1, [sp, #16] + adrp x15, .LCPI3_2 + and v0.16b, v1.16b, v0.16b + str q0, [sp, #16] orr v0.4s, #128, lsl #24 str q0, [sp] - dup v0.4s, w16 - stp q0, q14, [sp, #48] b .LBB3_3 .LBB3_2: - zip1 v0.4s, v29.4s, v8.4s - add x15, x4, #4 - zip1 v1.4s, v30.4s, v31.4s + zip1 v0.4s, v27.4s, v29.4s + add x16, x20, #4 + zip1 v1.4s, v16.4s, v28.4s tst w5, #0x1 - zip1 v2.4s, v24.4s, v18.4s - csel x4, x15, x4, ne - zip1 v3.4s, v25.4s, v26.4s - add x0, x0, #32 - zip2 v6.4s, v29.4s, v8.4s - sub x1, x1, #4 + zip1 v2.4s, v22.4s, v7.4s + csel x20, x16, x20, ne + zip1 v3.4s, v11.4s, v13.4s + add x24, x24, #32 + zip2 v6.4s, v27.4s, v29.4s + sub x23, x23, #4 zip1 v4.2d, v0.2d, v1.2d - cmp x1, #3 - zip2 v7.4s, v30.4s, v31.4s + cmp x23, #3 + zip2 v16.4s, v16.4s, v28.4s zip1 v5.2d, v2.2d, v3.2d zip2 v0.2d, v0.2d, v1.2d zip2 v1.2d, v2.2d, v3.2d - zip2 v2.4s, v24.4s, v18.4s - zip2 v3.4s, v25.4s, v26.4s - stp q4, q5, [x8] - zip2 v4.2d, v6.2d, v7.2d - stp q0, q1, [x8, #32] - zip1 v0.2d, v6.2d, v7.2d + zip2 v2.4s, v22.4s, v7.4s + zip2 v3.4s, v11.4s, v13.4s + stp q4, q5, [x26] + zip2 v4.2d, v6.2d, v16.2d + stp q0, q1, [x26, #32] + zip1 v0.2d, v6.2d, v16.2d zip1 v1.2d, v2.2d, v3.2d zip2 v2.2d, v2.2d, v3.2d - stp q0, q1, [x8, #64] - stp q4, q2, [x8, #96] - add x8, x8, #128 - b.ls .LBB3_6 + stp q0, q1, [x26, #64] + stp q4, q2, [x26, #96] + add x26, x26, #128 + b.ls .LBB3_7 .LBB3_3: - mov x15, x3 - add x16, x3, #8 - add x17, x3, #12 - add x19, x3, #16 - add x20, x3, #20 - ld1r { v29.4s }, [x15], #4 - ld1r { v30.4s }, [x16] - add x16, x3, #24 - ld1r { v31.4s }, [x17] - add x17, x3, #28 - ld1r { v24.4s }, [x19] - ld1r { v18.4s }, [x20] - ld1r { v25.4s }, [x16] - ld1r { v8.4s }, [x15] - ld1r { v26.4s }, [x17] - cbz x2, .LBB3_2 + mov x16, x21 + add x17, x21, #8 + add x0, x21, #12 + add x1, x21, #16 + ld1r { v27.4s }, [x16], #4 + ld1r { v16.4s }, [x17] + add x17, x21, #20 + ld1r { v28.4s }, [x0] + add x0, x21, #24 + ld1r { v22.4s }, [x1] + ld1r { v7.4s }, [x17] + add x17, 
x21, #28 + ld1r { v11.4s }, [x0] + ld1r { v29.4s }, [x16] + ld1r { v13.4s }, [x17] + cbz x3, .LBB3_2 ldr q1, [sp, #16] - dup v0.4s, w4 - lsr x17, x4, #32 - mov x15, xzr - ldp x19, x20, [x0, #16] + dup v0.4s, w20 + lsr x2, x20, #32 + mov x16, xzr + ldp x17, x0, [x24] add v1.4s, v0.4s, v1.4s - mov x21, x2 + mov w22, w9 movi v0.4s, #128, lsl #24 - mov w26, w12 - str q1, [sp, #96] + str q1, [sp, #64] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w17 - ldp x16, x17, [x0] + dup v1.4s, w2 + ldp x1, x2, [x24, #16] sub v0.4s, v1.4s, v0.4s - str q0, [sp, #80] + str q0, [sp, #48] .LBB3_5: - add x23, x16, x15 - add x24, x17, x15 - add x22, x19, x15 - add x25, x20, x15 - subs x21, x21, #1 - add x15, x15, #64 - ldp q1, q2, [x23] - csel w27, w9, wzr, eq - orr w26, w27, w26 - and w26, w26, #0xff - ldp q4, q5, [x24] - dup v0.4s, w26 - mov w26, w6 - zip1 v22.4s, v1.4s, v4.4s - zip2 v20.4s, v1.4s, v4.4s - ldp q6, q7, [x22] - zip1 v17.4s, v2.4s, v5.4s - zip2 v23.4s, v2.4s, v5.4s - ldp q16, q21, [x25] - zip1 v19.4s, v6.4s, v16.4s - zip2 v1.4s, v6.4s, v16.4s - ldp q27, q28, [x23, #32] - zip1 v4.4s, v7.4s, v21.4s - zip2 v5.4s, v7.4s, v21.4s - zip2 v15.2d, v17.2d, v4.2d - ldp q9, q10, [x24, #32] - mov v17.d[1], v4.d[0] - add v4.4s, v30.4s, v25.4s - zip2 v11.2d, v23.2d, v5.2d - zip2 v3.4s, v27.4s, v9.4s - zip1 v7.4s, v27.4s, v9.4s - ldp q12, q6, [x22, #32] - mov v23.d[1], v5.d[0] - stp q11, q3, [sp, #256] - add v5.4s, v31.4s, v26.4s - add v4.4s, v4.4s, v17.4s - str q23, [sp, #352] - ldp q16, q2, [x25, #32] - add v5.4s, v5.4s, v23.4s - zip1 v3.4s, v12.4s, v16.4s - eor v0.16b, v5.16b, v0.16b - zip1 v9.4s, v6.4s, v2.4s - zip2 v2.4s, v6.4s, v2.4s - stp q7, q3, [sp, #208] - zip2 v3.4s, v12.4s, v16.4s - zip1 v12.4s, v28.4s, v10.4s - zip2 v10.4s, v28.4s, v10.4s - stp q17, q2, [sp, #160] - zip2 v28.2d, v22.2d, v19.2d - mov v22.d[1], v19.d[0] - str q3, [sp, #240] - add v2.4s, v8.4s, v18.4s - eor v16.16b, v4.16b, v13.16b - dup v17.4s, w13 - mov v3.16b, v22.16b - stp 
q22, q28, [sp, #320] - zip2 v22.2d, v20.2d, v1.2d - mov v20.d[1], v1.d[0] - add v1.4s, v29.4s, v24.4s - add v4.4s, v4.4s, v15.4s - add v5.4s, v5.4s, v11.4s - add v2.4s, v2.4s, v20.4s - stp q15, q20, [sp, #288] - add v1.4s, v1.4s, v3.4s - ldr q3, [sp, #96] - dup v20.4s, w14 - mov v23.16b, v22.16b - mov v15.16b, v10.16b - eor v6.16b, v1.16b, v3.16b - ldr q3, [sp, #80] - add v1.4s, v1.4s, v28.4s - ldr q28, [sp, #272] - str q23, [sp, #128] - eor v7.16b, v2.16b, v3.16b - ldp q27, q3, [sp, #32] - add v2.4s, v2.4s, v22.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl v7.16b, { v7.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - tbl v0.16b, { v0.16b }, v27.16b - add v19.4s, v6.4s, v14.4s - add v21.4s, v7.4s, v3.4s - add v30.4s, v16.4s, v17.4s - add v31.4s, v0.4s, v20.4s - eor v24.16b, v19.16b, v24.16b - eor v17.16b, v21.16b, v18.16b - ushr v18.4s, v24.4s, #12 - shl v20.4s, v24.4s, #20 - eor v24.16b, v30.16b, v25.16b - eor v25.16b, v31.16b, v26.16b - ushr v26.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - ushr v29.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - ushr v8.4s, v25.4s, #12 - shl v25.4s, v25.4s, #20 - orr v3.16b, v20.16b, v18.16b - ldr q18, [x10, :lo12:.LCPI3_2] - orr v13.16b, v17.16b, v26.16b - orr v24.16b, v24.16b, v29.16b - orr v14.16b, v25.16b, v8.16b - add v8.4s, v1.4s, v3.4s - add v29.4s, v2.4s, v13.4s - add v17.4s, v4.4s, v24.4s - add v20.4s, v5.4s, v14.4s - eor v1.16b, v6.16b, v8.16b - eor v2.16b, v7.16b, v29.16b - eor v4.16b, v16.16b, v17.16b - eor v0.16b, v0.16b, v20.16b - tbl v25.16b, { v1.16b }, v18.16b - tbl v16.16b, { v2.16b }, v18.16b - tbl v6.16b, { v4.16b }, v18.16b - tbl v4.16b, { v0.16b }, v18.16b - add v19.4s, v19.4s, v25.4s - add v21.4s, v21.4s, v16.4s - add v26.4s, v30.4s, v6.4s - add v7.4s, v31.4s, v4.4s - eor v0.16b, v19.16b, v3.16b - eor v1.16b, v21.16b, v13.16b - eor v2.16b, v26.16b, v24.16b - eor v3.16b, v7.16b, v14.16b + add x25, x17, x16 + add x27, x0, x16 + add x4, x1, x16 + add x6, x2, x16 + subs x3, x3, #1 + add x16, x16, #64 + ldp q0, q1, 
[x25] + csel w28, w8, wzr, eq + orr w22, w28, w22 + and w22, w22, #0xff + ldp q2, q3, [x27] + dup v14.4s, w22 + mov w22, w19 + zip1 v20.4s, v0.4s, v2.4s + zip2 v18.4s, v0.4s, v2.4s + ldp q4, q5, [x4] + zip2 v21.4s, v1.4s, v3.4s + ldp q6, q25, [x6] + zip1 v17.4s, v4.4s, v6.4s + zip2 v15.4s, v4.4s, v6.4s + ldp q26, q8, [x25, #32] + zip1 v6.4s, v1.4s, v3.4s + zip1 v0.4s, v5.4s, v25.4s + zip2 v2.4s, v5.4s, v25.4s + zip2 v23.2d, v18.2d, v15.2d + mov v18.d[1], v15.d[0] + ldp q30, q12, [x27, #32] + zip2 v24.2d, v6.2d, v0.2d + mov v6.d[1], v0.d[0] + add v0.4s, v27.4s, v22.4s + zip1 v5.4s, v26.4s, v30.4s + zip2 v3.4s, v26.4s, v30.4s + stp q18, q24, [sp, #176] + ldp q9, q19, [x4, #32] + zip2 v30.2d, v20.2d, v17.2d + stur q6, [x29, #-224] + mov v20.d[1], v17.d[0] + stp q5, q3, [sp, #144] + zip1 v26.4s, v8.4s, v12.4s + zip2 v8.4s, v8.4s, v12.4s + stur q30, [x29, #-240] + ldp q4, q1, [x6, #32] + add v25.4s, v0.4s, v20.4s + stur q20, [x29, #-208] + zip2 v10.2d, v21.2d, v2.2d + mov v21.d[1], v2.d[0] + zip1 v5.4s, v9.4s, v4.4s + zip2 v3.4s, v9.4s, v4.4s + ldr q0, [sp, #64] + zip1 v31.4s, v19.4s, v1.4s + stur q21, [x29, #-192] + zip2 v12.4s, v19.4s, v1.4s + ldr q9, [x15, :lo12:.LCPI3_2] + add v1.4s, v29.4s, v7.4s + stp q5, q3, [sp, #224] + add v2.4s, v16.4s, v11.4s + ldr q16, [x10, :lo12:.LCPI3_1] + eor v4.16b, v25.16b, v0.16b + ldr q0, [sp, #48] + add v15.4s, v1.4s, v18.4s + add v3.4s, v28.4s, v13.4s + tbl v4.16b, { v4.16b }, v16.16b + eor v5.16b, v15.16b, v0.16b + add v3.4s, v3.4s, v21.4s + tbl v17.16b, { v5.16b }, v16.16b + dup v5.4s, w11 + add v2.4s, v2.4s, v6.4s + dup v19.4s, w12 + eor v6.16b, v3.16b, v14.16b + add v14.4s, v4.4s, v5.4s + movi v0.4s, #64 + eor v5.16b, v14.16b, v22.16b + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v17.4s, v19.4s + eor v18.16b, v2.16b, v0.16b + orr v29.16b, v5.16b, v20.16b + eor v5.16b, v22.16b, v7.16b + tbl v18.16b, { v18.16b }, v16.16b + ushr v7.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + tbl v27.16b, { v6.16b }, v16.16b + dup 
v6.4s, w14 + orr v28.16b, v5.16b, v7.16b + dup v5.4s, w13 + add v2.4s, v2.4s, v24.4s + add v5.4s, v18.4s, v5.4s + add v6.4s, v27.4s, v6.4s + eor v7.16b, v5.16b, v11.16b + ushr v20.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + eor v19.16b, v6.16b, v13.16b + ushr v21.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v0.16b, v7.16b, v20.16b + add v7.4s, v25.4s, v30.4s + add v20.4s, v15.4s, v23.4s + orr v1.16b, v19.16b, v21.16b + add v19.4s, v7.4s, v29.4s + ldr q7, [sp, #144] + add v20.4s, v20.4s, v28.4s + mov v11.16b, v23.16b + add v23.4s, v2.4s, v0.4s + str q11, [sp, #208] + eor v2.16b, v4.16b, v19.16b + tbl v21.16b, { v2.16b }, v9.16b + eor v2.16b, v17.16b, v20.16b + add v3.4s, v3.4s, v10.4s + tbl v25.16b, { v2.16b }, v9.16b + add v24.4s, v3.4s, v1.4s + eor v2.16b, v18.16b, v23.16b + add v14.4s, v14.4s, v21.4s + tbl v18.16b, { v2.16b }, v9.16b + eor v2.16b, v27.16b, v24.16b + add v15.4s, v22.4s, v25.4s + tbl v27.16b, { v2.16b }, v9.16b + eor v2.16b, v14.16b, v29.16b + eor v3.16b, v15.16b, v28.16b + ushr v4.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v22.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + orr v17.16b, v2.16b, v4.16b + add v2.4s, v5.4s, v18.4s + orr v4.16b, v3.16b, v22.16b + ldr q22, [sp, #224] + add v3.4s, v6.4s, v27.4s + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b ushr v5.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v24.4s, v1.4s, #7 + ushr v6.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v30.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v5.16b, v0.16b, v5.16b - orr v0.16b, v1.16b, v24.16b - ushr v31.4s, v3.4s, #7 - orr v2.16b, v2.16b, v30.16b - ldp q24, q30, [sp, #208] - shl v3.4s, v3.4s, #25 - zip2 v14.2d, v12.2d, v9.2d - mov v22.16b, v24.16b - orr v1.16b, v3.16b, v31.16b - zip2 v3.2d, v24.2d, v30.2d - mov v24.16b, v28.16b - mov v22.d[1], v30.d[0] - ldr q30, [sp, #240] - mov v31.16b, v12.16b - stp q22, q14, [sp, #224] - mov v24.d[1], v30.d[0] - add v12.4s, v8.4s, v22.4s - mov v31.d[1], v9.d[0] - add v22.4s, v29.4s, v24.4s - ldr q29, [sp, #176] - zip2 v28.2d, 
v28.2d, v30.2d - mov v9.16b, v24.16b - mov v15.d[1], v29.d[0] - zip2 v8.2d, v10.2d, v29.2d - add v10.4s, v12.4s, v0.4s - add v22.4s, v22.4s, v2.4s - str q9, [sp, #144] - add v20.4s, v20.4s, v15.4s - add v17.4s, v17.4s, v31.4s - stp q3, q8, [sp, #192] - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #12 + orr v0.16b, v0.16b, v5.16b + mov v5.16b, v7.16b + zip2 v13.2d, v7.2d, v22.2d + ldr q7, [sp, #160] + orr v1.16b, v1.16b, v6.16b + mov v6.16b, v5.16b + ldr q5, [sp, #240] + mov v30.16b, v7.16b + mov v6.d[1], v22.d[0] + str q13, [sp, #80] + mov v30.d[1], v5.d[0] + zip2 v28.2d, v8.2d, v12.2d + zip2 v7.2d, v7.2d, v5.2d + mov v5.16b, v8.16b + mov v29.16b, v8.16b + stp q6, q28, [sp, #224] + add v5.4s, v19.4s, v6.4s + add v6.4s, v20.4s, v30.4s + mov v29.d[1], v12.d[0] + add v6.4s, v6.4s, v0.4s + mov v22.16b, v26.16b + eor v21.16b, v21.16b, v6.16b + add v20.4s, v24.4s, v29.4s + stp q29, q30, [sp, #144] + tbl v21.16b, { v21.16b }, v16.16b + add v20.4s, v20.4s, v17.4s + mov v22.d[1], v31.d[0] + eor v18.16b, v18.16b, v20.16b + add v3.4s, v3.4s, v21.4s + add v5.4s, v5.4s, v4.4s + tbl v18.16b, { v18.16b }, v16.16b + str q22, [sp, #96] + eor v0.16b, v3.16b, v0.16b + add v19.4s, v23.4s, v22.4s + eor v23.16b, v27.16b, v5.16b + ushr v27.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b + tbl v23.16b, { v23.16b }, v16.16b + orr v0.16b, v0.16b, v27.16b + add v27.4s, v15.4s, v18.4s + add v6.4s, v6.4s, v7.4s + eor v17.16b, v27.16b, 
v17.16b + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v1.4s + add v20.4s, v20.4s, v28.4s + ldur q28, [x29, #-192] + add v6.4s, v6.4s, v0.4s + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v25.16b, v19.16b + add v2.4s, v2.4s, v23.4s + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + eor v18.16b, v18.16b, v20.16b + ushr v25.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + tbl v18.16b, { v18.16b }, v9.16b + zip2 v26.2d, v26.2d, v31.2d + orr v4.16b, v4.16b, v25.16b + add v25.4s, v14.4s, v24.4s + ldr q14, [sp, #176] + add v3.4s, v3.4s, v21.4s + add v5.4s, v5.4s, v13.4s + eor v1.16b, v25.16b, v1.16b + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + ushr v8.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v12.4s, v0.4s, #7 + add v5.4s, v5.4s, v4.4s + shl v0.4s, v0.4s, #25 + add v19.4s, v19.4s, v26.4s + eor v17.16b, v27.16b, v17.16b + orr v1.16b, v1.16b, v8.16b + eor v23.16b, v23.16b, v5.16b orr v0.16b, v0.16b, v12.16b - add v10.4s, v10.4s, v3.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v22.4s, v22.4s, v28.4s + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v19.4s, v19.4s, v1.4s + tbl v23.16b, { v23.16b }, v9.16b + add v5.4s, v5.4s, v14.4s + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v19.4s, v19.4s, v10.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v9.16b + add v2.4s, v2.4s, v23.4s + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v21.16b, { v21.16b }, v16.16b + eor v4.16b, v2.16b, v4.16b + eor v18.16b, v18.16b, v19.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v25.4s, v25.4s, v24.4s + tbl v18.16b, { v18.16b }, v16.16b + add v6.4s, v6.4s, v11.4s + orr v4.16b, v4.16b, v8.16b + eor v1.16b, v25.16b, v1.16b + add v25.4s, v25.4s, v21.4s + mov v31.16b, v26.16b + ushr v8.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v6.4s, v6.4s, 
v4.4s + eor v17.16b, v17.16b, v25.16b + add v2.4s, v2.4s, v18.4s + orr v1.16b, v1.16b, v8.16b + eor v24.16b, v24.16b, v6.16b + ushr v8.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v6.4s, v6.4s, v30.4s + ldp q26, q30, [x29, #-224] + add v20.4s, v20.4s, v26.4s + eor v0.16b, v2.16b, v0.16b + add v5.4s, v5.4s, v28.4s + ldr q28, [sp, #240] + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v20.4s, v20.4s, v1.4s + tbl v24.16b, { v24.16b }, v16.16b + add v19.4s, v19.4s, v30.4s + ldur q30, [x29, #-240] + add v5.4s, v5.4s, v17.4s + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + add v27.4s, v27.4s, v24.4s + eor v18.16b, v18.16b, v19.16b + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v27.16b, v4.16b + ushr v12.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v3.4s, v3.4s, v23.4s + add v25.4s, v25.4s, v21.4s + orr v4.16b, v4.16b, v12.16b + eor v1.16b, v3.16b, v1.16b + eor v17.16b, v25.16b, v17.16b + add v2.4s, v2.4s, v18.4s ushr v12.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v13.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v8.4s + ushr v8.4s, v17.4s, #7 + add v6.4s, v6.4s, v4.4s + shl v17.4s, v17.4s, #25 + add v20.4s, v20.4s, v31.4s + eor v0.16b, v2.16b, v0.16b orr v1.16b, v1.16b, v12.16b - add v17.4s, v17.4s, v14.4s - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v18.16b - tbl v25.16b, { v25.16b }, v18.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #7 + eor v24.16b, v24.16b, 
v6.16b + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v22.4s, v22.4s, v23.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v17.4s, v17.4s, v11.4s - mov v30.16b, v28.16b - mov v28.16b, v23.16b - ldr q23, [sp, #304] + add v20.4s, v20.4s, v1.4s + tbl v24.16b, { v24.16b }, v9.16b + add v6.4s, v6.4s, v22.4s + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v20.4s, v20.4s, v28.4s + ldr q28, [sp, #160] + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v9.16b + add v27.4s, v27.4s, v24.4s + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v21.16b, { v21.16b }, v16.16b + eor v4.16b, v27.16b, v4.16b + eor v18.16b, v18.16b, v20.16b + ushr v12.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v3.4s, v3.4s, v23.4s + tbl v18.16b, { v18.16b }, v16.16b + add v5.4s, v5.4s, v30.4s + ldr q30, [sp, #192] + orr v4.16b, v4.16b, v12.16b + eor v1.16b, v3.16b, v1.16b + add v3.4s, v3.4s, v21.4s ushr v12.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v22.4s, v22.4s, v0.4s - mov v29.16b, v31.16b - ldr q31, [sp, #160] - orr v5.16b, v5.16b, v13.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v23.4s + add v5.4s, v5.4s, v4.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + mov v15.16b, v7.16b orr v1.16b, v1.16b, v12.16b - str q29, [sp, #272] - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v31.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v27.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v27.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v27.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v27.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s + eor v23.16b, 
v23.16b, v5.16b ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v5.16b, v19.16b - add v22.4s, v22.4s, v24.4s - ldr q24, [sp, #320] + add v5.4s, v5.4s, v7.4s + ldr q7, [sp, #224] + add v19.4s, v19.4s, v13.4s + eor v17.16b, v27.16b, v17.16b + add v6.4s, v6.4s, v30.4s orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v17.4s, v24.4s - ldr q24, [sp, #352] - ushr v13.4s, v1.4s, #12 + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v1.4s + add v20.4s, v20.4s, v7.4s + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v16.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v20.16b + add v2.4s, v2.4s, v23.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + add v3.4s, v3.4s, v21.4s + ushr v8.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v25.16b, v1.16b + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v24.4s - ldr q24, [sp, #336] - orr v1.16b, v1.16b, v13.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v14.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v18.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v18.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v18.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v18.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr 
v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v19.16b, v5.16b + add v19.4s, v19.4s, v29.4s + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + orr v1.16b, v1.16b, v8.16b orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - add v10.4s, v10.4s, v24.4s - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v22.4s, v22.4s, v29.4s - ushr v13.4s, v1.4s, #7 + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v5.4s, v5.4s, v11.4s + tbl v23.16b, { v23.16b }, v9.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v19.4s, v19.4s, v31.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v9.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + add v2.4s, v2.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v19.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v25.16b, v1.16b + add v25.4s, v25.4s, v21.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v8.4s - ldr q8, [sp, #288] - orr v1.16b, v1.16b, v13.16b - add v17.4s, v17.4s, v3.4s - ldr q3, [sp, #352] - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #12 + add v6.4s, v6.4s, v28.4s + 
eor v17.16b, v17.16b, v25.16b + add v2.4s, v2.4s, v18.4s + orr v1.16b, v1.16b, v8.16b + ushr v8.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v20.4s, v20.4s, v10.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b + add v5.4s, v5.4s, v26.4s + ldur q26, [x29, #-192] + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v10.4s, v10.4s, v30.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v22.4s, v22.4s, v8.4s - mov v24.16b, v30.16b - mov v30.16b, v15.16b - add v17.4s, v17.4s, v15.4s - ldr q15, [sp, #224] + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v19.4s, v19.4s, v14.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v16.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v19.16b + add v27.4s, v27.4s, v24.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + add v25.4s, v25.4s, v21.4s + ushr v12.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v3.16b, v1.16b + add v6.4s, v6.4s, v22.4s + eor v17.16b, v25.16b, v17.16b + add v2.4s, v2.4s, v18.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - str q30, [sp, #176] - orr v5.16b, v5.16b, v13.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v15.4s + ushr v8.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v20.4s, v20.4s, v29.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b orr v1.16b, v1.16b, v12.16b - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v18.16b - tbl v25.16b, { v25.16b }, v18.16b - eor v6.16b, 
v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #7 + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v22.4s, v22.4s, v9.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v17.4s, v17.4s, v14.4s + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v6.4s, v6.4s, v13.4s + tbl v24.16b, { v24.16b }, v9.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v20.4s, v20.4s, v7.4s + ldur q7, [x29, #-240] + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v9.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + add v27.4s, v27.4s, v24.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v20.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v12.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v3.16b, v1.16b + add v3.4s, v3.4s, v21.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v13.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v28.4s + add v5.4s, v5.4s, v26.4s + ldur q26, [x29, #-208] + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s orr v1.16b, v1.16b, v12.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v11.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v27.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v27.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { 
v25.16b }, v27.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v27.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v5.16b, v19.16b + add v19.4s, v19.4s, v15.4s + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + add v6.4s, v6.4s, v26.4s orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - add v22.4s, v22.4s, v29.4s - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v17.4s, v23.4s - ushr v13.4s, v1.4s, #12 + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v20.4s, v20.4s, v7.4s + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v16.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v20.16b + add v2.4s, v2.4s, v23.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + add v3.4s, v3.4s, v21.4s + mov v11.16b, v10.16b + ldr q10, [sp, #240] + ushr v8.4s, v4.4s, #12 + str q11, [sp, #112] + shl v4.4s, v4.4s, #20 + eor v1.16b, v25.16b, v1.16b + add v5.4s, v5.4s, v30.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v31.4s - orr v1.16b, v1.16b, v13.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v30.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v18.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v18.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl 
v25.16b, { v25.16b }, v18.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v18.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v19.16b, v5.16b - add v10.4s, v10.4s, v3.4s - ldr q3, [sp, #192] + add v19.4s, v19.4s, v10.4s + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + orr v1.16b, v1.16b, v8.16b orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v22.4s, v22.4s, v3.4s - ushr v13.4s, v1.4s, #7 + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v5.4s, v5.4s, v28.4s + ldr q28, [sp, #208] + tbl v23.16b, { v23.16b }, v9.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v19.4s, v19.4s, v29.4s + ldr q29, [sp, #176] + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v9.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + add v2.4s, v2.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v19.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v25.16b, v1.16b + add v25.4s, v25.4s, v21.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v15.4s - ldr q15, [sp, #128] - orr v1.16b, v1.16b, v13.16b - add v17.4s, v17.4s, v24.4s - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl 
v16.16b, { v16.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #12 + add v6.4s, v6.4s, v22.4s + eor v17.16b, v17.16b, v25.16b + add v2.4s, v2.4s, v18.4s + orr v1.16b, v1.16b, v8.16b + ushr v8.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v20.4s, v20.4s, v31.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b + add v5.4s, v5.4s, v11.4s + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v21.16b, v5.16b - ldp q23, q11, [sp, #320] - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v10.4s, v10.4s, v8.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v22.4s, v22.4s, v23.4s + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v19.4s, v19.4s, v28.4s + ldur q28, [x29, #-224] + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v16.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v19.16b + add v27.4s, v27.4s, v24.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + add v25.4s, v25.4s, v21.4s + ushr v12.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v3.16b, v1.16b + add v6.4s, v6.4s, v13.4s + eor v17.16b, v25.16b, v17.16b + add v2.4s, v2.4s, v18.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - mov v28.16b, v31.16b - mov v31.16b, v8.16b - ldr q8, [sp, #208] - orr v5.16b, v5.16b, v13.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v11.4s + ushr v8.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v20.4s, v20.4s, v10.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b orr v1.16b, v1.16b, v12.16b - add v17.4s, v17.4s, v8.4s - eor 
v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v18.16b - tbl v25.16b, { v25.16b }, v18.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #7 + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v22.4s, v22.4s, v29.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v17.4s, v17.4s, v30.4s + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v6.4s, v6.4s, v15.4s + tbl v24.16b, { v24.16b }, v9.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v20.4s, v20.4s, v7.4s + ldr q7, [sp, #192] + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v9.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + add v27.4s, v27.4s, v24.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v20.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v12.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v3.16b, v1.16b + add v3.4s, v3.4s, v21.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v13.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v9.4s + add v5.4s, v5.4s, v28.4s + ldr q28, [sp, #224] + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + mov v14.16b, v31.16b orr v1.16b, v1.16b, v12.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v14.4s - ldr q14, [sp, #256] - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - 
tbl v16.16b, { v16.16b }, v27.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v27.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v27.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v27.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s + str q14, [sp, #128] + mov v31.16b, v15.16b + ldur q15, [x29, #-192] ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v5.16b, v19.16b + add v19.4s, v19.4s, v30.4s + ldr q30, [sp, #144] + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + add v6.4s, v6.4s, v29.4s orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - add v22.4s, v22.4s, v3.4s - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v17.4s, v15.4s - ushr v13.4s, v1.4s, #12 + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v20.4s, v20.4s, v15.4s + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v16.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v20.16b + add v2.4s, v2.4s, v23.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + add v3.4s, v3.4s, v21.4s + ushr v8.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v25.16b, v1.16b + add v5.4s, v5.4s, v26.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v14.4s - orr v1.16b, v1.16b, v13.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v8.4s - eor v6.16b, v6.16b, 
v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v18.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v18.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v18.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v18.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v19.16b, v5.16b + add v19.4s, v19.4s, v28.4s + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + orr v1.16b, v1.16b, v8.16b orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - add v10.4s, v10.4s, v28.4s - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v22.4s, v22.4s, v24.4s - ushr v13.4s, v1.4s, #7 + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v5.4s, v5.4s, v22.4s + tbl v23.16b, { v23.16b }, v9.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v19.4s, v19.4s, v10.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v9.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + add v2.4s, v2.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v19.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v25.16b, v1.16b + add v25.4s, v25.4s, v21.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v11.4s - ldr q11, [sp, #304] - orr v1.16b, v1.16b, v13.16b - add v17.4s, v17.4s, v31.4s - ldr q31, [sp, #224] - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl 
v4.16b, { v4.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #12 + add v6.4s, v6.4s, v13.4s + eor v17.16b, v17.16b, v25.16b + add v2.4s, v2.4s, v18.4s + orr v1.16b, v1.16b, v8.16b + ushr v8.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v5.4s, v5.4s, v14.4s + mov v13.16b, v14.16b + ldr q14, [sp, #160] + add v20.4s, v20.4s, v30.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v10.4s, v10.4s, v23.4s - ldr q23, [sp, #240] - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v22.4s, v22.4s, v11.4s - mov v30.16b, v8.16b - mov v8.16b, v24.16b - ldr q24, [sp, #352] + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v19.4s, v19.4s, v14.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v16.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v19.16b + add v27.4s, v27.4s, v24.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + add v25.4s, v25.4s, v21.4s + ushr v12.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v3.16b, v1.16b + add v6.4s, v6.4s, v31.4s + eor v17.16b, v25.16b, v17.16b + add v2.4s, v2.4s, v18.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v13.16b - str q8, [sp, 
#112] - add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v24.4s + ushr v8.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v20.4s, v20.4s, v28.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b orr v1.16b, v1.16b, v12.16b - add v17.4s, v17.4s, v31.4s - eor v4.16b, v4.16b, v10.16b - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v18.16b - tbl v25.16b, { v25.16b }, v18.16b - eor v6.16b, v6.16b, v20.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - mov v29.16b, v3.16b - add v19.4s, v19.4s, v16.4s - ushr v12.4s, v0.4s, #7 + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v21.16b, v5.16b - eor v1.16b, v19.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - add v22.4s, v22.4s, v29.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v17.4s, v17.4s, v30.4s - ldr q30, [sp, #272] + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v6.4s, v6.4s, v7.4s + tbl v24.16b, { v24.16b }, v9.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v20.4s, v20.4s, v15.4s + ldr q15, [sp, #80] + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v9.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + add v27.4s, v27.4s, v24.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v20.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v12.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v3.16b, v1.16b + add v5.4s, v5.4s, v11.4s + add v3.4s, v3.4s, v21.4s + ldp q22, q11, [x29, #-240] + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v22.4s, v22.4s, v0.4s - mov v3.16b, v28.16b - 
ldr q28, [sp, #176] - orr v5.16b, v5.16b, v13.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v30.4s + add v19.4s, v19.4s, v26.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + ldr q26, [sp, #208] orr v1.16b, v1.16b, v12.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v28.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v27.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v27.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v27.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v27.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v5.16b, v19.16b + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + add v6.4s, v6.4s, v26.4s orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - add v22.4s, v22.4s, v8.4s - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v17.4s, v9.4s - ldr q9, [sp, #320] - ushr v13.4s, v1.4s, #12 + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v20.4s, v20.4s, v11.4s + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v16.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v20.16b + add v2.4s, v2.4s, v23.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + add v3.4s, v3.4s, v21.4s + ushr v8.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v25.16b, v1.16b + add v5.4s, v5.4s, v29.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, 
v1.4s, #12 shl v1.4s, v1.4s, #20 - add v22.4s, v22.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - add v17.4s, v17.4s, v2.4s - add v10.4s, v10.4s, v23.4s - orr v1.16b, v1.16b, v13.16b - eor v16.16b, v16.16b, v22.16b - add v20.4s, v20.4s, v31.4s - eor v6.16b, v6.16b, v17.16b - add v10.4s, v10.4s, v5.4s - tbl v16.16b, { v16.16b }, v18.16b - add v20.4s, v20.4s, v1.4s - tbl v6.16b, { v6.16b }, v18.16b - eor v25.16b, v25.16b, v10.16b - add v21.4s, v21.4s, v16.4s - eor v4.16b, v4.16b, v20.16b - add v26.4s, v26.4s, v6.4s - tbl v25.16b, { v25.16b }, v18.16b - eor v0.16b, v21.16b, v0.16b - tbl v4.16b, { v4.16b }, v18.16b - eor v2.16b, v26.16b, v2.16b - add v19.4s, v19.4s, v25.4s ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - add v7.4s, v7.4s, v4.4s - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v5.16b, v19.16b, v5.16b - add v10.4s, v10.4s, v14.4s - ldr q14, [sp, #288] + add v19.4s, v19.4s, v22.4s + add v5.4s, v5.4s, v4.4s + eor v17.16b, v27.16b, v17.16b + orr v1.16b, v1.16b, v8.16b orr v0.16b, v0.16b, v12.16b - eor v1.16b, v7.16b, v1.16b - orr v2.16b, v2.16b, v13.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v22.4s, v22.4s, v14.4s - ushr v13.4s, v1.4s, #7 + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b + add v5.4s, v5.4s, v15.4s + tbl v23.16b, { v23.16b }, v9.16b + orr v17.16b, v17.16b, v12.16b + eor v24.16b, v24.16b, v19.16b + add v19.4s, v19.4s, v28.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v9.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + add v2.4s, v2.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v19.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v25.16b, v1.16b + add v25.4s, v25.4s, v21.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v12.16b - 
add v22.4s, v22.4s, v2.4s - add v20.4s, v20.4s, v24.4s - orr v1.16b, v1.16b, v13.16b - eor v4.16b, v4.16b, v10.16b - add v17.4s, v17.4s, v9.4s - eor v25.16b, v25.16b, v22.16b - add v20.4s, v20.4s, v5.4s - tbl v4.16b, { v4.16b }, v27.16b - add v17.4s, v17.4s, v1.4s - tbl v25.16b, { v25.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - add v26.4s, v26.4s, v4.4s - eor v16.16b, v16.16b, v17.16b - add v7.4s, v7.4s, v25.4s - tbl v6.16b, { v6.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - tbl v16.16b, { v16.16b }, v27.16b - eor v2.16b, v7.16b, v2.16b - add v21.4s, v21.4s, v6.4s - ushr v12.4s, v0.4s, #12 + add v6.4s, v6.4s, v31.4s + eor v17.16b, v17.16b, v25.16b + add v2.4s, v2.4s, v18.4s + orr v1.16b, v1.16b, v8.16b + add v20.4s, v20.4s, v10.4s + ldr q10, [sp, #96] + ushr v8.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b + add v5.4s, v5.4s, v30.4s + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - add v19.4s, v19.4s, v16.4s - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v5.16b, v21.16b, v5.16b - orr v0.16b, v0.16b, v12.16b - eor v1.16b, v19.16b, v1.16b - add v10.4s, v10.4s, v11.4s - orr v2.16b, v2.16b, v13.16b - ushr v13.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v19.4s, v19.4s, v10.4s + add v5.4s, v5.4s, v17.4s + tbl v24.16b, { v24.16b }, v16.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v18.16b, v18.16b, v19.16b + add v27.4s, v27.4s, v24.4s + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + add v25.4s, v25.4s, v21.4s + ushr v12.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v3.16b, v1.16b + add v6.4s, v6.4s, v7.4s + eor v17.16b, v25.16b, v17.16b + add v2.4s, v2.4s, v18.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #12 
shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - add v22.4s, v22.4s, v15.4s - orr v5.16b, v5.16b, v13.16b - add v20.4s, v20.4s, v3.4s - mov v24.16b, v3.16b - ldr q3, [sp, #336] + ushr v8.4s, v17.4s, #7 + mov v30.16b, v7.16b + shl v17.4s, v17.4s, #25 + ldur q7, [x29, #-208] + add v20.4s, v20.4s, v22.4s + add v6.4s, v6.4s, v4.4s + eor v0.16b, v2.16b, v0.16b orr v1.16b, v1.16b, v12.16b - eor v4.16b, v4.16b, v10.16b - add v22.4s, v22.4s, v2.4s - add v17.4s, v17.4s, v3.4s - add v20.4s, v20.4s, v5.4s - tbl v4.16b, { v4.16b }, v18.16b - eor v25.16b, v25.16b, v22.16b - add v17.4s, v17.4s, v1.4s - eor v6.16b, v6.16b, v20.16b - add v26.4s, v26.4s, v4.4s - tbl v25.16b, { v25.16b }, v18.16b - eor v16.16b, v16.16b, v17.16b - tbl v6.16b, { v6.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - add v7.4s, v7.4s, v25.4s - tbl v16.16b, { v16.16b }, v18.16b - add v21.4s, v21.4s, v6.4s - ushr v12.4s, v0.4s, #7 + orr v17.16b, v17.16b, v8.16b + ushr v8.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v2.16b, v7.16b, v2.16b - add v19.4s, v19.4s, v16.4s - eor v5.16b, v21.16b, v5.16b - orr v0.16b, v0.16b, v12.16b - ushr v12.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v1.16b, v19.16b, v1.16b - ushr v13.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v22.4s, v22.4s, v8.4s - orr v2.16b, v2.16b, v12.16b + add v20.4s, v20.4s, v1.4s + eor v24.16b, v24.16b, v6.16b + add v6.4s, v6.4s, v7.4s + tbl v24.16b, { v24.16b }, v9.16b + orr v0.16b, v0.16b, v8.16b + eor v23.16b, v23.16b, v20.16b + add v20.4s, v20.4s, v11.4s + add v6.4s, v6.4s, v0.4s + tbl v23.16b, { v23.16b }, v9.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + add v27.4s, v27.4s, v24.4s + tbl v21.16b, { v21.16b }, v16.16b + eor v18.16b, v18.16b, v20.16b + eor v4.16b, v27.16b, v4.16b + add v3.4s, v3.4s, v23.4s + tbl v18.16b, { v18.16b }, v16.16b + ushr v12.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + eor v1.16b, v3.16b, v1.16b + add v3.4s, v3.4s, v21.4s + orr v4.16b, v4.16b, v12.16b ushr v12.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - orr v5.16b, 
v5.16b, v13.16b - add v22.4s, v22.4s, v0.4s - add v10.4s, v10.4s, v29.4s - ldr q29, [sp, #208] - add v17.4s, v17.4s, v31.4s + add v5.4s, v5.4s, v13.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + mov v28.16b, v22.16b + ldr q22, [sp, #112] orr v1.16b, v1.16b, v12.16b - add v20.4s, v20.4s, v29.4s - eor v16.16b, v16.16b, v22.16b - add v10.4s, v10.4s, v5.4s - add v17.4s, v17.4s, v2.4s - add v20.4s, v20.4s, v1.4s - tbl v16.16b, { v16.16b }, v27.16b - eor v25.16b, v25.16b, v10.16b - eor v6.16b, v6.16b, v17.16b - eor v4.16b, v4.16b, v20.16b - add v21.4s, v21.4s, v16.4s - tbl v25.16b, { v25.16b }, v27.16b - tbl v6.16b, { v6.16b }, v27.16b - tbl v4.16b, { v4.16b }, v27.16b - eor v0.16b, v21.16b, v0.16b - add v19.4s, v19.4s, v25.4s - add v26.4s, v26.4s, v6.4s - add v7.4s, v7.4s, v4.4s - ushr v12.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v5.16b, v5.16b, v19.16b - eor v2.16b, v26.16b, v2.16b - eor v1.16b, v7.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - ushr v12.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v22.4s, v22.4s, v14.4s - mov v8.16b, v31.16b - ushr v13.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - mov v31.16b, v14.16b - ushr v14.4s, v1.4s, #12 - shl v1.4s, v1.4s, #20 - orr v5.16b, v5.16b, v12.16b - add v22.4s, v22.4s, v0.4s - add v10.4s, v10.4s, v28.4s - ldr q28, [sp, #352] - orr v2.16b, v2.16b, v13.16b - orr v1.16b, v1.16b, v14.16b - add v17.4s, v17.4s, v30.4s - add v20.4s, v20.4s, v3.4s - eor v16.16b, v16.16b, v22.16b - add v10.4s, v10.4s, v5.4s - add v17.4s, v17.4s, v2.4s - add v20.4s, v20.4s, v1.4s - tbl v16.16b, { v16.16b }, v18.16b - eor v25.16b, v25.16b, v10.16b - eor v6.16b, v6.16b, v17.16b - eor v4.16b, v4.16b, v20.16b - add v21.4s, v21.4s, v16.4s - tbl v25.16b, { v25.16b }, v18.16b - tbl v6.16b, { v6.16b }, v18.16b - tbl v4.16b, { v4.16b }, v18.16b - eor v0.16b, v21.16b, v0.16b - add v19.4s, v19.4s, v25.4s - add v26.4s, v26.4s, v6.4s - add v7.4s, v7.4s, v4.4s - ushr v12.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v5.16b, v19.16b, v5.16b - eor 
v2.16b, v26.16b, v2.16b - eor v1.16b, v7.16b, v1.16b - orr v0.16b, v0.16b, v12.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - add v10.4s, v10.4s, v23.4s - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ushr v14.4s, v1.4s, #7 - shl v1.4s, v1.4s, #25 - orr v5.16b, v5.16b, v12.16b - add v10.4s, v10.4s, v0.4s - add v20.4s, v20.4s, v24.4s - ldr q24, [sp, #144] - orr v2.16b, v2.16b, v13.16b - orr v1.16b, v1.16b, v14.16b - add v22.4s, v22.4s, v9.4s - add v17.4s, v17.4s, v11.4s - eor v4.16b, v4.16b, v10.16b - add v20.4s, v20.4s, v5.4s - add v22.4s, v22.4s, v2.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v27.16b - eor v6.16b, v6.16b, v20.16b - eor v25.16b, v25.16b, v22.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - tbl v6.16b, { v6.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - eor v0.16b, v26.16b, v0.16b - add v21.4s, v21.4s, v6.4s - add v7.4s, v7.4s, v25.4s - add v19.4s, v19.4s, v16.4s + add v19.4s, v19.4s, v29.4s + add v5.4s, v5.4s, v4.4s ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v5.16b, v21.16b, v5.16b - eor v2.16b, v7.16b, v2.16b - eor v1.16b, v19.16b, v1.16b + eor v17.16b, v27.16b, v17.16b + add v6.4s, v6.4s, v14.4s + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b orr v0.16b, v0.16b, v12.16b - add v10.4s, v10.4s, v15.4s - ushr v14.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - mov v30.16b, v3.16b - ldr q3, [sp, #256] - ushr v12.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ushr v13.4s, v1.4s, #12 + ushr v12.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + tbl v23.16b, { v23.16b }, v16.16b + add v20.4s, v20.4s, v22.4s + add v6.4s, v6.4s, v0.4s + eor v24.16b, v24.16b, v19.16b + orr v17.16b, v17.16b, v12.16b + tbl v24.16b, { v24.16b }, v16.16b + add v20.4s, v20.4s, v17.4s + eor v21.16b, v21.16b, v6.16b + tbl v21.16b, { v21.16b }, v9.16b + add v2.4s, v2.4s, v23.4s + eor v18.16b, v18.16b, v20.16b + tbl v18.16b, { v18.16b }, v9.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, v24.4s 
+ mov v13.16b, v11.16b + ushr v8.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + mov v11.16b, v26.16b + add v5.4s, v5.4s, v26.4s + ldur q26, [x29, #-192] + add v3.4s, v3.4s, v21.4s + eor v1.16b, v25.16b, v1.16b + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - add v10.4s, v10.4s, v0.4s - orr v5.16b, v5.16b, v14.16b - add v20.4s, v20.4s, v3.4s - orr v2.16b, v2.16b, v12.16b - orr v1.16b, v1.16b, v13.16b - add v22.4s, v22.4s, v24.4s - add v17.4s, v17.4s, v28.4s - eor v4.16b, v4.16b, v10.16b - add v20.4s, v20.4s, v5.4s - add v22.4s, v22.4s, v2.4s - add v17.4s, v17.4s, v1.4s - tbl v4.16b, { v4.16b }, v18.16b - eor v6.16b, v6.16b, v20.16b - eor v25.16b, v25.16b, v22.16b - eor v16.16b, v16.16b, v17.16b - add v26.4s, v26.4s, v4.4s - tbl v6.16b, { v6.16b }, v18.16b - tbl v25.16b, { v25.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - eor v0.16b, v26.16b, v0.16b - add v21.4s, v21.4s, v6.4s - add v7.4s, v7.4s, v25.4s - add v19.4s, v19.4s, v16.4s + eor v0.16b, v3.16b, v0.16b + add v27.4s, v27.4s, v18.4s + add v19.4s, v19.4s, v26.4s + add v5.4s, v5.4s, v4.4s ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v5.16b, v21.16b, v5.16b - eor v2.16b, v7.16b, v2.16b - eor v1.16b, v19.16b, v1.16b + orr v1.16b, v1.16b, v8.16b + eor v17.16b, v27.16b, v17.16b + add v19.4s, v19.4s, v1.4s + eor v23.16b, v23.16b, v5.16b orr v0.16b, v0.16b, v12.16b - ushr v12.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - mov v23.16b, v9.16b - ldr q9, [sp, #112] - ushr v13.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ushr v14.4s, v1.4s, #7 + ushr v12.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + tbl v23.16b, { v23.16b }, v9.16b + add v5.4s, v5.4s, v31.4s + eor v24.16b, v24.16b, v19.16b + orr v17.16b, v17.16b, v12.16b + tbl v24.16b, { v24.16b }, v9.16b + add v19.4s, v19.4s, v28.4s + ldr q28, [sp, #224] + add v5.4s, v5.4s, v17.4s + add v2.4s, v2.4s, v23.4s + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + tbl v21.16b, { v21.16b }, v16.16b + eor v4.16b, v2.16b, v4.16b + add v25.4s, v25.4s, 
v24.4s + eor v18.16b, v18.16b, v19.16b + ushr v8.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + tbl v18.16b, { v18.16b }, v16.16b + eor v1.16b, v25.16b, v1.16b + add v6.4s, v6.4s, v30.4s + orr v4.16b, v4.16b, v8.16b + ushr v8.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - orr v5.16b, v5.16b, v12.16b - add v9.4s, v10.4s, v9.4s - orr v2.16b, v2.16b, v13.16b - orr v1.16b, v1.16b, v14.16b - ldr q14, [sp, #64] - add v22.4s, v22.4s, v31.4s - add v17.4s, v17.4s, v30.4s - add v20.4s, v20.4s, v8.4s - add v9.4s, v9.4s, v5.4s - add v22.4s, v22.4s, v0.4s - add v17.4s, v17.4s, v2.4s + add v25.4s, v25.4s, v21.4s + add v20.4s, v20.4s, v28.4s + add v6.4s, v6.4s, v4.4s + orr v1.16b, v1.16b, v8.16b + add v2.4s, v2.4s, v18.4s + eor v17.16b, v17.16b, v25.16b add v20.4s, v20.4s, v1.4s - eor v25.16b, v25.16b, v9.16b - eor v16.16b, v16.16b, v22.16b - eor v6.16b, v6.16b, v17.16b - eor v4.16b, v4.16b, v20.16b - tbl v25.16b, { v25.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - tbl v6.16b, { v6.16b }, v27.16b - tbl v4.16b, { v4.16b }, v27.16b - add v19.4s, v19.4s, v25.4s - add v21.4s, v21.4s, v16.4s - add v26.4s, v26.4s, v6.4s - add v7.4s, v7.4s, v4.4s - eor v5.16b, v5.16b, v19.16b - eor v0.16b, v21.16b, v0.16b - eor v2.16b, v26.16b, v2.16b - eor v1.16b, v7.16b, v1.16b - ushr v30.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ushr v10.4s, v0.4s, #12 + eor v24.16b, v24.16b, v6.16b + ushr v28.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + tbl v24.16b, { v24.16b }, v16.16b + eor v0.16b, v2.16b, v0.16b + eor v23.16b, v23.16b, v20.16b + orr v17.16b, v17.16b, v28.16b + ushr v28.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v12.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ushr v13.4s, v1.4s, #12 + tbl v23.16b, { v23.16b }, v16.16b + orr v0.16b, v0.16b, v28.16b + ldr q28, [sp, #240] + add v27.4s, v27.4s, v24.4s + add v6.4s, v6.4s, v7.4s + ldr q7, [sp, #144] + add v3.4s, v3.4s, v23.4s + eor v4.16b, v27.16b, v4.16b + add v5.4s, v5.4s, v28.4s + ushr v31.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v1.16b, v3.16b, v1.16b + add 
v19.4s, v19.4s, v15.4s + add v5.4s, v5.4s, v17.4s + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - orr v5.16b, v5.16b, v30.16b - add v30.4s, v9.4s, v29.4s - add v22.4s, v22.4s, v23.4s - ldr q23, [sp, #192] - orr v0.16b, v0.16b, v10.16b - orr v2.16b, v2.16b, v12.16b - orr v1.16b, v1.16b, v13.16b - add v17.4s, v17.4s, v23.4s - add v20.4s, v20.4s, v28.4s - add v23.4s, v30.4s, v5.4s - add v22.4s, v22.4s, v0.4s - add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v26.4s + add v6.4s, v6.4s, v4.4s + add v19.4s, v19.4s, v0.4s + eor v21.16b, v21.16b, v5.16b + orr v1.16b, v1.16b, v31.16b + tbl v21.16b, { v21.16b }, v9.16b add v20.4s, v20.4s, v1.4s - eor v25.16b, v25.16b, v23.16b - eor v16.16b, v16.16b, v22.16b - eor v6.16b, v6.16b, v17.16b - eor v4.16b, v4.16b, v20.16b - tbl v25.16b, { v25.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - tbl v6.16b, { v6.16b }, v18.16b - tbl v4.16b, { v4.16b }, v18.16b - add v19.4s, v19.4s, v25.4s - add v21.4s, v21.4s, v16.4s - add v26.4s, v26.4s, v6.4s - add v7.4s, v7.4s, v4.4s - eor v5.16b, v19.16b, v5.16b - eor v0.16b, v21.16b, v0.16b - eor v2.16b, v26.16b, v2.16b - eor v1.16b, v7.16b, v1.16b - ushr v28.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ushr v30.4s, v0.4s, #7 + eor v24.16b, v24.16b, v6.16b + eor v18.16b, v18.16b, v19.16b + tbl v24.16b, { v24.16b }, v9.16b + tbl v18.16b, { v18.16b }, v9.16b + eor v23.16b, v23.16b, v20.16b + tbl v23.16b, { v23.16b }, v9.16b + add v25.4s, v25.4s, v21.4s + add v26.4s, v27.4s, v24.4s + add v2.4s, v2.4s, v18.4s + eor v17.16b, v25.16b, v17.16b + ushr v27.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + add v3.4s, v3.4s, v23.4s + eor v4.16b, v26.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + ushr v28.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + orr v17.16b, v17.16b, v27.16b + ushr v27.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v31.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ushr v8.4s, v1.4s, #7 + eor v1.16b, v3.16b, v1.16b + add v5.4s, v5.4s, v7.4s + ldr q7, [sp, #128] + add v6.4s, v6.4s, 
v29.4s + prfm pldl1keep, [x25, #256] + orr v4.16b, v4.16b, v28.16b + prfm pldl1keep, [x27, #256] + ushr v28.4s, v1.4s, #7 + prfm pldl1keep, [x4, #256] shl v1.4s, v1.4s, #25 - orr v5.16b, v5.16b, v28.16b - ldr q28, [sp, #176] - orr v0.16b, v0.16b, v30.16b - orr v2.16b, v2.16b, v31.16b - orr v1.16b, v1.16b, v8.16b - add v23.4s, v23.4s, v28.4s - add v22.4s, v22.4s, v11.4s - add v17.4s, v17.4s, v15.4s - add v20.4s, v20.4s, v3.4s - ldr q3, [sp, #272] - add v23.4s, v23.4s, v0.4s - add v22.4s, v22.4s, v2.4s - add v17.4s, v17.4s, v1.4s - add v20.4s, v20.4s, v5.4s - eor v4.16b, v4.16b, v23.16b - eor v25.16b, v25.16b, v22.16b - eor v16.16b, v16.16b, v17.16b - eor v6.16b, v6.16b, v20.16b - tbl v4.16b, { v4.16b }, v27.16b - tbl v25.16b, { v25.16b }, v27.16b - tbl v16.16b, { v16.16b }, v27.16b - tbl v6.16b, { v6.16b }, v27.16b - add v26.4s, v26.4s, v4.4s - add v7.4s, v7.4s, v25.4s - add v19.4s, v19.4s, v16.4s - add v21.4s, v21.4s, v6.4s - eor v0.16b, v26.16b, v0.16b - eor v2.16b, v7.16b, v2.16b - eor v1.16b, v19.16b, v1.16b - eor v5.16b, v21.16b, v5.16b - add v3.4s, v22.4s, v3.4s - ldr q22, [sp, #160] - ushr v28.4s, v0.4s, #12 + prfm pldl1keep, [x6, #256] + orr v0.16b, v0.16b, v27.16b + add v19.4s, v19.4s, v11.4s + add v20.4s, v20.4s, v22.4s + add v5.4s, v5.4s, v4.4s + add v6.4s, v6.4s, v0.4s + orr v1.16b, v1.16b, v28.16b + add v19.4s, v19.4s, v1.4s + add v20.4s, v20.4s, v17.4s + eor v23.16b, v23.16b, v5.16b + eor v21.16b, v21.16b, v6.16b + tbl v23.16b, { v23.16b }, v16.16b + tbl v21.16b, { v21.16b }, v16.16b + eor v24.16b, v24.16b, v19.16b + eor v18.16b, v18.16b, v20.16b + tbl v24.16b, { v24.16b }, v16.16b + tbl v16.16b, { v18.16b }, v16.16b + add v2.4s, v2.4s, v23.4s + add v3.4s, v3.4s, v21.4s + add v18.4s, v25.4s, v24.4s + add v25.4s, v26.4s, v16.4s + eor v4.16b, v2.16b, v4.16b + eor v0.16b, v3.16b, v0.16b + ushr v26.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + ushr v27.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - ushr v29.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ushr v30.4s, v1.4s, 
#12 + eor v1.16b, v18.16b, v1.16b + eor v17.16b, v25.16b, v17.16b + add v5.4s, v5.4s, v14.4s + add v6.4s, v6.4s, v10.4s + orr v4.16b, v4.16b, v26.16b + orr v0.16b, v0.16b, v27.16b + ushr v26.4s, v1.4s, #12 shl v1.4s, v1.4s, #20 - ushr v31.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - add v17.4s, v17.4s, v22.4s - ldr q22, [sp, #240] - orr v0.16b, v0.16b, v28.16b - prfm pldl1keep, [x23, #256] - orr v2.16b, v2.16b, v29.16b - prfm pldl1keep, [x24, #256] - orr v1.16b, v1.16b, v30.16b - prfm pldl1keep, [x22, #256] - orr v5.16b, v5.16b, v31.16b - prfm pldl1keep, [x25, #256] - add v23.4s, v23.4s, v24.4s - add v20.4s, v20.4s, v22.4s - add v3.4s, v3.4s, v2.4s - add v17.4s, v17.4s, v1.4s - add v22.4s, v23.4s, v0.4s - add v20.4s, v20.4s, v5.4s - eor v23.16b, v25.16b, v3.16b - eor v16.16b, v16.16b, v17.16b - eor v4.16b, v4.16b, v22.16b - eor v6.16b, v6.16b, v20.16b - tbl v23.16b, { v23.16b }, v18.16b - tbl v16.16b, { v16.16b }, v18.16b - tbl v4.16b, { v4.16b }, v18.16b - tbl v6.16b, { v6.16b }, v18.16b - add v7.4s, v7.4s, v23.4s - add v19.4s, v19.4s, v16.4s - add v18.4s, v26.4s, v4.4s - add v21.4s, v21.4s, v6.4s - eor v2.16b, v7.16b, v2.16b - eor v1.16b, v19.16b, v1.16b - eor v0.16b, v18.16b, v0.16b - eor v5.16b, v21.16b, v5.16b - ushr v25.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ushr v24.4s, v0.4s, #7 + ushr v27.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v19.4s, v19.4s, v13.4s + add v20.4s, v20.4s, v7.4s + add v5.4s, v5.4s, v4.4s + add v6.4s, v6.4s, v0.4s + orr v1.16b, v1.16b, v26.16b + orr v17.16b, v17.16b, v27.16b + add v19.4s, v19.4s, v1.4s + add v20.4s, v20.4s, v17.4s + eor v22.16b, v23.16b, v5.16b + eor v21.16b, v21.16b, v6.16b + tbl v26.16b, { v22.16b }, v9.16b + tbl v21.16b, { v21.16b }, v9.16b + eor v22.16b, v24.16b, v19.16b + eor v16.16b, v16.16b, v20.16b + tbl v23.16b, { v22.16b }, v9.16b + tbl v24.16b, { v16.16b }, v9.16b + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v21.4s + add v7.4s, v18.4s, v23.4s + add v16.4s, v25.4s, v24.4s + eor v4.16b, v2.16b, v4.16b + eor 
v0.16b, v3.16b, v0.16b + ushr v18.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ushr v22.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ushr v26.4s, v1.4s, #7 + eor v1.16b, v7.16b, v1.16b + eor v17.16b, v16.16b, v17.16b + orr v4.16b, v4.16b, v18.16b + orr v0.16b, v0.16b, v22.16b + ushr v18.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - ushr v27.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v0.16b, v0.16b, v24.16b - orr v2.16b, v2.16b, v25.16b - orr v1.16b, v1.16b, v26.16b - orr v5.16b, v5.16b, v27.16b - movi v13.4s, #64 - eor v29.16b, v19.16b, v22.16b - eor v8.16b, v21.16b, v3.16b - eor v30.16b, v17.16b, v18.16b - eor v31.16b, v20.16b, v7.16b - eor v24.16b, v5.16b, v23.16b - eor v18.16b, v0.16b, v16.16b - eor v25.16b, v2.16b, v6.16b - eor v26.16b, v1.16b, v4.16b - cbnz x21, .LBB3_5 + ushr v22.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v1.16b, v1.16b, v18.16b + orr v17.16b, v17.16b, v22.16b + eor v27.16b, v7.16b, v5.16b + eor v29.16b, v16.16b, v6.16b + eor v16.16b, v19.16b, v2.16b + eor v28.16b, v20.16b, v3.16b + eor v22.16b, v17.16b, v21.16b + eor v7.16b, v4.16b, v23.16b + eor v11.16b, v0.16b, v24.16b + eor v13.16b, v1.16b, v26.16b + cbnz x3, .LBB3_5 + ldr x3, [sp, #40] b .LBB3_2 -.LBB3_6: - cbz x1, .LBB3_14 - adrp x12, .LCPI3_3 - ldr q0, [x11, :lo12:.LCPI3_1] - orr w11, w7, w6 - ldr q2, [x10, :lo12:.LCPI3_2] - ldr q1, [x12, :lo12:.LCPI3_3] - and x12, x5, #0x1 -.LBB3_8: - movi v3.4s, #64 - lsr x13, x4, #32 - ldp q5, q4, [x3] - mov x15, x2 - mov w14, w11 - mov v3.s[0], w4 - ldr x10, [x0] - mov v3.s[1], w13 - b .LBB3_11 +.LBB3_7: + cbz x23, .LBB3_18 + orr w9, w7, w19 + orr w22, w8, w19 + and x28, x5, #0x1 + stur w9, [x29, #-192] + orr w9, w9, w8 + stur w9, [x29, #-208] + b .LBB3_10 .LBB3_9: - orr w14, w14, w9 + ldp q1, q0, [x29, #-176] + add x20, x20, x28 + add x24, x24, #8 + subs x23, x23, #1 + ldr x3, [sp, #40] + stp q1, q0, [x26], #32 + b.eq .LBB3_18 .LBB3_10: - ldp q6, q7, [x10] - mov v16.16b, v3.16b - and w14, w14, #0xff - add v5.4s, v5.4s, v4.4s - mov x15, x13 - mov v16.s[3], w14 - 
add x14, x10, #32 - uzp1 v17.4s, v6.4s, v7.4s - add x10, x10, #64 - add v5.4s, v5.4s, v17.4s - eor v16.16b, v5.16b, v16.16b - tbl v16.16b, { v16.16b }, v0.16b - add v18.4s, v16.4s, v1.4s - eor v19.16b, v18.16b, v4.16b - uzp2 v4.4s, v6.4s, v7.4s - ushr v6.4s, v19.4s, #12 - shl v7.4s, v19.4s, #20 - ld2 { v19.4s, v20.4s }, [x14] - add v5.4s, v5.4s, v4.4s - mov w14, w6 - orr v6.16b, v7.16b, v6.16b - add v5.4s, v5.4s, v6.4s - eor v7.16b, v16.16b, v5.16b - add v5.4s, v5.4s, v19.4s - tbl v7.16b, { v7.16b }, v2.16b - ext v5.16b, v5.16b, v5.16b, #12 - add v16.4s, v18.4s, v7.4s - ext v7.16b, v7.16b, v7.16b, #8 - eor v6.16b, v6.16b, v16.16b - ext v16.16b, v16.16b, v16.16b, #4 - ushr v18.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - orr v6.16b, v6.16b, v18.16b - ext v18.16b, v20.16b, v20.16b, #12 - add v5.4s, v5.4s, v6.4s - eor v7.16b, v5.16b, v7.16b - add v5.4s, v5.4s, v18.4s - tbl v7.16b, { v7.16b }, v0.16b - add v16.4s, v16.4s, v7.4s - eor v6.16b, v6.16b, v16.16b - ushr v21.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - orr v6.16b, v6.16b, v21.16b - uzp1 v21.4s, v17.4s, v17.4s - add v5.4s, v5.4s, v6.4s - ext v21.16b, v21.16b, v17.16b, #8 - eor v7.16b, v7.16b, v5.16b - uzp2 v21.4s, v21.4s, v4.4s - tbl v7.16b, { v7.16b }, v2.16b - add v5.4s, v5.4s, v21.4s - add v16.4s, v16.4s, v7.4s - ext v5.16b, v5.16b, v5.16b, #4 - ext v7.16b, v7.16b, v7.16b, #8 - eor v6.16b, v6.16b, v16.16b - ushr v22.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - orr v6.16b, v6.16b, v22.16b - add v22.4s, v5.4s, v6.4s - eor v5.16b, v22.16b, v7.16b - ext v7.16b, v16.16b, v16.16b, #12 - tbl v16.16b, { v5.16b }, v0.16b - ext v5.16b, v17.16b, v17.16b, #12 - add v7.4s, v7.4s, v16.4s - ext v5.16b, v17.16b, v5.16b, #12 - ext v17.16b, v19.16b, v19.16b, #12 - mov v19.16b, v18.16b - eor v6.16b, v6.16b, v7.16b - rev64 v5.4s, v5.4s - mov v19.s[1], v17.s[2] - ushr v20.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - trn2 v5.4s, v5.4s, v19.4s - orr v6.16b, v6.16b, v20.16b - zip1 v20.2d, v18.2d, v4.2d - zip2 v4.4s, v4.4s, v18.4s - add v19.4s, 
v6.4s, v5.4s - mov v20.s[3], v17.s[3] - add v19.4s, v19.4s, v22.4s - ext v22.16b, v20.16b, v20.16b, #12 - eor v16.16b, v16.16b, v19.16b - ext v19.16b, v19.16b, v19.16b, #12 - tbl v16.16b, { v16.16b }, v2.16b - add v7.4s, v7.4s, v16.4s - ext v16.16b, v16.16b, v16.16b, #8 - eor v6.16b, v6.16b, v7.16b - ext v7.16b, v7.16b, v7.16b, #4 - ushr v23.4s, v6.4s, #7 - shl v24.4s, v6.4s, #25 - uzp1 v6.4s, v20.4s, v22.4s - orr v20.16b, v24.16b, v23.16b - add v22.4s, v20.4s, v6.4s - add v19.4s, v22.4s, v19.4s - eor v16.16b, v19.16b, v16.16b - tbl v16.16b, { v16.16b }, v0.16b - add v7.4s, v7.4s, v16.4s - eor v18.16b, v20.16b, v7.16b - zip1 v20.4s, v4.4s, v17.4s - zip1 v4.4s, v17.4s, v4.4s - ushr v17.4s, v18.4s, #12 - shl v18.4s, v18.4s, #20 - ext v20.16b, v4.16b, v20.16b, #8 - orr v4.16b, v18.16b, v17.16b - ext v18.16b, v21.16b, v21.16b, #4 - add v17.4s, v4.4s, v20.4s - add v17.4s, v17.4s, v19.4s - uzp1 v19.4s, v18.4s, v18.4s - eor v16.16b, v16.16b, v17.16b - ext v19.16b, v19.16b, v18.16b, #8 - tbl v16.16b, { v16.16b }, v2.16b - uzp2 v19.4s, v19.4s, v5.4s - add v7.4s, v7.4s, v16.4s - add v17.4s, v17.4s, v19.4s - ext v16.16b, v16.16b, v16.16b, #8 - eor v4.16b, v4.16b, v7.16b - ext v17.16b, v17.16b, v17.16b, #4 - ext v7.16b, v7.16b, v7.16b, #12 - ushr v21.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - orr v4.16b, v4.16b, v21.16b - ext v21.16b, v18.16b, v18.16b, #12 - add v17.4s, v17.4s, v4.4s - ext v18.16b, v18.16b, v21.16b, #12 - mov v21.16b, v20.16b - eor v16.16b, v17.16b, v16.16b - rev64 v18.4s, v18.4s - mov v21.s[1], v6.s[2] - tbl v16.16b, { v16.16b }, v0.16b - add v7.4s, v7.4s, v16.4s - eor v4.16b, v4.16b, v7.16b - ushr v22.4s, v4.4s, #12 - shl v23.4s, v4.4s, #20 - trn2 v4.4s, v18.4s, v21.4s - orr v18.16b, v23.16b, v22.16b - add v21.4s, v18.4s, v4.4s - add v17.4s, v21.4s, v17.4s - zip1 v21.2d, v20.2d, v5.2d - zip2 v5.4s, v5.4s, v20.4s - eor v16.16b, v16.16b, v17.16b - mov v21.s[3], v6.s[3] - ext v17.16b, v17.16b, v17.16b, #12 - zip1 v20.4s, v5.4s, v6.4s - tbl v16.16b, { v16.16b }, 
v2.16b - zip1 v5.4s, v6.4s, v5.4s - add v22.4s, v7.4s, v16.4s - ext v16.16b, v16.16b, v16.16b, #8 - ext v20.16b, v5.16b, v20.16b, #8 - eor v7.16b, v18.16b, v22.16b - ext v18.16b, v21.16b, v21.16b, #12 - ushr v23.4s, v7.4s, #7 - shl v24.4s, v7.4s, #25 - uzp1 v7.4s, v21.4s, v18.4s - orr v18.16b, v24.16b, v23.16b - add v21.4s, v18.4s, v7.4s - add v17.4s, v21.4s, v17.4s - ext v21.16b, v22.16b, v22.16b, #4 - eor v16.16b, v17.16b, v16.16b - tbl v16.16b, { v16.16b }, v0.16b - add v21.4s, v21.4s, v16.4s - eor v18.16b, v18.16b, v21.16b - ushr v6.4s, v18.4s, #12 - shl v18.4s, v18.4s, #20 - orr v5.16b, v18.16b, v6.16b - add v6.4s, v5.4s, v20.4s - add v6.4s, v6.4s, v17.4s - ext v17.16b, v19.16b, v19.16b, #4 - eor v16.16b, v16.16b, v6.16b - uzp1 v18.4s, v17.4s, v17.4s - tbl v16.16b, { v16.16b }, v2.16b - ext v18.16b, v18.16b, v17.16b, #8 - add v19.4s, v21.4s, v16.4s - uzp2 v18.4s, v18.4s, v4.4s - ext v16.16b, v16.16b, v16.16b, #8 - eor v5.16b, v5.16b, v19.16b - add v6.4s, v6.4s, v18.4s - ext v19.16b, v19.16b, v19.16b, #12 - ushr v21.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v5.16b, v5.16b, v21.16b - ext v21.16b, v17.16b, v17.16b, #12 - add v6.4s, v6.4s, v5.4s - ext v17.16b, v17.16b, v21.16b, #12 - mov v21.16b, v20.16b - eor v16.16b, v6.16b, v16.16b - rev64 v17.4s, v17.4s - mov v21.s[1], v7.s[2] - tbl v16.16b, { v16.16b }, v0.16b - add v19.4s, v19.4s, v16.4s - eor v5.16b, v5.16b, v19.16b - ushr v22.4s, v5.4s, #12 - shl v23.4s, v5.4s, #20 - trn2 v5.4s, v17.4s, v21.4s - orr v17.16b, v23.16b, v22.16b - add v21.4s, v17.4s, v5.4s - add v6.4s, v21.4s, v6.4s - eor v16.16b, v16.16b, v6.16b - ext v6.16b, v6.16b, v6.16b, #12 - tbl v21.16b, { v16.16b }, v2.16b - zip1 v16.2d, v20.2d, v4.2d - zip2 v4.4s, v4.4s, v20.4s - add v19.4s, v19.4s, v21.4s - mov v16.s[3], v7.s[3] - ext v21.16b, v21.16b, v21.16b, #8 - zip1 v20.4s, v4.4s, v7.4s - eor v17.16b, v17.16b, v19.16b - ext v22.16b, v16.16b, v16.16b, #12 - ext v19.16b, v19.16b, v19.16b, #4 - zip1 v4.4s, v7.4s, 
v4.4s - ushr v23.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - uzp1 v16.4s, v16.4s, v22.4s - ext v4.16b, v4.16b, v20.16b, #8 - orr v17.16b, v17.16b, v23.16b - add v22.4s, v17.4s, v16.4s - add v6.4s, v22.4s, v6.4s - eor v21.16b, v6.16b, v21.16b - tbl v21.16b, { v21.16b }, v0.16b - add v19.4s, v19.4s, v21.4s - eor v17.16b, v17.16b, v19.16b - ushr v7.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - orr v7.16b, v17.16b, v7.16b - add v17.4s, v7.4s, v4.4s - add v6.4s, v17.4s, v6.4s - ext v17.16b, v18.16b, v18.16b, #4 - eor v18.16b, v21.16b, v6.16b - uzp1 v20.4s, v17.4s, v17.4s - tbl v18.16b, { v18.16b }, v2.16b - ext v20.16b, v20.16b, v17.16b, #8 - add v19.4s, v19.4s, v18.4s - uzp2 v20.4s, v20.4s, v5.4s - ext v18.16b, v18.16b, v18.16b, #8 - eor v7.16b, v7.16b, v19.16b - add v6.4s, v6.4s, v20.4s - ushr v21.4s, v7.4s, #7 - shl v7.4s, v7.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v7.16b, v7.16b, v21.16b - add v21.4s, v6.4s, v7.4s - eor v6.16b, v21.16b, v18.16b - ext v18.16b, v19.16b, v19.16b, #12 - tbl v19.16b, { v6.16b }, v0.16b - ext v6.16b, v17.16b, v17.16b, #12 - add v18.4s, v18.4s, v19.4s - ext v6.16b, v17.16b, v6.16b, #12 - mov v17.16b, v4.16b - eor v7.16b, v7.16b, v18.16b - rev64 v6.4s, v6.4s - mov v17.s[1], v16.s[2] - ushr v22.4s, v7.4s, #12 - shl v7.4s, v7.4s, #20 - trn2 v6.4s, v6.4s, v17.4s - orr v7.16b, v7.16b, v22.16b - add v17.4s, v7.4s, v6.4s - add v17.4s, v17.4s, v21.4s - zip1 v21.2d, v4.2d, v5.2d - zip2 v4.4s, v5.4s, v4.4s - eor v19.16b, v19.16b, v17.16b - mov v21.s[3], v16.s[3] - ext v17.16b, v17.16b, v17.16b, #12 - tbl v19.16b, { v19.16b }, v2.16b - ext v22.16b, v21.16b, v21.16b, #12 - add v18.4s, v18.4s, v19.4s - ext v19.16b, v19.16b, v19.16b, #8 - eor v7.16b, v7.16b, v18.16b - ext v18.16b, v18.16b, v18.16b, #4 - ushr v23.4s, v7.4s, #7 - shl v24.4s, v7.4s, #25 - uzp1 v7.4s, v21.4s, v22.4s - orr v21.16b, v24.16b, v23.16b - add v22.4s, v21.4s, v7.4s - add v17.4s, v22.4s, v17.4s - eor v19.16b, v17.16b, v19.16b - tbl v19.16b, { v19.16b }, v0.16b - add v18.4s, 
v18.4s, v19.4s - eor v5.16b, v21.16b, v18.16b - zip1 v21.4s, v4.4s, v16.4s - zip1 v4.4s, v16.4s, v4.4s - ushr v16.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v21.16b, v4.16b, v21.16b, #8 - orr v4.16b, v5.16b, v16.16b - ext v16.16b, v20.16b, v20.16b, #4 - mov v23.16b, v21.16b - add v5.4s, v4.4s, v21.4s - mov v23.s[1], v7.s[2] - add v5.4s, v5.4s, v17.4s - eor v17.16b, v19.16b, v5.16b - uzp1 v19.4s, v16.4s, v16.4s - tbl v17.16b, { v17.16b }, v2.16b - ext v19.16b, v19.16b, v16.16b, #8 - add v18.4s, v18.4s, v17.4s - uzp2 v19.4s, v19.4s, v6.4s - eor v4.16b, v4.16b, v18.16b - add v5.4s, v5.4s, v19.4s - ext v19.16b, v19.16b, v19.16b, #4 - ushr v20.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v4.16b, v20.16b - ext v4.16b, v17.16b, v17.16b, #8 - add v17.4s, v5.4s, v20.4s - ext v5.16b, v18.16b, v18.16b, #12 - eor v4.16b, v17.16b, v4.16b - tbl v18.16b, { v4.16b }, v0.16b - ext v4.16b, v16.16b, v16.16b, #12 - add v22.4s, v5.4s, v18.4s - ext v4.16b, v16.16b, v4.16b, #12 - eor v5.16b, v20.16b, v22.16b - rev64 v16.4s, v4.4s - ushr v20.4s, v5.4s, #12 - shl v24.4s, v5.4s, #20 - trn2 v5.4s, v16.4s, v23.4s - orr v16.16b, v24.16b, v20.16b - add v20.4s, v16.4s, v5.4s - add v17.4s, v20.4s, v17.4s - zip1 v20.2d, v21.2d, v6.2d - zip2 v6.4s, v6.4s, v21.4s - eor v18.16b, v18.16b, v17.16b - mov v20.s[3], v7.s[3] - ext v17.16b, v17.16b, v17.16b, #12 - zip1 v21.4s, v6.4s, v7.4s - tbl v18.16b, { v18.16b }, v2.16b - ext v24.16b, v20.16b, v20.16b, #12 - zip1 v6.4s, v7.4s, v6.4s - add v22.4s, v22.4s, v18.4s - ext v18.16b, v18.16b, v18.16b, #8 - ext v6.16b, v6.16b, v21.16b, #8 - eor v16.16b, v16.16b, v22.16b - ext v22.16b, v22.16b, v22.16b, #4 - zip1 v5.2d, v6.2d, v5.2d - zip2 v4.4s, v4.4s, v6.4s - ushr v25.4s, v16.4s, #7 - shl v26.4s, v16.4s, #25 - uzp1 v16.4s, v20.4s, v24.4s - orr v20.16b, v26.16b, v25.16b - mov v5.s[3], v16.s[3] - add v24.4s, v20.4s, v16.4s - add v17.4s, v24.4s, v17.4s - eor v18.16b, v17.16b, v18.16b - tbl v18.16b, { v18.16b }, v0.16b - 
add v22.4s, v22.4s, v18.4s - eor v20.16b, v20.16b, v22.16b - ushr v7.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v7.16b, v20.16b, v7.16b - add v20.4s, v7.4s, v6.4s - add v17.4s, v20.4s, v17.4s - ext v20.16b, v19.16b, v19.16b, #8 - eor v18.16b, v18.16b, v17.16b - ext v17.16b, v17.16b, v17.16b, #4 - tbl v18.16b, { v18.16b }, v2.16b - add v21.4s, v22.4s, v18.4s - uzp2 v22.4s, v20.4s, v23.4s - ext v18.16b, v18.16b, v18.16b, #8 - eor v7.16b, v7.16b, v21.16b - ext v20.16b, v22.16b, v20.16b, #4 - ushr v22.4s, v7.4s, #7 - shl v7.4s, v7.4s, #25 - add v17.4s, v17.4s, v20.4s - ext v20.16b, v21.16b, v21.16b, #12 - ext v21.16b, v19.16b, v19.16b, #12 - orr v7.16b, v7.16b, v22.16b - ext v19.16b, v19.16b, v21.16b, #12 - add v17.4s, v17.4s, v7.4s - mov v21.16b, v6.16b - rev64 v19.4s, v19.4s - eor v18.16b, v17.16b, v18.16b - mov v21.s[1], v16.s[2] - tbl v18.16b, { v18.16b }, v0.16b - trn2 v19.4s, v19.4s, v21.4s - add v20.4s, v20.4s, v18.4s - eor v7.16b, v7.16b, v20.16b - ushr v22.4s, v7.4s, #12 - shl v7.4s, v7.4s, #20 - orr v7.16b, v7.16b, v22.16b - add v19.4s, v7.4s, v19.4s - add v17.4s, v19.4s, v17.4s - eor v18.16b, v18.16b, v17.16b - ext v17.16b, v17.16b, v17.16b, #12 - tbl v18.16b, { v18.16b }, v2.16b - add v19.4s, v20.4s, v18.4s - ext v20.16b, v5.16b, v5.16b, #12 - ext v18.16b, v18.16b, v18.16b, #8 - eor v7.16b, v7.16b, v19.16b - uzp1 v5.4s, v5.4s, v20.4s - ushr v21.4s, v7.4s, #7 - shl v7.4s, v7.4s, #25 - orr v7.16b, v7.16b, v21.16b - add v5.4s, v7.4s, v5.4s - add v5.4s, v5.4s, v17.4s - eor v17.16b, v5.16b, v18.16b - ext v18.16b, v19.16b, v19.16b, #4 - tbl v17.16b, { v17.16b }, v0.16b - add v18.4s, v18.4s, v17.4s - eor v6.16b, v7.16b, v18.16b - zip1 v7.4s, v4.4s, v16.4s - zip1 v4.4s, v16.4s, v4.4s - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - ext v4.16b, v4.16b, v7.16b, #8 - orr v6.16b, v6.16b, v16.16b - add v4.4s, v6.4s, v4.4s - add v4.4s, v4.4s, v5.4s - eor v5.16b, v17.16b, v4.16b - ext v4.16b, v4.16b, v4.16b, #4 - tbl v5.16b, { v5.16b }, v2.16b - add v7.4s, v18.4s, 
v5.4s - eor v6.16b, v6.16b, v7.16b - ext v7.16b, v7.16b, v7.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - orr v6.16b, v6.16b, v16.16b - ext v16.16b, v5.16b, v5.16b, #8 - eor v5.16b, v4.16b, v7.16b - eor v4.16b, v6.16b, v16.16b -.LBB3_11: - subs x13, x15, #1 - b.eq .LBB3_9 - cbnz x15, .LBB3_10 - add x4, x4, x12 - add x0, x0, #8 - subs x1, x1, #1 - stp q5, q4, [x8], #32 - b.ne .LBB3_8 + ldp q0, q1, [x21] + ldr x25, [x24] + stp q0, q1, [x29, #-176] + cbz x3, .LBB3_9 + ldr x27, [sp, #40] + ldur w5, [x29, #-192] + cmp x27, #1 + b.ne .LBB3_13 + ldur w5, [x29, #-208] +.LBB3_13: + sub x0, x29, #144 + sub x1, x29, #176 + mov x2, x25 + mov w3, #64 + mov x4, x20 + bl compress_pre + ldp q0, q1, [x29, #-144] + add x25, x25, #64 + ldp q2, q3, [x29, #-112] + b .LBB3_16 .LBB3_14: - add sp, sp, #368 - ldp x20, x19, [sp, #128] - ldp x22, x21, [sp, #112] - ldp x24, x23, [sp, #96] - ldp x26, x25, [sp, #80] - ldp x29, x27, [sp, #64] + mov w5, w22 +.LBB3_15: + sub x0, x29, #144 + sub x1, x29, #176 + mov x2, x25 + mov w3, #64 + mov x4, x20 + bl compress_pre + ldp q0, q1, [x29, #-144] + add x25, x25, #64 + sub x27, x27, #1 + ldp q2, q3, [x29, #-112] +.LBB3_16: + eor v0.16b, v2.16b, v0.16b + cmp x27, #2 + eor v1.16b, v3.16b, v1.16b + stp q0, q1, [x29, #-176] + b.eq .LBB3_14 + mov w5, w19 + cmp x27, #1 + b.ne .LBB3_15 + b .LBB3_9 +.LBB3_18: + add sp, sp, #432 + .cfi_def_cfa wsp, 160 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] ldp d9, d8, [sp, #48] ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] - ldp d15, d14, [sp], #144 + ldp d15, d14, [sp], #160 + .cfi_def_cfa_offset 0 + .cfi_restore w19 + .cfi_restore w20 + .cfi_restore w21 + .cfi_restore w22 + .cfi_restore w23 + .cfi_restore w24 + .cfi_restore w25 + .cfi_restore w26 + .cfi_restore w27 + .cfi_restore w28 + .cfi_restore w30 + .cfi_restore w29 + .cfi_restore b8 + .cfi_restore b9 + .cfi_restore b10 + .cfi_restore 
b11 + .cfi_restore b12 + .cfi_restore b13 + .cfi_restore b14 + .cfi_restore b15 ret .Lfunc_end3: .size zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41