Skip to content

Commit

Permalink
Fix assembly for x32 ABI
Browse files Browse the repository at this point in the history
  • Loading branch information
abatyiev committed Jan 3, 2025
1 parent 5c8b350 commit 1a55513
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 0 deletions.
39 changes: 39 additions & 0 deletions c/blake3_avx2_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ blake3_hash_many_avx2:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -73,6 +74,16 @@ blake3_hash_many_avx2:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x10]
mov r13d, dword ptr [rdi+0x14]
mov r14d, dword ptr [rdi+0x18]
mov r15d, dword ptr [rdi+0x1c]
#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
Expand Down Expand Up @@ -1293,7 +1304,11 @@ blake3_hash_many_avx2:
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
#ifndef __ILP32__
add rdi, 64
#else
add rdi, 32
#endif
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
Expand Down Expand Up @@ -1334,10 +1349,17 @@ blake3_hash_many_avx2:
vpblendd ymm15, ymm15, ymm12, 0x44
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1545,7 +1567,11 @@ blake3_hash_many_avx2:
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
3:
test rsi, 0x2
Expand All @@ -1561,8 +1587,13 @@ blake3_hash_many_avx2:
vinserti128 ymm13, ymm13, xmm14, 0x01
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1671,7 +1702,11 @@ blake3_hash_many_avx2:
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
add rbx, 64
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
sub rsi, 2
3:
test rsi, 0x1
Expand All @@ -1683,7 +1718,11 @@ blake3_hash_many_avx2:
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
87 changes: 87 additions & 0 deletions c/blake3_avx512_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ blake3_hash_many_avx512:
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -97,6 +98,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x20]
mov r13d, dword ptr [rdi+0x24]
mov r14d, dword ptr [rdi+0x28]
mov r15d, dword ptr [rdi+0x2c]
#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
Expand All @@ -109,6 +120,7 @@ blake3_hash_many_avx512:
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
#ifndef __ILP32__
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
Expand All @@ -117,6 +129,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
#else
mov r8d, dword ptr [rdi+0x10]
mov r9d, dword ptr [rdi+0x14]
mov r10d, dword ptr [rdi+0x18]
mov r11d, dword ptr [rdi+0x1c]
mov r12d, dword ptr [rdi+0x30]
mov r13d, dword ptr [rdi+0x34]
mov r14d, dword ptr [rdi+0x38]
mov r15d, dword ptr [rdi+0x3c]
#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
Expand Down Expand Up @@ -151,6 +173,7 @@ blake3_hash_many_avx512:
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -159,6 +182,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x20]
mov r13d, dword ptr [rdi+0x24]
mov r14d, dword ptr [rdi+0x28]
mov r15d, dword ptr [rdi+0x2c]
#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
Expand All @@ -179,6 +212,7 @@ blake3_hash_many_avx512:
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
#ifndef __ILP32__
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
Expand All @@ -187,6 +221,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
#else
mov r8d, dword ptr [rdi+0x10]
mov r9d, dword ptr [rdi+0x14]
mov r10d, dword ptr [rdi+0x18]
mov r11d, dword ptr [rdi+0x1c]
mov r12d, dword ptr [rdi+0x30]
mov r13d, dword ptr [rdi+0x34]
mov r14d, dword ptr [rdi+0x38]
mov r15d, dword ptr [rdi+0x3c]
#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
Expand Down Expand Up @@ -1077,7 +1121,11 @@ blake3_hash_many_avx512:
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
#ifndef __ILP32__
add rdi, 128
#else
add rdi, 64
#endif
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
Expand Down Expand Up @@ -1107,6 +1155,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -1115,6 +1164,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x10]
mov r13d, dword ptr [rdi+0x14]
mov r14d, dword ptr [rdi+0x18]
mov r15d, dword ptr [rdi+0x1c]
#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
Expand Down Expand Up @@ -2037,7 +2096,11 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
add rbx, 256
mov qword ptr [rbp+0x50], rbx
#ifndef __ILP32__
add rdi, 64
#else
add rdi, 32
#endif
sub rsi, 8
3:
mov rbx, qword ptr [rbp+0x50]
Expand All @@ -2060,10 +2123,17 @@ blake3_hash_many_avx512:
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
mov eax, 43690
kmovw k3, eax
mov eax, 34952
Expand Down Expand Up @@ -2177,7 +2247,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
3:
test esi, 0x2
Expand All @@ -2191,8 +2265,13 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -2290,7 +2369,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
sub rsi, 2
3:
test esi, 0x1
Expand All @@ -2301,7 +2384,11 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
24 changes: 24 additions & 0 deletions c/blake3_sse2_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,17 @@ blake3_hash_many_sse2:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1632,7 +1639,11 @@ blake3_hash_many_sse2:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
Expand Down Expand Up @@ -1663,8 +1674,13 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1893,7 +1909,11 @@ blake3_hash_many_sse2:
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
add rbx, 64
sub rsi, 2
3:
Expand All @@ -1904,7 +1924,11 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
/* x32 (ILP32): entries of the inputs array at [rdi] are 4-byte pointers, so
   load only a dword; the 32-bit write zero-extends into the full r8. */
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
Loading

0 comments on commit 1a55513

Please sign in to comment.