Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix assembly for x32 ABI #438

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions c/blake3_avx2_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ blake3_hash_many_avx2:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -73,6 +74,16 @@ blake3_hash_many_avx2:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x10]
mov r13d, dword ptr [rdi+0x14]
mov r14d, dword ptr [rdi+0x18]
mov r15d, dword ptr [rdi+0x1c]
#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
Expand Down Expand Up @@ -1293,7 +1304,11 @@ blake3_hash_many_avx2:
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
#ifndef __ILP32__
add rdi, 64
#else
add rdi, 32
#endif
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
Expand Down Expand Up @@ -1334,10 +1349,17 @@ blake3_hash_many_avx2:
vpblendd ymm15, ymm15, ymm12, 0x44
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1545,7 +1567,11 @@ blake3_hash_many_avx2:
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
3:
test rsi, 0x2
Expand All @@ -1561,8 +1587,13 @@ blake3_hash_many_avx2:
vinserti128 ymm13, ymm13, xmm14, 0x01
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1671,7 +1702,11 @@ blake3_hash_many_avx2:
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
add rbx, 64
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
sub rsi, 2
3:
test rsi, 0x1
Expand All @@ -1683,7 +1718,11 @@ blake3_hash_many_avx2:
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
87 changes: 87 additions & 0 deletions c/blake3_avx512_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ blake3_hash_many_avx512:
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -97,6 +98,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x20]
mov r13d, dword ptr [rdi+0x24]
mov r14d, dword ptr [rdi+0x28]
mov r15d, dword ptr [rdi+0x2c]
#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
Expand All @@ -109,6 +120,7 @@ blake3_hash_many_avx512:
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
#ifndef __ILP32__
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
Expand All @@ -117,6 +129,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
#else
mov r8d, dword ptr [rdi+0x10]
mov r9d, dword ptr [rdi+0x14]
mov r10d, dword ptr [rdi+0x18]
mov r11d, dword ptr [rdi+0x1c]
mov r12d, dword ptr [rdi+0x30]
mov r13d, dword ptr [rdi+0x34]
mov r14d, dword ptr [rdi+0x38]
mov r15d, dword ptr [rdi+0x3c]
#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
Expand Down Expand Up @@ -151,6 +173,7 @@ blake3_hash_many_avx512:
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -159,6 +182,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x20]
mov r13d, dword ptr [rdi+0x24]
mov r14d, dword ptr [rdi+0x28]
mov r15d, dword ptr [rdi+0x2c]
#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
Expand All @@ -179,6 +212,7 @@ blake3_hash_many_avx512:
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
#ifndef __ILP32__
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
Expand All @@ -187,6 +221,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
#else
mov r8d, dword ptr [rdi+0x10]
mov r9d, dword ptr [rdi+0x14]
mov r10d, dword ptr [rdi+0x18]
mov r11d, dword ptr [rdi+0x1c]
mov r12d, dword ptr [rdi+0x30]
mov r13d, dword ptr [rdi+0x34]
mov r14d, dword ptr [rdi+0x38]
mov r15d, dword ptr [rdi+0x3c]
#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
Expand Down Expand Up @@ -1077,7 +1121,11 @@ blake3_hash_many_avx512:
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
#ifndef __ILP32__
add rdi, 128
#else
add rdi, 64
#endif
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
Expand Down Expand Up @@ -1107,6 +1155,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
Expand All @@ -1115,6 +1164,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
mov r12d, dword ptr [rdi+0x10]
mov r13d, dword ptr [rdi+0x14]
mov r14d, dword ptr [rdi+0x18]
mov r15d, dword ptr [rdi+0x1c]
#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
Expand Down Expand Up @@ -2037,7 +2096,11 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
add rbx, 256
mov qword ptr [rbp+0x50], rbx
#ifndef __ILP32__
add rdi, 64
#else
add rdi, 32
#endif
sub rsi, 8
3:
mov rbx, qword ptr [rbp+0x50]
Expand All @@ -2060,10 +2123,17 @@ blake3_hash_many_avx512:
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
mov eax, 43690
kmovw k3, eax
mov eax, 34952
Expand Down Expand Up @@ -2177,7 +2247,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
3:
test esi, 0x2
Expand All @@ -2191,8 +2265,13 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -2290,7 +2369,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
sub rsi, 2
3:
test esi, 0x1
Expand All @@ -2301,7 +2384,11 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
24 changes: 24 additions & 0 deletions c/blake3_sse2_x86-64_unix.S
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,17 @@ blake3_hash_many_sse2:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
mov r10d, dword ptr [rdi+0x8]
mov r11d, dword ptr [rdi+0xc]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1632,7 +1639,11 @@ blake3_hash_many_sse2:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
#ifndef __ILP32__
add rdi, 32
#else
add rdi, 16
#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
Expand Down Expand Up @@ -1663,8 +1674,13 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
#ifndef __ILP32__
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
#else
mov r8d, dword ptr [rdi]
mov r9d, dword ptr [rdi+0x4]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down Expand Up @@ -1893,7 +1909,11 @@ blake3_hash_many_sse2:
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
#ifndef __ILP32__
add rdi, 16
#else
add rdi, 8
#endif
add rbx, 64
sub rsi, 2
3:
Expand All @@ -1904,7 +1924,11 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
/*
 * Load the single remaining entry of the array addressed by rdi into r8.
 * The guard must be spelled __ILP32__ (the macro the other pointer loads
 * in this file test); a misspelled name is never defined, which would
 * silently select the 8-byte load even when array entries are 4 bytes.
 * On the ILP32 path the 32-bit destination write zero-extends into r8,
 * so r8 is still usable as a full 64-bit base address afterwards.
 */
#ifndef __ILP32__
mov r8, qword ptr [rdi]
#else
mov r8d, dword ptr [rdi]
#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
Expand Down
Loading
Loading