Skip to content

Commit

Permalink
Fix ARM64 assembly (#19)
Browse files Browse the repository at this point in the history
The wrong constants were used for ARM64, leading to wrong values being calculated.
This is likely due to a linker change or similar.

* Rename to less generic names.
* Use textflag.h
* Apply asmfmt

Fixes #17

* Re-enable asm...
  • Loading branch information
klauspost authored Mar 25, 2021
1 parent 5311fe9 commit 08ce0b4
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 105 deletions.
38 changes: 19 additions & 19 deletions highwayhash_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@

#include "textflag.h"

DATA ·cons<>+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·cons<>+0x08(SB)/8, $0xa4093822299f31d0
DATA ·cons<>+0x10(SB)/8, $0x13198a2e03707344
DATA ·cons<>+0x18(SB)/8, $0x243f6a8885a308d3
DATA ·cons<>+0x20(SB)/8, $0x3bd39e10cb0ef593
DATA ·cons<>+0x28(SB)/8, $0xc0acf169b5f18a8c
DATA ·cons<>+0x30(SB)/8, $0xbe5466cf34e90c6c
DATA ·cons<>+0x38(SB)/8, $0x452821e638d01377
GLOBL ·cons<>(SB), (NOPTR+RODATA), $64

DATA ·zipperMerge<>+0x00(SB)/8, $0xf010e05020c03
DATA ·zipperMerge<>+0x08(SB)/8, $0x70806090d0a040b
GLOBL ·zipperMerge<>(SB), (NOPTR+RODATA), $16
DATA ·asmConstants<>+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·asmConstants<>+0x08(SB)/8, $0xa4093822299f31d0
DATA ·asmConstants<>+0x10(SB)/8, $0x13198a2e03707344
DATA ·asmConstants<>+0x18(SB)/8, $0x243f6a8885a308d3
DATA ·asmConstants<>+0x20(SB)/8, $0x3bd39e10cb0ef593
DATA ·asmConstants<>+0x28(SB)/8, $0xc0acf169b5f18a8c
DATA ·asmConstants<>+0x30(SB)/8, $0xbe5466cf34e90c6c
DATA ·asmConstants<>+0x38(SB)/8, $0x452821e638d01377
GLOBL ·asmConstants<>(SB), (NOPTR+RODATA), $64

DATA ·asmZipperMerge<>+0x00(SB)/8, $0xf010e05020c03
DATA ·asmZipperMerge<>+0x08(SB)/8, $0x70806090d0a040b
GLOBL ·asmZipperMerge<>(SB), (NOPTR+RODATA), $16

#define v00 X0
#define v01 X1
Expand Down Expand Up @@ -104,10 +104,10 @@ GLOBL ·zipperMerge<>(SB), (NOPTR+RODATA), $16
PADDQ t1, v11

// func initializeSSE4(state *[16]uint64, key []byte)
TEXT ·initializeSSE4(SB), 4, $0-32
TEXT ·initializeSSE4(SB), NOSPLIT, $0-32
MOVQ state+0(FP), AX
MOVQ key_base+8(FP), BX
MOVQ $·cons<>(SB), CX
MOVQ $·asmConstants<>(SB), CX

MOVOU 0(BX), v00
MOVOU 16(BX), v01
Expand Down Expand Up @@ -136,7 +136,7 @@ TEXT ·initializeSSE4(SB), 4, $0-32
RET

// func updateSSE4(state *[16]uint64, msg []byte)
TEXT ·updateSSE4(SB), 4, $0-32
TEXT ·updateSSE4(SB), NOSPLIT, $0-32
MOVQ state+0(FP), AX
MOVQ msg_base+8(FP), BX
MOVQ msg_len+16(FP), CX
Expand All @@ -153,7 +153,7 @@ TEXT ·updateSSE4(SB), 4, $0-32
MOVOU 96(AX), m10
MOVOU 112(AX), m11

MOVOU ·zipperMerge<>(SB), t2
MOVOU ·asmZipperMerge<>(SB), t2

LOOP:
MOVOU 0(BX), t0
Expand All @@ -178,7 +178,7 @@ DONE:
RET

// func finalizeSSE4(out []byte, state *[16]uint64)
TEXT ·finalizeSSE4(SB), 4, $0-32
TEXT ·finalizeSSE4(SB), NOSPLIT, $0-32
MOVQ state+24(FP), AX
MOVQ out_base+0(FP), BX
MOVQ out_len+8(FP), CX
Expand All @@ -192,7 +192,7 @@ TEXT ·finalizeSSE4(SB), 4, $0-32
MOVOU 96(AX), m10
MOVOU 112(AX), m11

MOVOU ·zipperMerge<>(SB), t2
MOVOU ·asmZipperMerge<>(SB), t2

PSHUFD $177, v01, t0
PSHUFD $177, v00, t1
Expand Down
2 changes: 0 additions & 2 deletions highwayhash_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//+build ignore

//+build !noasm,!appengine

package highwayhash
Expand Down
148 changes: 72 additions & 76 deletions highwayhash_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
// limitations under the License.
//

//+build ignore

//+build !noasm,!appengine

// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
// the opcodes of their Plan9 equivalents

#include "textflag.h"

#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
MOVD $0x3FFFFFFFFFFFFFFF, tmp0 \
AND tmp0, x3 \
Expand Down Expand Up @@ -50,60 +50,59 @@
EOR x1, y1 \
EOR x3, y1

#define UPDATE(MSG1, MSG2) \
#define UPDATE(MSG1, MSG2) \
\ // Add message
VADD MSG1.D2, V2.D2, V2.D2 \
VADD MSG2.D2, V3.D2, V3.D2 \
\
VADD MSG1.D2, V2.D2, V2.D2 \
VADD MSG2.D2, V3.D2, V3.D2 \
\
\ // v1 += mul0
VADD V4.D2, V2.D2, V2.D2 \
VADD V5.D2, V3.D2, V3.D2 \
\
VADD V4.D2, V2.D2, V2.D2 \
VADD V5.D2, V3.D2, V3.D2 \
\
\ // First pair of multiplies
VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
\
\ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
\ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
\
VTBL V29.B16, [V0.B16, V1.B16], V10.B16 \
VTBL V30.B16, [V2.B16, V3.B16], V11.B16 \
\
\ // VUMULL V10.S2, V11.S2, V12.D2 /* assembler support missing */
\ // VUMULL2 V10.S4, V11.S4, V13.D2 /* assembler support missing */
WORD $0x2eaac16c \ // umull v12.2d, v11.2s, v10.2s
WORD $0x6eaac16d \ // umull2 v13.2d, v11.4s, v10.4s
\
\ // v0 += mul1
VADD V6.D2, V0.D2, V0.D2 \
VADD V7.D2, V1.D2, V1.D2 \
\
VADD V6.D2, V0.D2, V0.D2 \
VADD V7.D2, V1.D2, V1.D2 \
\
\ // Second pair of multiplies
VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
\
VTBL V29.B16, [V2.B16, V3.B16], V15.B16 \
VTBL V30.B16, [V0.B16, V1.B16], V14.B16 \
\
\ // EOR multiplication result in
VEOR V12.B16, V4.B16, V4.B16 \
VEOR V13.B16, V5.B16, V5.B16 \
\
VEOR V12.B16, V4.B16, V4.B16 \
VEOR V13.B16, V5.B16, V5.B16 \
\
\ // VUMULL V14.S2, V15.S2, V16.D2 /* assembler support missing */
\ // VUMULL2 V14.S4, V15.S4, V17.D2 /* assembler support missing */
WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
\
WORD $0x2eaec1f0 \ // umull v16.2d, v15.2s, v14.2s
WORD $0x6eaec1f1 \ // umull2 v17.2d, v15.4s, v14.4s
\
\ // First pair of zipper-merges
VTBL V28.B16, [V2.B16], V18.B16 \
VADD V18.D2, V0.D2, V0.D2 \
VTBL V28.B16, [V3.B16], V19.B16 \
VADD V19.D2, V1.D2, V1.D2 \
\
VTBL V28.B16, [V2.B16], V18.B16 \
VADD V18.D2, V0.D2, V0.D2 \
VTBL V28.B16, [V3.B16], V19.B16 \
VADD V19.D2, V1.D2, V1.D2 \
\
\ // Second pair of zipper-merges
VTBL V28.B16, [V0.B16], V20.B16 \
VADD V20.D2, V2.D2, V2.D2 \
VTBL V28.B16, [V1.B16], V21.B16 \
VADD V21.D2, V3.D2, V3.D2 \
\
VTBL V28.B16, [V0.B16], V20.B16 \
VADD V20.D2, V2.D2, V2.D2 \
VTBL V28.B16, [V1.B16], V21.B16 \
VADD V21.D2, V3.D2, V3.D2 \
\
\ // EOR multiplication result in
VEOR V16.B16, V6.B16, V6.B16 \
VEOR V16.B16, V6.B16, V6.B16 \
VEOR V17.B16, V7.B16, V7.B16


// func initializeArm64(state *[16]uint64, key []byte)
TEXT ·initializeArm64(SB), 7, $0
TEXT ·initializeArm64(SB), NOSPLIT, $0
MOVD state+0(FP), R0
MOVD key_base+8(FP), R1

Expand All @@ -112,7 +111,7 @@ TEXT ·initializeArm64(SB), 7, $0
VREV64 V1.S4, V3.S4
VREV64 V2.S4, V4.S4

MOVD $·constants(SB), R3
MOVD $·asmConstants(SB), R3
VLD1 (R3), [V5.S4, V6.S4, V7.S4, V8.S4]
VEOR V5.B16, V1.B16, V1.B16
VEOR V6.B16, V2.B16, V2.B16
Expand All @@ -123,8 +122,7 @@ TEXT ·initializeArm64(SB), 7, $0
VST1 [V5.D2, V6.D2, V7.D2, V8.D2], (R0)
RET


TEXT ·updateArm64(SB), 7, $0
TEXT ·updateArm64(SB), NOSPLIT, $0
MOVD state+0(FP), R0
MOVD msg_base+8(FP), R1
MOVD msg_len+16(FP), R2 // length of message
Expand All @@ -142,7 +140,7 @@ TEXT ·updateArm64(SB), 7, $0
// v7 = mul1.hi

// Load zipper merge constants table pointer
MOVD $·zipperMerge(SB), R3
MOVD $·asmZipperMerge(SB), R3

// and load zipper merge constants into v28, v29, and v30
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
Expand All @@ -167,15 +165,14 @@ loop:
complete:
RET


// func finalizeArm64(out []byte, state *[16]uint64)
TEXT ·finalizeArm64(SB), 4, $0-32
TEXT ·finalizeArm64(SB), NOSPLIT, $0-32
MOVD state+24(FP), R0
MOVD out_base+0(FP), R1
MOVD out_len+8(FP), R2

// Load zipper merge constants table pointer
MOVD $·zipperMerge(SB), R3
MOVD $·asmZipperMerge(SB), R3

// and load zipper merge constants into v28, v29, and v30
VLD1 (R3), [V28.B16, V29.B16, V30.B16]
Expand All @@ -200,8 +197,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
VREV64 V0.S4, V27.S4
UPDATE(V26, V27)

CMP $8, R2
BEQ skipUpdate // Just 4 rounds for 64-bit checksum
CMP $8, R2
BEQ skipUpdate // Just 4 rounds for 64-bit checksum

VREV64 V1.S4, V26.S4
VREV64 V0.S4, V27.S4
Expand All @@ -211,8 +208,8 @@ TEXT ·finalizeArm64(SB), 4, $0-32
VREV64 V0.S4, V27.S4
UPDATE(V26, V27)

CMP $16, R2
BEQ skipUpdate // 6 rounds for 128-bit checksum
CMP $16, R2
BEQ skipUpdate // 6 rounds for 128-bit checksum

VREV64 V1.S4, V26.S4
VREV64 V0.S4, V27.S4
Expand Down Expand Up @@ -282,16 +279,16 @@ hash128:
MOVD 1*8(R0), R9
MOVD 6*8(R0), R10
MOVD 7*8(R0), R11
ADD R10, R8
ADD R11, R9
ADD R10, R8
ADD R11, R9
MOVD 8*8(R0), R10
MOVD 9*8(R0), R11
ADD R10, R8
ADD R11, R9
ADD R10, R8
ADD R11, R9
MOVD 14*8(R0), R10
MOVD 15*8(R0), R11
ADD R10, R8
ADD R11, R9
ADD R10, R8
ADD R11, R9
MOVD R8, 0(R1)
MOVD R9, 8(R1)
RET
Expand All @@ -307,22 +304,21 @@ hash64:
MOVD R4, (R1)
RET


DATA ·constants+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·constants+0x08(SB)/8, $0xa4093822299f31d0
DATA ·constants+0x10(SB)/8, $0x13198a2e03707344
DATA ·constants+0x18(SB)/8, $0x243f6a8885a308d3
DATA ·constants+0x20(SB)/8, $0x3bd39e10cb0ef593
DATA ·constants+0x28(SB)/8, $0xc0acf169b5f18a8c
DATA ·constants+0x30(SB)/8, $0xbe5466cf34e90c6c
DATA ·constants+0x38(SB)/8, $0x452821e638d01377
GLOBL ·constants(SB), 8, $64
DATA ·asmConstants+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·asmConstants+0x08(SB)/8, $0xa4093822299f31d0
DATA ·asmConstants+0x10(SB)/8, $0x13198a2e03707344
DATA ·asmConstants+0x18(SB)/8, $0x243f6a8885a308d3
DATA ·asmConstants+0x20(SB)/8, $0x3bd39e10cb0ef593
DATA ·asmConstants+0x28(SB)/8, $0xc0acf169b5f18a8c
DATA ·asmConstants+0x30(SB)/8, $0xbe5466cf34e90c6c
DATA ·asmConstants+0x38(SB)/8, $0x452821e638d01377
GLOBL ·asmConstants(SB), 8, $64

// Constants for TBL instructions
DATA ·zipperMerge+0x0(SB)/8, $0x000f010e05020c03 // zipper merge constant
DATA ·zipperMerge+0x8(SB)/8, $0x070806090d0a040b
DATA ·zipperMerge+0x10(SB)/8, $0x0f0e0d0c07060504 // setup first register for multiply
DATA ·zipperMerge+0x18(SB)/8, $0x1f1e1d1c17161514
DATA ·zipperMerge+0x20(SB)/8, $0x0b0a090803020100 // setup second register for multiply
DATA ·zipperMerge+0x28(SB)/8, $0x1b1a191813121110
GLOBL ·zipperMerge(SB), 8, $48
DATA ·asmZipperMerge+0x0(SB)/8, $0x000f010e05020c03 // zipper merge constant
DATA ·asmZipperMerge+0x8(SB)/8, $0x070806090d0a040b
DATA ·asmZipperMerge+0x10(SB)/8, $0x0f0e0d0c07060504 // setup first register for multiply
DATA ·asmZipperMerge+0x18(SB)/8, $0x1f1e1d1c17161514
DATA ·asmZipperMerge+0x20(SB)/8, $0x0b0a090803020100 // setup second register for multiply
DATA ·asmZipperMerge+0x28(SB)/8, $0x1b1a191813121110
GLOBL ·asmZipperMerge(SB), 8, $48
14 changes: 7 additions & 7 deletions highwayhash_ppc64le.s
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ TEXT ·updatePpc64Le(SB), NOFRAME|NOSPLIT, $0-32
XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO
XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI

// Load constants table pointer
MOVD $·constants(SB), CONSTANTS
// Load asmConstants table pointer
MOVD $·asmConstants(SB), CONSTANTS
LXVD2X (CONSTANTS)(R0), ROTATE
LXVD2X (CONSTANTS)(P1), MASK
XXLNAND MASK, MASK, MASK
Expand Down Expand Up @@ -174,9 +174,9 @@ complete:
RET

// Constants table
DATA ·constants+0x0(SB)/8, $0x0000000000000020
DATA ·constants+0x8(SB)/8, $0x0000000000000020
DATA ·constants+0x10(SB)/8, $0x070806090d0a040b // zipper merge constant
DATA ·constants+0x18(SB)/8, $0x000f010e05020c03 // zipper merge constant
DATA ·asmConstants+0x0(SB)/8, $0x0000000000000020
DATA ·asmConstants+0x8(SB)/8, $0x0000000000000020
DATA ·asmConstants+0x10(SB)/8, $0x070806090d0a040b // zipper merge constant
DATA ·asmConstants+0x18(SB)/8, $0x000f010e05020c03 // zipper merge constant

GLOBL ·constants(SB), 8, $32
GLOBL ·asmConstants(SB), 8, $32
2 changes: 1 addition & 1 deletion highwayhash_ref.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build noasm !amd64,!ppc64le
// +build noasm !amd64,!arm64,!ppc64le

package highwayhash

Expand Down

1 comment on commit 08ce0b4

@f00stx

This comment was marked as spam.

Please sign in to comment.