cloudflare · armfazh · Jun 2, 2020 · May 23, 2020 · May 27, 2020 · May 27, 2020
diff --git a/.github/actions/golangci-lint/Dockerfile b/.github/actions/golangci-lint/Dockerfile
diff --git a/.github/actions/golangci-lint/action.yml b/.github/actions/golangci-lint/action.yml
diff --git a/.github/workflows/ci-actions.yml b/.github/workflows/ci-actions.yml
@@ -8,7 +8,7 @@ on:
       - master
 jobs:
   amd64_job:
-    name: amd64/Go-${{matrix.GOVER}}
+    name: Go-${{matrix.GOVER}}/amd64
     runs-on: ubuntu-18.04
     strategy:
       matrix:
@@ -17,9 +17,10 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v2
       - name: Linting
-        uses: ./.github/actions/golangci-lint
+        uses: golangci/golangci-lint-action@v1
         with:
-          args: run --config=.etc/golangci.yml ./...
+          version: v1.25
+          args: --config=./.etc/golangci.yml ./...
       - name: Setup Go-${{ matrix.GOVER }}
         uses: actions/setup-go@v2
         with:
@@ -32,22 +33,26 @@ jobs:
         run: go build -v ./...
       - name: Testing
         run: go test -v ./...
-  arm64_job:
+  exotic_job:
+    name: Go-${{matrix.CFG[2]}}/${{matrix.CFG[0]}}
     runs-on: ubuntu-18.04
-    name: arm64/Go-1.14
-    env:
-        GOVER: 1.14
-        SHA256: sha256:943fa6421fe7ca2b9fa40db22a5c14f99ed95afd0c18f9b7dec1e05b9ffce804
+    strategy:
+      matrix:
+        CFG: [ [s390x,s390x,1.14], [arm64,arm64v8,1.14] ]
     steps:
       - uses: actions/checkout@v2
+      - name: Enabling Docker Experimental
+        run: |
+          echo $'{\n    "experimental": true\n}' | sudo tee /etc/docker/daemon.json
+          sudo service docker restart
       - name: Pulling Images
         run: |
           docker pull -q multiarch/qemu-user-static
-          docker pull -q arm64v8/golang@$SHA256
+          docker pull -q --platform linux/${{matrix.CFG[0]}} ${{matrix.CFG[1]}}/golang:${{matrix.CFG[2]}}
+          docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
       - name: Testing
         run: |
-          docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
-          docker run --rm -v `pwd`:`pwd` -w `pwd` arm64v8/golang@$SHA256 go test -v ./...
+          docker run --rm -e "GODEBUG=asyncpreemptoff=1" -v `pwd`:`pwd` -w `pwd` ${{matrix.CFG[1]}}/golang:${{matrix.CFG[2]}} go test -v ./...
   coverage_amd64_job:
     needs: [ amd64_job ]
     if: github.event_name == 'push'

diff --git a/dh/csidh/csidh_test.go b/dh/csidh/csidh_test.go
@@ -3,6 +3,7 @@ package csidh
 import (
 	"bytes"
 	"crypto/rand"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"os"
@@ -246,14 +247,14 @@ func TestKAT(t *testing.T) {
 	if err != nil {
 		t.Fatal(err.Error())
 	}
-	// Loop over all test cases
-	for i := range tests.Vectors {
-		if !hasADXandBMI2 && i >= numIter {
-			// The algorithm is relatively slow, so on slow systems test
-			// against smaller number of test vectors (otherwise CI may break)
-			return
-		}
-		test := tests.Vectors[i]
+	// Check numIter test vectors picked at random from tests.Vectors.
+	// The algorithm is relatively slow, so only a subset is tested per run.
+	N := len(tests.Vectors)
+	var buf [2]byte
+	for i := 0; i < numIter; i++ {
+		_, _ = rand.Read(buf[:])
+		idx := binary.LittleEndian.Uint16(buf[:]) % uint16(N)
+		test := tests.Vectors[idx]
 		switch test.Status {
 		case StatusValues[Valid]:
 			checkSharedSecret(&test, t, Valid)

diff --git a/dh/csidh/fp511.go b/dh/csidh/fp511.go
@@ -2,19 +2,6 @@ package csidh
 
 import (
 	"math/bits"
-
-	"golang.org/x/sys/cpu"
-)
-
-// CPU Capabilities. Those flags are referred by assembly code. According to
-// https://github.com/golang/go/issues/28230, variables referred from the
-// assembly must be in the same package.
-// We declare variables not constants, in order to facilitate testing.
-var (
-	// Signals support for BMI2 (MULX)
-	hasBMI2 = cpu.X86.HasBMI2 //nolint
-	// Signals support for ADX and BMI2
-	hasADXandBMI2 = cpu.X86.HasBMI2 && cpu.X86.HasADX
 )
 
 // Constant time select.
@@ -35,65 +22,6 @@ func ctIsNonZero64(i uint64) int {
 	return int((i | (^(i - 1))) >> 63)
 }
 
-func mulGeneric(r, x, y *fp) {
-	var s fp // keeps intermediate results
-	var t1, t2 [9]uint64
-	var c, q uint64
-
-	for i := 0; i < numWords-1; i++ {
-		q = ((x[i] * y[0]) + s[0]) * pNegInv[0]
-		mul576(&t1, &p, q)
-		mul576(&t2, y, x[i])
-
-		// x[i]*y + q_i*p
-		t1[0], c = bits.Add64(t1[0], t2[0], 0)
-		t1[1], c = bits.Add64(t1[1], t2[1], c)
-		t1[2], c = bits.Add64(t1[2], t2[2], c)
-		t1[3], c = bits.Add64(t1[3], t2[3], c)
-		t1[4], c = bits.Add64(t1[4], t2[4], c)
-		t1[5], c = bits.Add64(t1[5], t2[5], c)
-		t1[6], c = bits.Add64(t1[6], t2[6], c)
-		t1[7], c = bits.Add64(t1[7], t2[7], c)
-		t1[8], _ = bits.Add64(t1[8], t2[8], c)
-
-		// s = (s + x[i]*y + q_i * p) / R
-		_, c = bits.Add64(t1[0], s[0], 0)
-		s[0], c = bits.Add64(t1[1], s[1], c)
-		s[1], c = bits.Add64(t1[2], s[2], c)
-		s[2], c = bits.Add64(t1[3], s[3], c)
-		s[3], c = bits.Add64(t1[4], s[4], c)
-		s[4], c = bits.Add64(t1[5], s[5], c)
-		s[5], c = bits.Add64(t1[6], s[6], c)
-		s[6], c = bits.Add64(t1[7], s[7], c)
-		s[7], _ = bits.Add64(t1[8], 0, c)
-	}
-
-	// last iteration stores result in r
-	q = ((x[numWords-1] * y[0]) + s[0]) * pNegInv[0]
-	mul576(&t1, &p, q)
-	mul576(&t2, y, x[numWords-1])
-
-	t1[0], c = bits.Add64(t1[0], t2[0], c)
-	t1[1], c = bits.Add64(t1[1], t2[1], c)
-	t1[2], c = bits.Add64(t1[2], t2[2], c)
-	t1[3], c = bits.Add64(t1[3], t2[3], c)
-	t1[4], c = bits.Add64(t1[4], t2[4], c)
-	t1[5], c = bits.Add64(t1[5], t2[5], c)
-	t1[6], c = bits.Add64(t1[6], t2[6], c)
-	t1[7], c = bits.Add64(t1[7], t2[7], c)
-	t1[8], _ = bits.Add64(t1[8], t2[8], c)
-
-	_, c = bits.Add64(t1[0], s[0], 0)
-	r[0], c = bits.Add64(t1[1], s[1], c)
-	r[1], c = bits.Add64(t1[2], s[2], c)
-	r[2], c = bits.Add64(t1[3], s[3], c)
-	r[3], c = bits.Add64(t1[4], s[4], c)
-	r[4], c = bits.Add64(t1[5], s[5], c)
-	r[5], c = bits.Add64(t1[6], s[6], c)
-	r[6], c = bits.Add64(t1[7], s[7], c)
-	r[7], _ = bits.Add64(t1[8], 0, c)
-}
-
 // Returns result of x<y operation.
 func isLess(x, y *fp) bool {
 	for i := numWords - 1; i >= 0; i-- {

diff --git a/dh/csidh/fp511_amd64.go b/dh/csidh/fp511_amd64.go
@@ -2,23 +2,37 @@
 
 package csidh
 
-import "math/bits"
+import (
+	"math/bits"
 
-//go:noescape
-func mul512(a, b *fp, c uint64)
+	"golang.org/x/sys/cpu"
+)
+
+var (
+	// hasBMI2 signals support for BMI2 (MULX). It is referred to by the assembly code.
+	hasBMI2 = cpu.X86.HasBMI2
+	// hasADXandBMI2 signals support for both ADX and BMI2.
+	hasADXandBMI2 = cpu.X86.HasBMI2 && cpu.X86.HasADX
+)
+
+var _ = hasBMI2 // NOTE(review): presumably keeps linters from flagging hasBMI2 as unused (the removed declaration carried //nolint) — confirm
+
+func mul512(r, m1 *fp, m2 uint64)     { mul512Amd64(r, m1, m2) }
+func cswap512(x, y *fp, choice uint8) { cswap512Amd64(x, y, choice) }
+func mulRdc(r, x, y *fp)              { mulRdcAmd64(r, x, y) }
 
 //go:noescape
-func mul576(a *[9]uint64, b *fp, c uint64)
+func mul512Amd64(a, b *fp, c uint64)
 
 //go:noescape
-func cswap512(x, y *fp, choice uint8)
+func cswap512Amd64(x, y *fp, choice uint8)
 
 //go:noescape
 func mulBmiAsm(res, x, y *fp)
 
 // mulRdc performs montgomery multiplication r = x * y mod P.
 // Returned result r is already reduced and in Montgomery domain.
-func mulRdc(r, x, y *fp) {
+func mulRdcAmd64(r, x, y *fp) {
 	var t fp
 	var c uint64
 

diff --git a/dh/csidh/fp511_amd64.s b/dh/csidh/fp511_amd64.s
@@ -1,4 +1,4 @@
-// +build amd64,!noasm
+// +build amd64
 
 #include "textflag.h"
 
@@ -9,8 +9,8 @@
 //
 // Registers used: AX, CX, DX, SI, DI, R8
 //
-// func mul512(a, b *Fp, c uint64)
-TEXT ·mul512(SB), NOSPLIT, $0-24
+// func mul512Amd64(a, b *fp, c uint64)
+TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
     MOVQ    a+0(FP), DI    // result
     MOVQ    b+8(FP), SI    // multiplicand
 
@@ -42,33 +42,7 @@ mul512_mulx:
     MULXQ   56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]
     RET
 
-// Multipies 512-bit value by 64-bit value and returns 576-bit result. Uses MULQ instruction to
-// multiply 2 64-bit values. Returns 576-bit result.
-//
-// Result: x = (y * z)
-//
-// Registers used: AX, CX, DX, SI, DI, R8
-//
-// func mul576(a, b *Fp, c uint64)
-TEXT ·mul576(SB), NOSPLIT, $0-24
-    MOVQ    a+0(FP), DI    // result
-    MOVQ    b+8(FP), SI    // multiplicand
-
-    MOVQ c+16(FP), R10  // 64 bit multiplier, used by MULQ
-    MOVQ R10, AX; MULQ  0(SI);                            MOVQ DX, R11; MOVQ AX,  0(DI) //x[0]
-    MOVQ R10, AX; MULQ  8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX,  8(DI) //x[1]
-    MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
-    MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
-    MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
-    MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
-    MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
-    MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; ADCQ $0, DX;               MOVQ AX, 56(DI) //x[7]
-    MOVQ DX, 64(DI)                                                                     //x[8]
-
-    RET
-
-
-TEXT ·cswap512(SB),NOSPLIT,$0-17
+TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17
     MOVQ    x+0(FP), DI
     MOVQ    y+8(FP), SI
     MOVBLZX choice+16(FP), AX       // AL = 0 or 1

diff --git a/dh/csidh/fp511_generic.go b/dh/csidh/fp511_generic.go
@@ -1,13 +1,11 @@
-// +build noasm arm64
-
 package csidh
 
 import "math/bits"
 
 // mul576 implements schoolbook multiplication of
 // 64x512-bit integer. Returns result modulo 2^512.
-// r = m1*m2
-func mul512(r, m1 *fp, m2 uint64) {
+// r = m1*m2.
+func mul512Generic(r, m1 *fp, m2 uint64) {
 	var c, h, l uint64
 
 	c, r[0] = bits.Mul64(m2, m1[0])
@@ -43,8 +41,8 @@ func mul512(r, m1 *fp, m2 uint64) {
 // mul576 implements schoolbook multiplication of
 // 64x512-bit integer. Returns 576-bit result of
 // multiplication.
-// r = m1*m2
-func mul576(r *[9]uint64, m1 *fp, m2 uint64) {
+// r = m1*m2.
+func mul576Generic(r *[9]uint64, m1 *fp, m2 uint64) {
 	var c, h, l uint64
 
 	c, r[0] = bits.Mul64(m2, m1[0])
@@ -82,7 +80,7 @@ func mul576(r *[9]uint64, m1 *fp, m2 uint64) {
 // cswap512 implements constant time swap operation.
 // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
 // If choice is neither 0 nor 1 then behaviour is undefined.
-func cswap512(x, y *fp, choice uint8) {
+func cswap512Generic(x, y *fp, choice uint8) {
 	var tmp uint64
 	mask64 := 0 - uint64(choice)
 
@@ -95,7 +93,7 @@ func cswap512(x, y *fp, choice uint8) {
 
 // mulRdc performs montgomery multiplication r = x * y mod P.
 // Returned result r is already reduced and in Montgomery domain.
-func mulRdc(r, x, y *fp) {
+func mulRdcGeneric(r, x, y *fp) {
 	var t fp
 	var c uint64
 
@@ -111,7 +109,7 @@ func mulRdc(r, x, y *fp) {
 	t[6], c = bits.Sub64(r[6], p[6], c)
 	t[7], c = bits.Sub64(r[7], p[7], c)
 
-	var w = uint64(0 - uint64(c))
+	w := 0 - c
 	r[0] = ctPick64(w, r[0], t[0])
 	r[1] = ctPick64(w, r[1], t[1])
 	r[2] = ctPick64(w, r[2], t[2])
@@ -121,3 +119,71 @@ func mulRdc(r, x, y *fp) {
 	r[6] = ctPick64(w, r[6], t[6])
 	r[7] = ctPick64(w, r[7], t[7])
 }
+
+// mulGeneric is the portable Montgomery-style multiplication of x and y,
+// written to r without a final conditional subtraction of p. It runs numWords
+// rounds; round i adds x[i]*y and q*p to the accumulator s and then drops
+// the lowest 64-bit word of the sum. q is derived from pNegInv[0]
+// (presumably -p^-1 mod 2^64, which would make the dropped word zero).
+// r is only written in the last round, after x and y have been read.
+func mulGeneric(r, x, y *fp) {
+	var s fp // keeps intermediate results
+	var t1, t2 [9]uint64
+	var c, q uint64
+
+	for i := 0; i < numWords-1; i++ {
+		q = ((x[i] * y[0]) + s[0]) * pNegInv[0]
+		mul576Generic(&t1, &p, q)
+		mul576Generic(&t2, y, x[i])
+
+		// x[i]*y + q_i*p
+		t1[0], c = bits.Add64(t1[0], t2[0], 0)
+		t1[1], c = bits.Add64(t1[1], t2[1], c)
+		t1[2], c = bits.Add64(t1[2], t2[2], c)
+		t1[3], c = bits.Add64(t1[3], t2[3], c)
+		t1[4], c = bits.Add64(t1[4], t2[4], c)
+		t1[5], c = bits.Add64(t1[5], t2[5], c)
+		t1[6], c = bits.Add64(t1[6], t2[6], c)
+		t1[7], c = bits.Add64(t1[7], t2[7], c)
+		t1[8], _ = bits.Add64(t1[8], t2[8], c)
+
+		// s = (s + x[i]*y + q_i * p) / R
+		_, c = bits.Add64(t1[0], s[0], 0)
+		s[0], c = bits.Add64(t1[1], s[1], c)
+		s[1], c = bits.Add64(t1[2], s[2], c)
+		s[2], c = bits.Add64(t1[3], s[3], c)
+		s[3], c = bits.Add64(t1[4], s[4], c)
+		s[4], c = bits.Add64(t1[5], s[5], c)
+		s[5], c = bits.Add64(t1[6], s[6], c)
+		s[6], c = bits.Add64(t1[7], s[7], c)
+		s[7], _ = bits.Add64(t1[8], 0, c)
+	}
+
+	// last iteration stores result in r
+	q = ((x[numWords-1] * y[0]) + s[0]) * pNegInv[0]
+	mul576Generic(&t1, &p, q)
+	mul576Generic(&t2, y, x[numWords-1])
+
+	// x[numWords-1]*y + q*p. The carry-in is 0, as in the loop above: c still
+	// holds the carry-out of the previous round and must not be reused here.
+	t1[0], c = bits.Add64(t1[0], t2[0], 0)
+	t1[1], c = bits.Add64(t1[1], t2[1], c)
+	t1[2], c = bits.Add64(t1[2], t2[2], c)
+	t1[3], c = bits.Add64(t1[3], t2[3], c)
+	t1[4], c = bits.Add64(t1[4], t2[4], c)
+	t1[5], c = bits.Add64(t1[5], t2[5], c)
+	t1[6], c = bits.Add64(t1[6], t2[6], c)
+	t1[7], c = bits.Add64(t1[7], t2[7], c)
+	t1[8], _ = bits.Add64(t1[8], t2[8], c)
+
+	// r = (s + x[numWords-1]*y + q*p) / R
+	_, c = bits.Add64(t1[0], s[0], 0)
+	r[0], c = bits.Add64(t1[1], s[1], c)
+	r[1], c = bits.Add64(t1[2], s[2], c)
+	r[2], c = bits.Add64(t1[3], s[3], c)
+	r[3], c = bits.Add64(t1[4], s[4], c)
+	r[4], c = bits.Add64(t1[5], s[5], c)
+	r[5], c = bits.Add64(t1[6], s[6], c)
+	r[6], c = bits.Add64(t1[7], s[7], c)
+	r[7], _ = bits.Add64(t1[8], 0, c)
+}